Go to most recent revision |
Details |
Last modification |
View Log
| RSS feed
| Rev |
Author |
Line No. |
Line |
| 4039 |
varun.gupt |
1 |
'''
|
|
|
2 |
Created on 20-Sep-2011
|
|
|
3 |
|
|
|
4 |
@author: Varun Gupta
|
|
|
5 |
'''
|
|
|
6 |
import urllib2
|
|
|
7 |
|
|
|
8 |
class BaseScraper:
    '''
    Small HTTP fetcher that retries a request when it fails with an HTTP error.

    Attributes:
        MAX_TRIALS: total number of attempts (first try included) made by a
            single read() call before the error is propagated.
        USER_AGENT: value of the User-Agent header sent with every request.
        count_trials: number of failed attempts recorded during the most
            recent read() call.
    '''

    MAX_TRIALS = 3

    # Chrome-on-Linux User-Agent string attached to every outgoing request.
    USER_AGENT = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1'

    def __init__(self):
        self.count_trials = 0

    def read(self, url):
        '''
        Fetch `url` and return the raw response body.

        The request is attempted up to MAX_TRIALS times. Only
        urllib2.HTTPError triggers a retry; any other exception propagates
        immediately.

        Raises:
            urllib2.HTTPError: when every attempt failed. The error from the
                last attempt is re-raised.
        '''
        opener = urllib2.build_opener()

        # The retry budget is per call. Without this reset, failures from
        # earlier calls would eat into (or exhaust) the budget of later ones.
        self.count_trials = 0

        while True:
            # A fresh Request per attempt, so no state is carried over from
            # a failed try.
            request = urllib2.Request(url)
            request.add_header('User-Agent', self.USER_AGENT)

            try:
                response = opener.open(request)
                try:
                    return response.read()
                finally:
                    # Release the underlying connection deterministically.
                    response.close()

            except urllib2.HTTPError as e:
                # Single-argument print() gives the same output on Python 2
                # as the statement form.
                print('ERROR: %s' % (e,))
                self.count_trials += 1

                if self.count_trials >= self.MAX_TRIALS:
                    # Out of attempts: surface the real HTTP error rather
                    # than falling through to an unbound result variable.
                    raise

                print('Retrying')
|
|
|
28 |
|
|
|
29 |
if __name__ == '__main__':
    # Manual smoke run: fetch one sample page and dump the raw response body.
    page_fetcher = BaseScraper()
    print(page_fetcher.read('http://www.flipkart.com/mobiles/all'))
|