| Line 10... |
Line 10... |
| 10 |
|
10 |
|
| 11 |
def read(self, url):
|
11 |
def read(self, url):
|
| 12 |
url = url.replace('www.flipkart.com','163.53.77.21')
|
12 |
url = url.replace('www.flipkart.com','163.53.77.21')
|
| 13 |
print url
|
13 |
print url
|
| 14 |
request = urllib2.Request(url)
|
14 |
request = urllib2.Request(url)
|
| - |
|
15 |
request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
|
| - |
|
16 |
#request.add_header('Accept-Charset','ISO-8859-1,utf-8;q=0.7,*;q=0.3')
|
| - |
|
17 |
#request.add_header('Accept-Encoding','gzip,deflate,sdch')
|
| - |
|
18 |
request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
|
| 15 |
request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0')
|
19 |
request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0')
|
| - |
|
20 |
request.add_header('Connection','keep-alive')
|
| 16 |
opener = urllib2.build_opener()
|
21 |
request.add_header('Cookie','T=TI141106533261202044684051011971166779542511900764606324691282640130; __gads=ID=683ebf052dfc3143:T=1411293573:S=ALNI_MZ_Ii5vGWTfpp24h4M8eqj95_ctPA; __sonar=7756033766217071307; buyer=0; is_loggedin=1; km_lv=x; _ga=GA1.2.1763496909.1411627333; kvcd=1411645515976; km_ai=m2z93iskuj81qiid; km_ni=m2z93iskuj81qiid; TGSRC=semcmpid%3Asem_8024046704_brand_goog; GOOGSRC=semcmpid%3Asem_8024046704_brand_goog; currentSession=present; sessionCount=3; prd_day=6|1411762819830; visitCount=7; _we_wk_ss_lsf_=true; FK-CMP-DATA=; s_ppv=42; km_uq=; Tkt=67af0938; SN=2.VI45A1DC8A40884B39A24FBA0584587E3C.SI737D7515E5C94593A5DD0F9D1CFDCD20.VS141165407206939742793.1411654071; VID=2.VI45A1DC8A40884B39A24FBA0584587E3C.1411654071.VS141165407206939742793; NSID=2.SI737D7515E5C94593A5DD0F9D1CFDCD20.1411654071.VI45A1DC8A40884B39A24FBA0584587E3C; __utma=19769839.146415981.1411293538.1411647571.1411654082.5; __utmb=19769839.3.8.1411654082; __utmc=19769839; __utmz=19769839.1411647571.4.4.utmgclid=CMu2ifys_MACFQyTjgodWnMAwQ|utmccn=(not%20set)|utmcmd=(not%20set)|utmctr=(not%20provided); s_cc=true; gpv_pn=SellerListing%3AMobile%3AKarbonn%20K105s; gpv_pn_t=no%20value; s_sq=%5B%5BB%5D%5D')
|
| - |
|
22 |
request.add_header('Host','www.flipkart.com')
|
| - |
|
23 |
request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
|
| 17 |
response_data = ""
|
24 |
response_data = ""
|
| 18 |
redirect_url = ""
|
25 |
redirect_url = ""
|
| 19 |
try:
|
26 |
try:
|
| 20 |
response = urllib2.urlopen(request)
|
27 |
response = urllib2.urlopen(request)
|
| 21 |
response_data = response.read()
|
28 |
response_data = response.read()
|
| 22 |
print "Fetched response from flipkart for %s" %(url)
|
29 |
print "Fetched response from flipkart for %s" %(url)
|
| 23 |
redirect_url = response.url
|
30 |
redirect_url = response.url
|
| 24 |
|
31 |
|
| 25 |
except urllib2.HTTPError as e:
|
32 |
except Exception as e:
|
| 26 |
print 'ERROR: ', e
|
33 |
print 'ERROR: ', e
|
| 27 |
print 'Retrying'
|
34 |
print 'Retrying'
|
| 28 |
self.count_trials += 1
|
35 |
self.count_trials += 1
|
| 29 |
|
36 |
|
| 30 |
if self.count_trials < 3:
|
37 |
if self.count_trials < 3:
|
| Line 140... |
Line 147... |
| 140 |
if not temp.has_key('shippingTime'):
|
147 |
if not temp.has_key('shippingTime'):
|
| 141 |
print "Populating shipping time from metrics"
|
148 |
print "Populating shipping time from metrics"
|
| 142 |
temp['shippingTime'] = dataMetric[3]
|
149 |
temp['shippingTime'] = dataMetric[3]
|
| 143 |
temp['sellerScore'] = int(dataMetric[4])
|
150 |
temp['sellerScore'] = int(dataMetric[4])
|
| 144 |
info.append(temp)
|
151 |
info.append(temp)
|
| - |
|
152 |
print info
|
| 145 |
print "Returning Json response from flipkart for %s" %(url)
|
153 |
print "Returning Json response from flipkart for %s" %(url)
|
| 146 |
return info
|
154 |
return info
|
| 147 |
|
155 |
|
| 148 |
if __name__ == '__main__':
|
156 |
if __name__ == '__main__':
|
| 149 |
scraper = FlipkartScraper()
|
157 |
scraper = FlipkartScraper()
|
| 150 |
print scraper.read('http://www.flipkart.com/ps/MOBDYFURT9PKAPSX')
|
158 |
scraper.read('http://www.flipkart.com/ps/MOBDY45GPWHXH9UY')
|