| Line 7... |
Line 7... |
| 7 |
def __init__(self):
|
7 |
def __init__(self):
|
| 8 |
self.count_trials = 0
|
8 |
self.count_trials = 0
|
| 9 |
self.redirectCount = 0
|
9 |
self.redirectCount = 0
|
| 10 |
|
10 |
|
| 11 |
def read(self, url):
|
11 |
def read(self, url):
|
| 12 |
#url = url.replace('www.flipkart.com','163.53.77.21')
|
- |
|
| 13 |
url = url.replace('www.flipkart.com','163.53.76.55')
|
- |
|
| 14 |
print url
|
- |
|
| 15 |
request = urllib2.Request(url)
|
12 |
request = urllib2.Request(url)
|
| 16 |
request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
|
13 |
request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
|
| 17 |
#request.add_header('Accept-Charset','ISO-8859-1,utf-8;q=0.7,*;q=0.3')
|
- |
|
| 18 |
#request.add_header('Accept-Encoding','gzip,deflate,sdch')
|
- |
|
| 19 |
request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
|
14 |
request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
|
| 20 |
request.add_header('Connection','keep-alive')
|
15 |
request.add_header('Connection','keep-alive')
|
| 21 |
request.add_header('Cookie','T=TI141257426738726661427143281839817329423126740566618323641725716448; __sonar=7237334677420142002; __gads=ID=c8b82101a0e4f451:T=1412574724:S=ALNI_MbPMbEOZj2nAGjM54z8ZHFMqwTOTQ; FK-CMP-DATA=; SN=2.VI11FB3FB6ED9D4693A796AB8C965B3417.SI802C325AC43444858830E870C4FD3324.VS141257426735693951472.1412576209; VID=2.VI11FB3FB6ED9D4693A796AB8C965B3417.1412576209.VS141257426735693951472; NSID=2.SI802C325AC43444858830E870C4FD3324.1412576209.VI11FB3FB6ED9D4693A796AB8C965B3417; __utma=19769839.709301254.1412574234.1412574234.1412574234.1; __utmb=19769839.23.10.1412574234; __utmc=19769839; __utmz=19769839.1412574234.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_cc=true; gpv_pn=SellerListing%3AMobile%3AMicromax%20Canvas%20Fire%20A093; gpv_pn_t=no%20value; s_sq=%5B%5BB%5D%5D; s_ppv=36')
|
16 |
request.add_header('Cookie','T=TI141257426738726661427143281839817329423126740566618323641725716448; __sonar=7237334677420142002; __gads=ID=c8b82101a0e4f451:T=1412574724:S=ALNI_MbPMbEOZj2nAGjM54z8ZHFMqwTOTQ; FK-CMP-DATA=; SN=2.VI11FB3FB6ED9D4693A796AB8C965B3417.SI802C325AC43444858830E870C4FD3324.VS141257426735693951472.1412576209; VID=2.VI11FB3FB6ED9D4693A796AB8C965B3417.1412576209.VS141257426735693951472; NSID=2.SI802C325AC43444858830E870C4FD3324.1412576209.VI11FB3FB6ED9D4693A796AB8C965B3417; __utma=19769839.709301254.1412574234.1412574234.1412574234.1; __utmb=19769839.23.10.1412574234; __utmc=19769839; __utmz=19769839.1412574234.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_cc=true; gpv_pn=SellerListing%3AMobile%3AMicromax%20Canvas%20Fire%20A093; gpv_pn_t=no%20value; s_sq=%5B%5BB%5D%5D; s_ppv=36')
|
| 22 |
request.add_header('Cache-Control','max-age=0')
|
17 |
request.add_header('Cache-Control','max-age=0')
|
| 23 |
request.add_header('Host','www.flipkart.com')
|
18 |
request.add_header('Host','www.flipkart.com')
|
| Line 38... |
Line 33... |
| 38 |
|
33 |
|
| 39 |
if self.count_trials < 3:
|
34 |
if self.count_trials < 3:
|
| 40 |
return self.read(url)
|
35 |
return self.read(url)
|
| 41 |
|
36 |
|
| 42 |
self.response_data=response_data
|
37 |
self.response_data=response_data
|
| 43 |
return self.createData(url,redirect_url)
|
38 |
return self.createSoup(url)
|
| 44 |
|
39 |
|
| 45 |
def scrapeRedirectedPage(self,soup,redirect_url):
|
40 |
# def scrapeRedirectedPage(self,soup,redirect_url):
|
| 46 |
print soup
|
41 |
# print soup
|
| 47 |
print redirect_url
|
42 |
# print redirect_url
|
| 48 |
t = soup.find("div" , {"class" : "seller-table fk-user-select-none line"})
|
43 |
# t = soup.find("div" , {"class" : "seller-table fk-user-select-none line"})
|
| 49 |
print t
|
44 |
# print t
|
| 50 |
table_rows = t.findAll("tr" , {"class" : re.compile('t-row.*')})
|
45 |
# table_rows = t.findAll("tr" , {"class" : re.compile('t-row.*')})
|
| 51 |
print table_rows
|
46 |
# print table_rows
|
| 52 |
for x in table_rows:
|
47 |
# for x in table_rows:
|
| 53 |
print x
|
48 |
# print x
|
| 54 |
|
49 |
#
|
| 55 |
def createData(self,url, redirect_url):
|
50 |
def createSoup(self, url):
|
| 56 |
print "Creating soup from flipkart data for %s" %(url)
|
51 |
print "Creating soup from flipkart data for %s" %(url)
|
| 57 |
#redirect_url = redirect_url.replace('www.flipkart.com','163.53.77.21')
|
- |
|
| 58 |
print "Redirect url is %s"%(redirect_url)
|
- |
|
| 59 |
page=self.response_data.decode("utf-8")
|
52 |
page=self.response_data.decode("utf-8")
|
| 60 |
self.soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
|
53 |
self.soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
|
| 61 |
page = None
|
54 |
page = None
|
| 62 |
self.response_data = None
|
55 |
self.response_data = None
|
| 63 |
print "Soup created from flipkart data for %s" %(url)
|
56 |
print "Soup created from flipkart data for %s" %(url)
|
| 64 |
if (url==redirect_url):
|
- |
|
| 65 |
return self.scrape(self.soup,url)
|
57 |
return self.scrape(self.soup,url)
|
| 66 |
else:
|
- |
|
| 67 |
print self.redirectCount
|
- |
|
| 68 |
self.redirectCount+=1
|
- |
|
| 69 |
if self.redirectCount >5:
|
- |
|
| 70 |
raise
|
- |
|
| 71 |
return self.read(url)
|
- |
|
| 72 |
|
- |
|
| 73 |
|
- |
|
| 74 |
|
- |
|
| 75 |
|
58 |
|
| 76 |
def scrape(self,soup,url):
|
59 |
def scrape(self,soup,url):
|
| 77 |
print "Inside json creator for %s" %(url)
|
60 |
print "Inside json creator for %s" %(url)
|
| 78 |
info = []
|
61 |
info = []
|
| 79 |
oddSeller = soup.findAll("div" , {"class" : "line seller-item odd "})
|
62 |
oddSeller = soup.findAll("div" , {"class" : "line seller-item odd "})
|