# Rev 12402 | Rev 12411 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
from BeautifulSoup import BeautifulSoup, NavigableStringimport reimport sysimport datetimeimport grequestsimport reinvalid_tags = ['b', 'i', 'u']bestSellers = []def strip_tags(html, invalid_tags):soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)for tag in soup.findAll(True):if tag.name in invalid_tags:s = ""for c in tag.contents:if not isinstance(c, NavigableString):c = strip_tags(unicode(c), invalid_tags)s += unicode(c)tag.replaceWith(s)return soupclass AmazonAsyncScraper:def __init__(self):self.count_trials = 0def read(self, urls, findStore):returnMap = {}print datetime.datetime.now()header = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1'}rs = (grequests.get(u, headers=header) for u in urls)for x in grequests.map(rs):soup = strip_tags(x.text,invalid_tags)print soupfor tag in soup.findAll(True):if tag.name in invalid_tags:s = ""for c in tag.contents:if not isinstance(c, NavigableString):c = strip_tags(unicode(c), invalid_tags)s += unicode(c)tag.replaceWith(s)x.close()sellerCount=0info = []sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})dataLength = len(sellerData)print dataLengthfor data in sellerData:tempMap={}price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').textunitCost = float(price.replace("Rs.","").replace(",",""))shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').textif "FREE" in shippingCost:shippingCost = 0else:#print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))sellerColumn = data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})store=""storeUrl=""if findStore:storeUrl = sellerColumn.find('a')['href']temp = sellerColumn.find('a')store = temp.textif len(store)==0:print storeUrldom_in = storeUrl.find("www.amazon.in")print dom_inif dom_in 
==-1:storeUrl="http://amazon.in"+storeUrlif storeUrl[storeUrl.rfind('/')+1:]=='AF6E3O0VE0X4D':store = 'Saholic'if len(store)!=0:tempMap['isStoreFront']='True'else:tempMap['isStoreFront']='False'tempMap['storeUrl'] =storeUrlasinind = x.url.index("offer-listing")refind = x.url.index("/ref=olp_sort_ps")asin = x.url[asinind+14:refind].strip()sellerCount+=1if sellerCount==1:tempMap['sellerName'] = store.strip()tempMap['sellerPrice'] = unitCost+shippingCostif sellerCount==2:tempMap['sellerName'] = store.strip()tempMap['sellerPrice'] = unitCost+shippingCostif sellerCount==3:tempMap['sellerName'] = store.strip()tempMap['sellerPrice'] = unitCost+shippingCostinfo.append(tempMap)if sellerCount==3 or sellerCount==dataLength:returnMap[asin] = infobreakif findStore:return self.findStoreFront(returnMap)else:return returnMapdef findStoreFront(self,returnMap):storeFront={}for arr in returnMap.itervalues():print "arr is ",arrfor dic in arr:print "dic ",dicif dic['isStoreFront']!='True':storeFront[dic.get('storeUrl')] =''rs = (grequests.get(u,stream=False) for u in storeFront.keys())for x in grequests.map(rs):soup = strip_tags(x.text,invalid_tags)x.close#print x.url.rfind('&me=')#print x.url[x.url.rfind('&me='):].rfind('&')mId= x.url[x.url.rfind('&me=')+4:x.url[x.url.rfind('&me='):].rfind('&')+x.url.rfind('&me=')]sellerName = soup.title.string#print mIdtry:ind = sellerName.index("@ Amazon.in")sellerName = sellerName[0:ind].strip()except:try:ind = sellerName.split(":")sellerName = ind[1].strip()except:sellerName =""#storeFront[re.compile('*'+mId+'.*')] = sellerName#print mId#print sellerNamemyRe = re.compile('.*'+mId+'.*')for key in storeFront:if myRe.match(key):#print "Match found ",keystoreFront[key] = sellerName.strip()#storeFront.get(re.compile('.*'+mId+'.*'))for arr in returnMap.itervalues():#print "arr is ",arrfor dic in arr:#print "dic ",dicif dic['isStoreFront']!='True':dic['sellerName'] =storeFront.get(dic.get('storeUrl'))dic['isStoreFront']='True'print "********"return 
returnMap# rs = (grequests.get(u,stream=False) for u in urls)# for x in grequests.map(rs):#return soup.title.stringif __name__ == '__main__':urls=[]urls.append("http://amazon.in/gp/offer-listing/B007VZFZO8/ref=olp_sort_ps")# asin = []# for a in amazonlisted:# asin.append(a.asin)# urls.append('http://www.amazon.in/gp/offer-listing/'+str(a.asin)+'/ref=olp_sort_ps')# if len(urls)==50:# breakprint urlsscraper = AmazonAsyncScraper()'http://www.amazon.in/gp/offer-listing/B003SNIN9Q/ref=olp_sort_ps'print len(urls)x = scraper.read(urls,True)print xprint "##################"# fetched = x.items()# print list(set(asin) - set(fetched))# for a,i in x.iteritems():# print a# for data in i:# print data# print "*********"#print scraper.createData()print datetime.datetime.now()