# Navigation text captured from the revision browser (not part of the script): Rev 12197 | Rev 12256 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
import urllib2from BeautifulSoup import BeautifulSoup, NavigableStringimport reimport sysinvalid_tags = ['b', 'i', 'u']bestSellers = []def strip_tags(html, invalid_tags):soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)for tag in soup.findAll(True):if tag.name in invalid_tags:s = ""for c in tag.contents:if not isinstance(c, NavigableString):c = strip_tags(unicode(c), invalid_tags)s += unicode(c)tag.replaceWith(s)return soupclass AmazonScraper:def __init__(self):self.count_trials = 0def read(self, url):request = urllib2.Request(url)request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')opener = urllib2.build_opener()response_data = ""try:response_data = opener.open(request).read()except urllib2.HTTPError as e:print 'ERROR: ', eprint 'Retrying'self.count_trials += 1if self.count_trials < 3:return self.read(url)self.response_data=response_datadef createData(self):self.soup = strip_tags(self.response_data,invalid_tags)self.response_data =Nonereturn self.scrape(self.soup)def scrape(self,soup):sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})for data in sellerData:print "sellerData****"price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').textprint "Unit cost= ",float(price.replace("Rs.","").replace(",",""))unitCost = float(price.replace("Rs.","").replace(",",""))shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').textif "FREE" in shippingCost:print "shippingCost=0"shippingCost = 0else:print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))sellerColumn = data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})print "Seller info ",sellerColumnratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]print "Rating info 
",ratingColumnprint "***********************"return unitCost+shippingCostdef getBestSellers(self,soup):global bestSellersbestSellerData = soup.findAll("div" , {"class" : "zg_itemImmersion"})for data in bestSellerData:temp = {}rankVal = data.find('span', attrs={'class' : 'zg_rankNumber'}).textprint "Rank = ",rankVal.lstrip()productUrl = data.find('a')['href']print "Product URL = ",productUrl.lstrip().replace("\n","")productUrl = productUrl.replace("http://www.amazon.in/","").lstrip()ind = productUrl.rindex("/dp/")productName = productUrl[0:productUrl.rindex("/dp/")]print "Product Name = ",productNameasin = productUrl[ind+4: productUrl.rindex("/ref=")]print "Asin = ",asinprint "**********************"temp['Rank'] = rankVal.lstrip().replace(".","")temp['Url'] = productUrl.lstrip().replace("\n","")temp['Product Name'] = productUrl[0:productUrl.rindex("/dp/")]temp['Asin'] = productUrl[ind+4: productUrl.rindex("/ref=")]bestSellers.append(temp)if __name__ == '__main__':scraper = AmazonScraper()scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps')print scraper.createData()