# Rev 12275 (exported from the repository web viewer)
import urllib2from BeautifulSoup import BeautifulSoup, NavigableStringfrom dtr.utils.utils import fetchResponseUsingProxyimport reimport sysinvalid_tags = ['b', 'i', 'u']bestSellers = []def strip_tags(html, invalid_tags):soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)for tag in soup.findAll(True):if tag.name in invalid_tags:s = ""for c in tag.contents:if not isinstance(c, NavigableString):c = strip_tags(unicode(c), invalid_tags)s += unicode(c)tag.replaceWith(s)return soupclass AmazonScraper:def __init__(self, livePricing=None):self.count_trials = 0self.livePricing = livePricingdef read(self, url, findStore):response_data = ""self.findStore = findStoretry:response_data = fetchResponseUsingProxy(url,livePricing=self.livePricing)except Exception as e:print 'ERROR: ', eprint 'Retrying'self.count_trials += 1if self.count_trials < 5:return self.read(url)self.response_data=response_dataif "Server Busy" in self.response_data:print "Captcha page, lets try again."self.count_trials += 1return self.read(url)return self.createData()def createData(self):self.soup = strip_tags(self.response_data,invalid_tags)self.response_data =Nonereturn self.scrape(self.soup)def scrape(self,soup):sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})for data in sellerData:print "sellerData****"price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').textprint "Unit cost= ",float(price.replace("Rs.","").replace(",",""))unitCost = float(price.replace("Rs.","").replace(",",""))shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').textif "FREE" in shippingCost:print "shippingCost=0"shippingCost = 0else:print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))sellerColumn = data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})store=""if self.findStore:print "Seller info 
",sellerColumnx = sellerColumn.find('a')['href']print xtemp = sellerColumn.find('a')store = temp.textif len(store)==0:storeUrl = xdom_in = storeUrl.find("www.amazon.in")if dom_in ==-1:storeUrl="http://amazon.in"+storeUrlstore = self.findStoreFront(storeUrl)try:ind = store.index("@ Amazon.in")store = store[0:ind].strip()except:try:ind = store.split(":")store = ind[1].strip()except:store =""ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]print "Rating info ",ratingColumnprint "***********************"return unitCost+shippingCost,storedef findStoreFront(self,storeUrl):try:response_data = fetchResponseUsingProxy(storeUrl,livePricing=None)except:return ""soup = strip_tags(response_data,invalid_tags)response_data =Nonereturn soup.title.stringif __name__ == '__main__':scraper = AmazonScraper()print scraper.read('http://www.amazon.in/gp/offer-listing/B006PB44NM/ref=olp_sort_ps',True)