import urllib2
import re

from BeautifulSoup import BeautifulSoup, NavigableString

from dtr.utils.utils import fetchResponseUsingProxy

# Tags whose markup is stripped while their text content is kept.
invalid_tags = ['b', 'i', 'u']
bestSellers = []


def strip_tags(html, invalid_tags):
    """Return a BeautifulSoup tree with the given tags replaced by their inner text."""
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for tag in soup.findAll(True):
        if tag.name in invalid_tags:
            s = ""
            for c in tag.contents:
                if not isinstance(c, NavigableString):
                    # Nested tags are stripped recursively before being flattened to text.
                    c = strip_tags(unicode(c), invalid_tags)
                s += unicode(c)
            tag.replaceWith(s)
    return soup


class AmazonScraper:

    def __init__(self, livePricing=None):
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Fetch the product page, retrying up to 5 times, then return the deal price."""
        response_data = ""
        try:
            response_data = fetchResponseUsingProxy(url, livePricing=self.livePricing)
        except Exception as e:
            print 'ERROR: ', e
            print 'Retrying'
            self.count_trials += 1
            if self.count_trials < 5:
                return self.read(url)
        self.response_data = response_data
        if "Server Busy" in self.response_data:
            print "Server busy...Ahhhhh"
            self.count_trials += 1
            # Cap these retries too, so a persistently busy server cannot recurse forever.
            if self.count_trials < 5:
                return self.read(url)
        return self.createData()

    def createData(self):
        self.soup = strip_tags(self.response_data, invalid_tags)
        self.response_data = None
        return self.scrape(self.soup)

    def scrape(self, soup):
        """Extract the deal price; return 0.0 unless a live deal with stock left is found."""
        try:
            sellerData = soup.find("span", {"id": "priceblock_dealprice"})
            dealPrice = float(sellerData.text.replace("Rs.", "").replace(",", ""))
            print dealPrice
        except Exception:  # no deal-price span on the page, or unparsable price text
            dealPrice = 0.0
        try:
            dealStatus = soup.find('span', {'id': re.compile('dealStatusAvailability_*')})
            dealStatus = float(dealStatus.text.replace("%", "").replace(",", ""))
        except Exception:  # availability span missing or unparsable; assume deal exhausted
            dealStatus = 100
        if dealStatus < 100 and dealPrice > 0:
            return dealPrice
        else:
            return 0.0


if __name__ == '__main__':
    scraper = AmazonScraper(True)
    print scraper.read('http://www.amazon.in/dp/B015CSIA38/ref=gbdp_vlo_61287013_B015CSIA38?_encoding=UTF8&smid=A2WBY8FP973J47')
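    # Illustrative sketch only (the snippet and price below are made-up values, not
    # taken from the original listing): a quick check of strip_tags on inline HTML,
    # assuming BeautifulSoup 3 is installed. Since 'b' is in invalid_tags, only the
    # text inside the <b> wrapper survives.
    sample = '<span id="priceblock_dealprice"><b>Rs.</b> 2,499.00</span>'
    print strip_tags(sample, invalid_tags)
    # expected output: <span id="priceblock_dealprice">Rs. 2,499.00</span>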