# Rev 15265 | Blame | Compare with Previous | Last modification | View Log | RSS feed
from dtr.utils.utils import fetchResponseUsingProxy
import re
import datetime
from pyquery import PyQuery
import traceback

invalid_tags = ['b', 'i', 'u']
bestSellers = []

# Request headers sent with every fetch (an Android/Chrome user agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}


def _parse_amount(text):
    """Convert a price string such as ``"Rs. 1,234.00"`` to a float.

    Strips the ``"Rs."`` prefix and thousands separators before converting.
    Raises ValueError when the remaining text is not a number.
    """
    return float(text.replace("Rs.", "").replace(",", ""))


class AmazonScraper:
    """Fetch an Amazon offer-listing page and extract the first offer's cost."""

    # Upper bound on fetch attempts per read() call; applies both to fetch
    # errors and to "Server Busy" responses.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None):
        """Create a scraper.

        livePricing is passed through unchanged to fetchResponseUsingProxy;
        its meaning is defined by that helper.
        """
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Fetch *url* and return the first offer's total cost.

        The page is fetched up to MAX_TRIALS times; an attempt counts as
        failed when the fetch raises or when the body contains
        "Server Busy". The last body obtained (an empty string if the final
        attempt raised) is stored in self.response_data and handed to
        createData().

        Returns the float produced by createData() (0.0 when nothing could
        be parsed).
        """
        # The retry budget is per call, so a reused instance starts fresh.
        self.count_trials = 0
        while True:
            response_data = ""
            try:
                response_data = fetchResponseUsingProxy(
                    url, headers=headers, livePricing=self.livePricing, proxy=True)
            except Exception as e:
                # Single-argument print(...) behaves the same on Python 2
                # and 3; the double space matches the original output.
                print('ERROR:  %s' % (e,))
                print('Retrying')
                failed = True
            else:
                failed = "Server Busy" in response_data
            if not failed:
                break
            self.count_trials += 1
            # Bounded loop: the "Server Busy" path used to recurse without
            # any limit.
            if self.count_trials >= self.MAX_TRIALS:
                break
        self.response_data = response_data
        return self.createData()

    def createData(self):
        """Parse self.response_data and return unit price plus shipping.

        Only the first ``div.olpOffer`` element is considered. The unit
        price comes from ``span.olpOfferPrice`` and shipping from
        ``span.olpShippingPrice``; shipping that cannot be parsed counts
        as 0. Returns 0.0 when the page or the unit price cannot be parsed.
        """
        try:
            pq = PyQuery(self.response_data)
            offers = pq('div.olpOffer')
            first_offer = pq(offers[0])
            unit_cost = _parse_amount(first_offer('span.olpOfferPrice').text())
            try:
                shipping_cost = _parse_amount(
                    first_offer('span.olpShippingPrice').text())
            except Exception:
                # Missing or non-numeric shipping text is treated as free.
                shipping_cost = 0
            return unit_cost + shipping_cost
        except Exception:
            # Best-effort scraper: any parse failure yields 0.0, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return 0.0


if __name__ == '__main__':
    print(datetime.datetime.now())
    scraper = AmazonScraper(True)
    print(scraper.read('http://www.amazon.in/gp/aw/ol/B00UTKPKHY'))
    print(datetime.datetime.now())