Subversion Repositories SmartDukaan

Rev

Rev 15265 | Blame | Compare with Previous | Last modification | View Log | RSS feed

from dtr.utils.utils import fetchResponseUsingProxy
import re
import datetime
from pyquery import PyQuery
import traceback

# Module-level lists; neither is referenced in the visible part of this file.
invalid_tags = ['b', 'i', 'u']
bestSellers = []

# HTTP request headers sent with every fetch; the User-Agent string
# identifies the client as Chrome on an Android Nexus 7.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/29.0.1547.72 Safari/537.36'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}

class AmazonScraper:
    """Fetch an Amazon offer-listing page and extract the first offer's price.

    ``read(url)`` downloads the page through ``fetchResponseUsingProxy`` and
    returns the unit price plus shipping of the first ``div.olpOffer``
    element, or ``0.0`` when the page cannot be fetched or parsed.
    """

    # Upper bound on fetch attempts per read() call. It covers both fetch
    # exceptions and "Server Busy" pages.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None):
        """Create a scraper.

        livePricing -- forwarded unchanged to ``fetchResponseUsingProxy``;
                       its meaning is defined by that helper.
        """
        self.count_trials = 0
        self.livePricing = livePricing
        self.response_data = ""

    def read(self, url):
        """Fetch ``url`` and return the first offer's total price as a float.

        A fetch is retried when it raises or when the response contains
        "Server Busy", up to ``MAX_TRIALS`` attempts in total for this call.
        Returns ``0.0`` when every attempt fails or the page has no
        parsable offer.
        """
        # Reset per call so failures from an earlier read() do not use up
        # the retry budget of this one.
        self.count_trials = 0
        self.response_data = ""

        # Iterative retry: a persistently busy server can no longer recurse
        # without bound.
        while self.count_trials < self.MAX_TRIALS:
            try:
                response_data = fetchResponseUsingProxy(
                    url, headers=headers, livePricing=self.livePricing, proxy=True)
            except Exception as e:
                # Single-argument parenthesised print produces the same
                # output on Python 2 and also parses on Python 3.
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                continue

            self.response_data = response_data

            # Guard against a falsy (e.g. None) response before the
            # substring test.
            if response_data and "Server Busy" in response_data:
                self.count_trials += 1
                continue

            return self.createData()

        # Every attempt failed or hit a busy page: there is no offer to parse.
        return 0.0

    def _parse_amount(self, text):
        """Convert a price string such as 'Rs. 1,299.00' to a float."""
        return float(text.replace("Rs.", "").replace(",", ""))

    def createData(self):
        """Parse ``self.response_data`` and return unit price plus shipping.

        Only the first ``div.olpOffer`` element is considered. A missing or
        unparsable shipping price counts as 0; any other parsing failure,
        or an empty response, yields ``0.0``.
        """
        page = getattr(self, 'response_data', None)
        if not page:
            return 0.0

        try:
            pq = PyQuery(page)
            offers = pq('div.olpOffer')
            first_offer = pq(offers[0])
            unit_cost = self._parse_amount(
                first_offer('span.olpOfferPrice').text())
        except Exception:
            # Best-effort scraping: an unexpected page layout means
            # "no price", not a crash.
            return 0.0

        try:
            shipping_cost = self._parse_amount(
                first_offer('span.olpShippingPrice').text())
        except Exception:
            # No numeric shipping price could be parsed; treat it as zero.
            shipping_cost = 0

        return unit_cost + shipping_cost
        
    
# Manual smoke run: print a timestamp, the scraped price for one sample
# listing, then a second timestamp so the elapsed time can be read off.
if __name__ == '__main__':
    started_at = datetime.datetime.now()
    print(started_at)
    live_scraper = AmazonScraper(True)
    sample_price = live_scraper.read('http://www.amazon.in/gp/aw/ol/B00UTKPKHY')
    print(sample_price)
    print(datetime.datetime.now())