Subversion Repositories SmartDukaan

Rev

Rev 15265 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
14744 kshitij.so 1
from dtr.utils.utils import fetchResponseUsingProxy
13829 kshitij.so 2
import re
15950 kshitij.so 3
import datetime
4
from pyquery import PyQuery
5
import traceback
13829 kshitij.so 6
 
7
# Not referenced anywhere in the visible part of this module.
invalid_tags = ['b', 'i', 'u']
8
# Not referenced anywhere in the visible part of this module.
bestSellers = []
9
 
15950 kshitij.so 10
# HTTP request headers that AmazonScraper.read passes to
# fetchResponseUsingProxy via its ``headers`` keyword argument.
# The User-Agent string identifies the client as Chrome on an Android Nexus 7.
headers = {
11
            'User-Agent':'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
12
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      
13
            'Accept-Language' : 'en-US,en;q=0.8',                     
14
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
15
            'Connection':'keep-alive',
16
            'Accept-Encoding' : 'gzip,deflate,sdch'
17
        }
13829 kshitij.so 18
 
19
class AmazonScraper:
    """Fetch an offer-listing page and extract the first offer's total price.

    The page is downloaded with ``fetchResponseUsingProxy`` and parsed with
    PyQuery.  The first ``div.olpOffer`` element supplies the unit price
    (``span.olpOfferPrice``) and the shipping price
    (``span.olpShippingPrice``).

    Attributes:
        count_trials: number of retries consumed so far.  It is never reset,
            so the retry budget is shared by every ``read`` call made on the
            same instance.
        livePricing: opaque value forwarded unchanged to
            ``fetchResponseUsingProxy``.
        response_data: body of the last fetched page (set by ``read``).
    """

    # Retry budget shared by fetch errors and "Server Busy" responses.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None):
        """Create a scraper.

        Args:
            livePricing: forwarded as the ``livePricing`` keyword argument of
                ``fetchResponseUsingProxy`` on every fetch.
        """
        self.count_trials = 0
        self.livePricing = livePricing

    def read(self, url):
        """Fetch ``url`` and return the total price of the first offer.

        The fetch is retried when it raises or when the response contains
        the text "Server Busy".  Both kinds of retry draw from the same
        budget (``MAX_TRIALS``), so a server that stays busy can no longer
        cause unbounded recursion.

        Args:
            url: address of the offer-listing page.

        Returns:
            Unit price plus shipping price as a float, or ``0.0`` when the
            page could not be fetched or parsed.
        """
        response_data = ""
        while True:
            try:
                response_data = fetchResponseUsingProxy(
                    url, headers=headers, livePricing=self.livePricing, proxy=True)
            except Exception as e:
                # Single-argument print calls: same output on Python 2 and 3.
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1
                if self.count_trials < self.MAX_TRIALS:
                    continue
                # Budget exhausted: continue with an empty page so that
                # createData() reports 0.0.
                response_data = ""
            else:
                if ("Server Busy" in response_data
                        and self.count_trials < self.MAX_TRIALS):
                    self.count_trials += 1
                    continue
            break

        self.response_data = response_data
        return self.createData()

    @staticmethod
    def _to_amount(text):
        """Convert a price string such as ``"Rs. 1,234.00"`` to a float."""
        return float(text.replace("Rs.", "").replace(",", ""))

    def createData(self):
        """Parse ``self.response_data`` and return the first offer's total.

        Returns:
            Unit price plus shipping price.  Shipping counts as 0 when it is
            absent or not numeric.  ``0.0`` is returned when no offer or no
            parsable unit price is found.
        """
        try:
            pq = PyQuery(self.response_data)
            tag = pq('div.olpOffer')
            infoDiv = pq(tag[0])
            unitCost = self._to_amount(infoDiv('span.olpOfferPrice').text())
            shipping = infoDiv('span.olpShippingPrice')
            try:
                shippingCost = self._to_amount(shipping.text())
            except Exception:
                # Best effort: a missing or non-numeric shipping price is
                # treated as free shipping.
                shippingCost = 0
            return unitCost + shippingCost
        except Exception:
            # Deliberate best effort: any parsing problem yields 0.0.
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            return 0.0
58
 
59
 
13829 kshitij.so 60
if __name__ == '__main__':
    # Manual smoke test: print a timestamp, the scraped total for one
    # hard-coded offer-listing URL, then a second timestamp.
    print(datetime.datetime.now())
    amazon = AmazonScraper(livePricing=True)
    total = amazon.read('http://www.amazon.in/gp/aw/ol/B00UTKPKHY')
    print(total)
    print(datetime.datetime.now())