Subversion Repositories SmartDukaan

Rev

Rev 16217 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
16004 kshitij.so 1
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
2
transformUrl
16100 kshitij.so 3
import cssutils
16217 kshitij.so 4
from pyquery import PyQuery
15867 kshitij.so 5
 
6
# Default HTTP request headers passed to fetchResponseUsingProxy() when a
# product page is downloaded. The User-Agent string is that of Chrome on an
# Android Nexus 7.
# NOTE(review): presumably chosen so the site serves its mobile page layout,
# which the CSS selectors used by the scraper expect -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}
14
 
15
class ShopCluesScraper:
    """Download a ShopClues product page and extract its pricing details.

    Typical use is ``ShopCluesScraper().read(url)``, which fetches the page
    and returns the dictionary built by ``scrape()``.
    """

    # Maximum number of download attempts made by one read() call.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None, findThumbnail=None):
        """Store the scraper options.

        livePricing   -- kept on the instance; not consulted inside this class.
        findThumbnail -- when truthy, scrape() also extracts the thumbnail URL.
        """
        self.count_trials = 0
        self.livePricing = livePricing
        self.findThumbnail = findThumbnail

    def read(self, url):
        """Fetch ``url`` and return the scraped product dictionary.

        The URL is passed through transformUrl() once, then downloaded with
        up to MAX_TRIALS attempts. ``self.count_trials`` holds the number of
        failed attempts of the most recent call.

        If every attempt fails, an empty string is handed to scrape(), as
        before.
        NOTE(review): parsing an empty document is expected to raise inside
        scrape() -- confirm whether callers rely on that.
        """
        # Transform once: retries must reuse the transformed URL instead of
        # feeding it through transformUrl() again.
        url = transformUrl(url, 5)
        print(url)

        # Each call gets its own retry budget, so earlier failures on this
        # instance do not eat into the attempts available now.
        self.count_trials = 0
        response_data = ""
        while self.count_trials < self.MAX_TRIALS:
            try:
                response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
                break
            except Exception as e:
                # Single-argument print() gives the same output on Python 2
                # and Python 3.
                print('ERROR:  %s' % (e,))
                print('Retrying')
                self.count_trials += 1

        self.response_data = response_data
        return self.scrape()

    def scrape(self):
        """Parse ``self.response_data`` and return the product details.

        Returns a dict with the keys:
          'scin'      -- str() of the header div's data-id attribute (this is
                         the string 'None' when the attribute is missing)
          'price'     -- float, from span#thirdPrice, else span#sellingPrice
          'inStock'   -- 0 when the stock div reads 'OUT OF STOCK', else 1
          'isCod'     -- 0 when an li#notcod element is present, else 1
          'coupon'    -- text of the second span in the coupon div, or ""
          'thumbnail' -- URL taken from the image div's CSS background, or ""

        Raises whatever the price conversion raises when neither price span
        yields a usable number.
        """
        pq = PyQuery(self.response_data)

        thumbnailUrl = ""
        if self.findThumbnail:
            styleAttr = pq('div.pd-image').attr['style']
            # pyquery yields None for a missing element/attribute; skip the
            # CSS parsing in that case rather than crashing.
            if styleAttr:
                background = cssutils.parseStyle(styleAttr)['background']
                # The URL is whatever sits between the first '(' and the
                # last ')' of the background value, e.g. url(...).
                start = background.find('(')
                end = background.rfind(')')
                if start != -1 and end > start:
                    thumbnailUrl = background[start + 1:end]

        headerDiv = pq('div.pd_name.clearfix')
        scin = str(headerDiv.attr['data-id'])

        infoDiv = pq('div.pd-price-cont.clearfix')
        try:
            price = float(removePriceFormatting(infoDiv('span#thirdPrice').text()))
        except Exception:
            # removePriceFormatting() is opaque here, so the exact exception
            # type is unknown; Exception (unlike a bare except) still lets
            # KeyboardInterrupt/SystemExit propagate.
            price = float(removePriceFormatting(infoDiv('span#sellingPrice').text()))

        inStock = 1
        if infoDiv('div.stock').text().strip().upper() == 'OUT OF STOCK':
            inStock = 0

        isCod = 1
        if len(infoDiv('li#notcod')) > 0:
            isCod = 0

        coupon = ""
        couponDiv = infoDiv('div.info.clearfix')
        if len(couponDiv) > 0:
            spans = couponDiv('span')
            # The coupon text lives in the second span; tolerate layouts
            # with fewer spans, and an empty span whose .text is None.
            if len(spans) > 1:
                coupon = spans[1].text or ""

        return {'scin': scin, 'price': price, 'inStock': inStock, 'isCod': isCod, 'coupon': coupon, "thumbnail": thumbnailUrl}
15867 kshitij.so 62
 
63
if __name__ == '__main__':
    # Manual smoke test: scrape one product page (thumbnail included) and
    # print a timestamp before and after so the elapsed time can be read off.
    import datetime

    # Single-argument print() produces the same output on Python 2 and 3.
    print(datetime.datetime.now())
    scraper = ShopCluesScraper(findThumbnail=True)
    print(scraper.read('http://m.shopclues.com/apple-iphone-6s-16gb-9.html'))
    print(datetime.datetime.now())