Subversion Repositories SmartDukaan

Rev

Rev 16100 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 16100 Rev 16217
Line 1... Line -...
1
import urllib2
-
 
2
from BeautifulSoup import BeautifulSoup, NavigableString
-
 
3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
1
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
4
transformUrl
2
transformUrl
5
import re
-
 
6
import cssutils
3
import cssutils
7
 
-
 
8
invalid_tags = ['b', 'i', 'u']
4
from pyquery import PyQuery
9
bestSellers = []
-
 
10
 
5
 
11
headers = {
6
headers = {
12
            'User-Agent':'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
7
            'User-Agent':'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
13
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      
8
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',      
14
            'Accept-Language' : 'en-US,en;q=0.8',                     
9
            'Accept-Language' : 'en-US,en;q=0.8',                     
15
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
10
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
16
            'Connection':'keep-alive',
11
            'Connection':'keep-alive',
17
            'Accept-Encoding' : 'gzip,deflate,sdch'
12
            'Accept-Encoding' : 'gzip,deflate,sdch'
18
        }
13
        }
19
 
14
 
20
def strip_tags(html, invalid_tags):
-
 
21
    soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)
-
 
22
 
-
 
23
    for tag in soup.findAll(True):
-
 
24
        if tag.name in invalid_tags:
-
 
25
            s = ""
-
 
26
 
-
 
27
            for c in tag.contents:
-
 
28
                if not isinstance(c, NavigableString):
-
 
29
                    c = strip_tags(unicode(c), invalid_tags)
-
 
30
                s += unicode(c)
-
 
31
 
-
 
32
            tag.replaceWith(s)
-
 
33
 
-
 
34
    return soup
-
 
35
 
-
 
36
class ShopCluesScraper:
15
class ShopCluesScraper:
37
    def __init__(self, livePricing=None, findThumbnail=None):
16
    def __init__(self, livePricing=None, findThumbnail=None):
38
        self.count_trials = 0
17
        self.count_trials = 0
39
        self.livePricing = livePricing
18
        self.livePricing = livePricing
40
        self.findThumbnail = findThumbnail
19
        self.findThumbnail = findThumbnail
Line 50... Line 29...
50
            print 'Retrying'
29
            print 'Retrying'
51
            self.count_trials += 1
30
            self.count_trials += 1
52
            
31
            
53
            if self.count_trials < 5:
32
            if self.count_trials < 5:
54
                return self.read(url)
33
                return self.read(url)
55
        
-
 
56
        self.response_data=response_data
34
        self.response_data=response_data
57
        return self.createData()
-
 
58
    
-
 
59
    def createData(self):
-
 
60
        self.soup = strip_tags(self.response_data,invalid_tags)
-
 
61
        self.response_data =None
-
 
62
        return self.scrape()
35
        return self.scrape()
63
    
36
    
64
    
-
 
65
    def scrape(self):
37
    def scrape(self):
66
        div = self.soup.find('div',{'class':'pd_name clearfix'})
-
 
67
        scin =  div['data-id']
38
        pq = PyQuery(self.response_data)
68
        thumbnailUrl = ""
39
        thumbnailUrl = ""
-
 
40
        if self.findThumbnail:
-
 
41
            imgTag = pq('div.pd-image').attr['style']
-
 
42
            style = cssutils.parseStyle(imgTag)
-
 
43
            thumbnailUrl =  style['background'][style['background'].index('(')+1:style['background'].rfind(')')]
-
 
44
        headerDiv = pq('div.pd_name.clearfix')
-
 
45
        scin = str(headerDiv.attr['data-id'])
-
 
46
        infoDiv = pq('div.pd-price-cont.clearfix')
69
        try:
47
        try:
70
            if self.findThumbnail:
-
 
71
                imgTag =  self.soup.find('div',{'class':'pd-image'})['style']
-
 
72
                style = cssutils.parseStyle(imgTag)
-
 
73
                thumbnailUrl =  style['background'][style['background'].index('(')+1:style['background'].rfind(')')]
-
 
74
        except:
-
 
75
            pass
-
 
76
        div2 = self.soup.find('div',{'class':'pd-price-cont clearfix'})
-
 
77
        try:
-
 
78
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
48
            price = float(removePriceFormatting(infoDiv('span#thirdPrice').text()))
79
        except:
49
        except:
80
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
50
            price = float(removePriceFormatting(infoDiv('span#sellingPrice').text()))
81
        inStock = 1
51
        inStock = 1
82
        if (div2.find('div',{'class':re.compile('stock.*')}).string).strip().upper() == 'OUT OF STOCK':
52
        if infoDiv('div.stock').text().strip().upper() == 'OUT OF STOCK':
83
            inStock = 0
53
            inStock = 0
84
        isCod = 1
54
        isCod = 1
85
        if div2.find('li',{'id':'iscod'}) is None:
55
        if len(infoDiv('li#notcod')) > 0:
86
            isCod = 0
56
            isCod = 0
87
        coupon = ""
57
        coupon = ""
88
        try:
-
 
89
            if div2.find('div',{'class':'info clearfix'}):
58
        couponDiv =  (infoDiv('div.info.clearfix'))
90
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].string
-
 
91
        except:
59
        if len(couponDiv) > 0:
92
            print "Unable to parse coupon code"
60
            coupon =  (couponDiv('span'))[1].text
93
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon, "thumbnail":thumbnailUrl}
61
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon, "thumbnail":thumbnailUrl}
94
            
-
 
95
            
-
 
96
 
62
 
97
if __name__ == '__main__':
63
if __name__ == '__main__':
98
    import datetime
64
    import datetime
99
    print datetime.datetime.now()
65
    print datetime.datetime.now()
100
    scraper = ShopCluesScraper()
66
    scraper = ShopCluesScraper(findThumbnail=True)
101
    print scraper.read('http://shopclues.com/samsung-galaxy-note-4-13.html')
67
    print scraper.read('http://m.shopclues.com/apple-iphone-6-16-gb-10.html')
102
    print datetime.datetime.now()
68
    print datetime.datetime.now()
103
69