Subversion Repositories SmartDukaan

Rev

Rev 16096 | Rev 16217 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
15867 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
16004 kshitij.so 3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
4
transformUrl
15867 kshitij.so 5
import re
16100 kshitij.so 6
import cssutils
15867 kshitij.so 7
 
8
# Inline formatting tags whose markup is dropped (their text is kept) by
# strip_tags() before a page is scraped.
invalid_tags = ['b', 'i', 'u']

# NOTE(review): not referenced anywhere in the visible part of this module --
# confirm whether other code imports it before removing.
bestSellers = []

# HTTP request headers passed to fetchResponseUsingProxy() for every fetch.
# The User-Agent string identifies the client as Chrome on an Android tablet.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}
19
 
20
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag whose name is in *invalid_tags*.

    Each matching tag is replaced by the concatenated text/markup of its
    children, so the formatting element disappears while its content stays
    in the tree.  Non-text children are processed recursively first so that
    nested invalid tags are unwrapped as well.

    html         -- markup to parse (HTML entities are converted).
    invalid_tags -- collection of tag names to unwrap.

    Returns the resulting BeautifulSoup object.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # findAll() returns a materialised result list, so replacing tags while
    # walking it does not disturb the iteration.
    for tag in soup.findAll(True):
        if tag.name not in invalid_tags:
            continue

        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # Child is itself a tag: clean its subtree before flattening.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))

        tag.replaceWith(u"".join(pieces))

    return soup
35
 
36
class ShopCluesScraper:
16100 kshitij.so 37
    def __init__(self, livePricing=None, findThumbnail=None):
15867 kshitij.so 38
        self.count_trials = 0
39
        self.livePricing = livePricing
16100 kshitij.so 40
        self.findThumbnail = findThumbnail
15867 kshitij.so 41
 
42
    def read(self, url):
43
        response_data = ""
16004 kshitij.so 44
        url = transformUrl(url,5)
45
        print url
15867 kshitij.so 46
        try:
15896 kshitij.so 47
            response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
15867 kshitij.so 48
        except Exception as e:
49
            print 'ERROR: ', e
50
            print 'Retrying'
51
            self.count_trials += 1
52
 
53
            if self.count_trials < 5:
54
                return self.read(url)
55
 
56
        self.response_data=response_data
57
        return self.createData()
58
 
59
    def createData(self):
60
        self.soup = strip_tags(self.response_data,invalid_tags)
61
        self.response_data =None
16100 kshitij.so 62
        return self.scrape()
15867 kshitij.so 63
 
64
 
16100 kshitij.so 65
    def scrape(self):
66
        div = self.soup.find('div',{'class':'pd_name clearfix'})
15867 kshitij.so 67
        scin =  div['data-id']
16100 kshitij.so 68
        thumbnailUrl = ""
15867 kshitij.so 69
        try:
16100 kshitij.so 70
            if self.findThumbnail:
71
                imgTag =  self.soup.find('div',{'class':'pd-image'})['style']
72
                style = cssutils.parseStyle(imgTag)
73
                thumbnailUrl =  style['background'][style['background'].index('(')+1:style['background'].rfind(')')]
74
        except:
75
            pass
76
        div2 = self.soup.find('div',{'class':'pd-price-cont clearfix'})
77
        try:
15867 kshitij.so 78
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
79
        except:
80
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
81
        inStock = 1
82
        if (div2.find('div',{'class':re.compile('stock.*')}).string).strip().upper() == 'OUT OF STOCK':
83
            inStock = 0
84
        isCod = 1
85
        if div2.find('li',{'id':'iscod'}) is None:
86
            isCod = 0
87
        coupon = ""
88
        try:
89
            if div2.find('div',{'class':'info clearfix'}):
16096 kshitij.so 90
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].string
15867 kshitij.so 91
        except:
92
            print "Unable to parse coupon code"
16100 kshitij.so 93
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon, "thumbnail":thumbnailUrl}
15867 kshitij.so 94
 
95
 
96
 
97
if __name__ == '__main__':
98
    import datetime
99
    print datetime.datetime.now()
100
    scraper = ShopCluesScraper()
16004 kshitij.so 101
    print scraper.read('http://shopclues.com/samsung-galaxy-note-4-13.html')
15867 kshitij.so 102
    print datetime.datetime.now()