Subversion Repositories SmartDukaan

Rev

Rev 15906 | Rev 16096 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
15867 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
16004 kshitij.so 3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
4
transformUrl
15867 kshitij.so 5
import re
6
 
7
# Inline formatting tags that strip_tags() flattens out of the fetched HTML.
invalid_tags = ['b', 'i', 'u']
8
# Module-level accumulator; not referenced anywhere in this file —
# TODO(review): confirm whether external code imports it, else remove.
bestSellers = []
9
 
10
# Browser-like request headers (Chrome-on-Android UA) sent with every fetch
# so ShopClues serves the normal product page rather than blocking the bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}
18
 
19
def strip_tags(html, invalid_tags):
    """Parse *html* and flatten away every tag whose name is in *invalid_tags*.

    Each unwanted tag is replaced in-place by the concatenation of its
    children's markup; non-text children are recursively stripped first,
    so nested formatting tags disappear as well.  Returns the resulting
    BeautifulSoup document (entities already converted).
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        if node.name not in invalid_tags:
            continue
        # Rebuild this tag's contents as plain markup, recursing into
        # element children to strip any nested invalid tags.
        pieces = []
        for child in node.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                pieces.append(unicode(strip_tags(unicode(child), invalid_tags)))
        node.replaceWith(u"".join(pieces))

    return soup
34
 
35
class ShopCluesScraper:
36
    def __init__(self, livePricing=None):
37
        self.count_trials = 0
38
        self.livePricing = livePricing
39
 
40
    def read(self, url):
41
        response_data = ""
16004 kshitij.so 42
        url = transformUrl(url,5)
43
        print url
15867 kshitij.so 44
        try:
15896 kshitij.so 45
            response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
15867 kshitij.so 46
        except Exception as e:
47
            print 'ERROR: ', e
48
            print 'Retrying'
49
            self.count_trials += 1
50
 
51
            if self.count_trials < 5:
52
                return self.read(url)
53
 
54
        self.response_data=response_data
55
        return self.createData()
56
 
57
    def createData(self):
58
        self.soup = strip_tags(self.response_data,invalid_tags)
59
        self.response_data =None
60
        return self.scrape(self.soup)
61
 
62
 
63
    def scrape(self,soup):
64
        div = soup.find('div',{'class':'pd_name clearfix'})
65
        scin =  div['data-id']
66
        div2 = soup.find('div',{'class':'pd-price-cont clearfix'})
67
        try:
68
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
69
        except:
70
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
71
        inStock = 1
72
        if (div2.find('div',{'class':re.compile('stock.*')}).string).strip().upper() == 'OUT OF STOCK':
73
            inStock = 0
74
        isCod = 1
75
        if div2.find('li',{'id':'iscod'}) is None:
76
            isCod = 0
77
        coupon = ""
78
        try:
79
            if div2.find('div',{'class':'info clearfix'}):
80
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].text
81
        except:
82
            print "Unable to parse coupon code"
83
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon}
84
 
85
 
86
 
87
if __name__ == '__main__':
    # Ad-hoc smoke test: scrape one known product page and print the
    # wall-clock timestamps around the run to eyeball fetch latency.
    import datetime
    print datetime.datetime.now()
    scraper = ShopCluesScraper()
    print scraper.read('http://shopclues.com/samsung-galaxy-note-4-13.html')
    print datetime.datetime.now()