Subversion Repositories SmartDukaan

Rev

Rev 15893 | Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
15867 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting
4
import re
5
 
6
# Inline formatting tags that strip_tags() flattens out of the fetched HTML
# before scraping, so text extraction isn't broken up by <b>/<i>/<u> markup.
invalid_tags = ['b', 'i', 'u']
# Module-level accumulator; unused in this file — possibly referenced elsewhere.
bestSellers = []

# Browser-like request headers (mobile Chrome on Android) so the site serves
# the m.shopclues.com page layout that scrape() expects.
headers = {
            'User-Agent':'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language' : 'en-US,en;q=0.8',
            'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection':'keep-alive',
            'Accept-Encoding' : 'gzip,deflate,sdch'
        }
17
 
18
def strip_tags(html, invalid_tags):
    """Parse *html* and flatten every tag named in *invalid_tags*.

    Each offending tag is replaced in the tree by the concatenated text of
    its children (recursing into nested non-text children first), so the
    surrounding document keeps the text but loses the markup.  Returns the
    BeautifulSoup document, with HTML entities converted.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        if node.name not in invalid_tags:
            continue

        # Collect the textual content of this tag's children, stripping
        # any nested invalid markup along the way.
        pieces = []
        for child in node.contents:
            if not isinstance(child, NavigableString):
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))

        node.replaceWith(u''.join(pieces))

    return soup
33
 
34
class ShopCluesScraper:
35
    def __init__(self, livePricing=None):
36
        self.count_trials = 0
37
        self.livePricing = livePricing
38
 
39
    def read(self, url):
40
        response_data = ""
41
        try:
42
            response_data = fetchResponseUsingProxy(url)
43
        except Exception as e:
44
            print 'ERROR: ', e
45
            print 'Retrying'
46
            self.count_trials += 1
47
 
48
            if self.count_trials < 5:
49
                return self.read(url)
50
 
51
        self.response_data=response_data
52
        return self.createData()
53
 
54
    def createData(self):
55
        self.soup = strip_tags(self.response_data,invalid_tags)
56
        self.response_data =None
57
        return self.scrape(self.soup)
58
 
59
 
60
    def scrape(self,soup):
61
        div = soup.find('div',{'class':'pd_name clearfix'})
62
        scin =  div['data-id']
63
        div2 = soup.find('div',{'class':'pd-price-cont clearfix'})
64
        try:
65
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
66
        except:
67
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
68
        inStock = 1
69
        if (div2.find('div',{'class':re.compile('stock.*')}).string).strip().upper() == 'OUT OF STOCK':
70
            inStock = 0
71
        isCod = 1
72
        if div2.find('li',{'id':'iscod'}) is None:
73
            isCod = 0
74
        coupon = ""
75
        try:
76
            if div2.find('div',{'class':'info clearfix'}):
77
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].text
78
        except:
79
            print "Unable to parse coupon code"
80
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon}
81
 
82
 
83
 
84
if __name__ == '__main__':
85
    import datetime
86
    print datetime.datetime.now()
87
    scraper = ShopCluesScraper()
88
    print scraper.read('http://m.shopclues.com/apple-iphone-6-16-gb-10.html')
89
    print datetime.datetime.now()