Subversion Repositories SmartDukaan

Rev

Rev 16096 | Rev 16217 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 16096 Rev 16100
Line 1... Line 1...
1
import urllib2
1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
2
from BeautifulSoup import BeautifulSoup, NavigableString
3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
4
transformUrl
4
transformUrl
5
import re
5
import re
-
 
6
import cssutils
6
 
7
 
7
invalid_tags = ['b', 'i', 'u']
8
invalid_tags = ['b', 'i', 'u']
8
bestSellers = []
9
bestSellers = []
9
 
10
 
10
headers = {
11
headers = {
Line 31... Line 32...
31
            tag.replaceWith(s)
32
            tag.replaceWith(s)
32
 
33
 
33
    return soup
34
    return soup
34
 
35
 
35
class ShopCluesScraper:
36
class ShopCluesScraper:
36
    def __init__(self, livePricing=None):
37
    def __init__(self, livePricing=None, findThumbnail=None):
37
        self.count_trials = 0
38
        self.count_trials = 0
38
        self.livePricing = livePricing
39
        self.livePricing = livePricing
-
 
40
        self.findThumbnail = findThumbnail
39
    
41
    
40
    def read(self, url):
42
    def read(self, url):
41
        response_data = ""
43
        response_data = ""
42
        url = transformUrl(url,5)
44
        url = transformUrl(url,5)
43
        print url
45
        print url
Line 55... Line 57...
55
        return self.createData()
57
        return self.createData()
56
    
58
    
57
    def createData(self):
59
    def createData(self):
58
        self.soup = strip_tags(self.response_data,invalid_tags)
60
        self.soup = strip_tags(self.response_data,invalid_tags)
59
        self.response_data =None
61
        self.response_data =None
60
        return self.scrape(self.soup)
62
        return self.scrape()
61
    
63
    
62
    
64
    
63
    def scrape(self,soup):
65
    def scrape(self):
64
        div = soup.find('div',{'class':'pd_name clearfix'})
66
        div = self.soup.find('div',{'class':'pd_name clearfix'})
65
        scin =  div['data-id']
67
        scin =  div['data-id']
-
 
68
        thumbnailUrl = ""
-
 
69
        try:
-
 
70
            if self.findThumbnail:
-
 
71
                imgTag =  self.soup.find('div',{'class':'pd-image'})['style']
-
 
72
                style = cssutils.parseStyle(imgTag)
-
 
73
                thumbnailUrl =  style['background'][style['background'].index('(')+1:style['background'].rfind(')')]
-
 
74
        except:
-
 
75
            pass
66
        div2 = soup.find('div',{'class':'pd-price-cont clearfix'})
76
        div2 = self.soup.find('div',{'class':'pd-price-cont clearfix'})
67
        try:
77
        try:
68
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
78
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
69
        except:
79
        except:
70
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
80
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
71
        inStock = 1
81
        inStock = 1
Line 78... Line 88...
78
        try:
88
        try:
79
            if div2.find('div',{'class':'info clearfix'}):
89
            if div2.find('div',{'class':'info clearfix'}):
80
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].string
90
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].string
81
        except:
91
        except:
82
            print "Unable to parse coupon code"
92
            print "Unable to parse coupon code"
83
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon}
93
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon, "thumbnail":thumbnailUrl}
84
            
94
            
85
            
95
            
86
 
96
 
87
if __name__ == '__main__':
97
if __name__ == '__main__':
88
    import datetime
98
    import datetime