Subversion Repositories SmartDukaan

Rev

Rev 12765 | Rev 12821 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
10503 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import re
11668 kshitij.so 4
from sys import exit
10503 kshitij.so 5
 
6
class FlipkartScraper:
    """Fetch a Flipkart seller-listing page and extract per-seller offers.

    Typical use is ``FlipkartScraper().read(url)``, which downloads the page,
    parses it with BeautifulSoup and returns a list of dicts (one per seller
    row) as produced by :meth:`scrape`.

    Attributes:
        count_trials: number of HTTP errors seen for the fetch in progress.
        redirectCount: number of redirected responses seen for the fetch in
            progress.
        response_data: raw body of the last response (cleared after use).
        soup: BeautifulSoup tree of the last page that was parsed, or None.
    """

    # Requests are sent to a fixed IP address instead of the public host name.
    FLIPKART_HOST = 'www.flipkart.com'
    FLIPKART_IP = '163.53.77.21'
    USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'

    # A fetch is attempted at most MAX_TRIALS times on HTTP errors and is
    # re-issued at most MAX_REDIRECTS times when the response was redirected.
    MAX_TRIALS = 3
    MAX_REDIRECTS = 5

    # Seller rows alternate between these two CSS classes; all "odd" rows are
    # reported before all "even" rows.
    _SELLER_ROW_CLASSES = ("line seller-item odd ", "line seller-item even ")

    # Patterns are compiled once instead of once per seller row.
    _DELIVERABLE_RE = re.compile('fk-deliverable.*')
    _BOLD_RE = re.compile('fk-bold')
    _PRICE_RE = re.compile('pxs-final-price.*')
    _SELLER_INFO_RE = re.compile(".*seller-info*")
    _TABLE_ROW_RE = re.compile('t-row.*')

    def __init__(self):
        self.count_trials = 0
        self.redirectCount = 0
        self.response_data = None
        self.soup = None

    def read(self, url):
        """Download ``url`` and return the scraped seller list.

        The Flipkart host name in ``url`` is replaced by ``FLIPKART_IP``
        before the request is made.

        Args:
            url: address of the seller-listing page.

        Returns:
            The list built by :meth:`scrape`.

        Raises:
            urllib2.HTTPError: when the request still fails after
                ``MAX_TRIALS`` attempts.
            RuntimeError: when the response keeps being redirected
                (see :meth:`createData`).
        """
        url = url.replace(self.FLIPKART_HOST, self.FLIPKART_IP)
        print(url)
        request = urllib2.Request(url)
        request.add_header('User-Agent', self.USER_AGENT)
        try:
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
                print("Fetched response from flipkart for %s" % url)
                redirect_url = response.url
            finally:
                # Release the connection even if reading the body fails.
                response.close()
        except urllib2.HTTPError as e:
            print('%s %s' % ('ERROR: ', e))
            print('Retrying')
            self.count_trials += 1
            if self.count_trials < self.MAX_TRIALS:
                return self.read(url)
            # Out of attempts: surface the HTTP error instead of parsing an
            # empty body, and leave the instance reusable for the next URL.
            self.count_trials = 0
            raise

        self.count_trials = 0
        self.response_data = response_data
        return self.createData(url, redirect_url)

    def scrapeRedirectedPage(self, soup, redirect_url):
        """Print the seller table of a redirected page (debugging aid).

        Args:
            soup: parsed page to inspect.
            redirect_url: address the page was served from.

        Returns:
            None. Everything is written to standard output.
        """
        print(soup)
        print(redirect_url)
        t = soup.find("div", {"class": "seller-table fk-user-select-none line"})
        print(t)
        if t is None:
            # No seller table on this page; nothing more to show.
            return
        table_rows = t.findAll("tr", {"class": self._TABLE_ROW_RE})
        print(table_rows)
        for x in table_rows:
            print(x)

    def createData(self, url, redirect_url):
        """Parse the fetched body, or re-fetch when the response was redirected.

        Args:
            url: address that was requested (host already replaced by the IP).
            redirect_url: address the response was finally served from.

        Returns:
            The list built by :meth:`scrape`.

        Raises:
            RuntimeError: when more than ``MAX_REDIRECTS`` consecutive
                responses were redirected.
        """
        print("Creating soup from flipkart data for %s" % url)
        redirect_url = redirect_url.replace(self.FLIPKART_HOST, self.FLIPKART_IP)
        print("Redirect url is %s" % redirect_url)

        if url != redirect_url:
            # The redirected page is discarded, so it is not parsed at all.
            self.response_data = None
            print(self.redirectCount)
            self.redirectCount += 1
            if self.redirectCount > self.MAX_REDIRECTS:
                self.redirectCount = 0
                raise RuntimeError(
                    "Too many redirects while fetching %s (last redirect: %s)"
                    % (url, redirect_url))
            return self.read(url)

        self.redirectCount = 0
        page = self.response_data.decode("utf-8")
        # Drop the raw body before building the tree to keep peak memory down.
        self.response_data = None
        self.soup = BeautifulSoup(page, convertEntities=BeautifulSoup.HTML_ENTITIES)
        page = None
        print("Soup created from flipkart data for %s" % url)
        return self.scrape(self.soup, url)

    def scrape(self, soup, url):
        """Extract one dict per seller row from ``soup``.

        Each dict carries ``sellingPrice``, ``buyTrend``, ``sellerCode``,
        ``sellingPriceMetric``, ``sellerScore``, ``shippingTime`` and, when a
        seller-info block is present, ``sellerName``. Rows without a usable
        submit input are left out.

        Args:
            soup: parsed seller-listing page.
            url: address of the page; used only in log messages.

        Returns:
            List of seller dicts, "odd" rows first, then "even" rows.
        """
        print("Inside json creator for %s" % url)
        info = []
        for row_class in self._SELLER_ROW_CLASSES:
            for data in soup.findAll("div", {"class": row_class}):
                seller = self._parse_seller_row(data)
                if seller is not None:
                    info.append(seller)
        print("Returning Json response from flipkart for %s" % url)
        return info

    def _parse_seller_row(self, data):
        """Return the dict for one seller row, or None if it has no metrics."""
        temp = {}
        try:
            businessDays = data.find('span', attrs={'class': self._DELIVERABLE_RE})
            bold = businessDays.find('span', attrs={'class': self._BOLD_RE})
            temp['shippingTime'] = (bold.string.replace('to', '')
                                    .replace('business days.', '')
                                    .strip().replace('  ', '-'))
        except AttributeError:
            # Shipping text is optional (a lookup returned None); the listing
            # metrics below provide the fallback value.
            pass

        price_tag = data.find('span', attrs={'class': self._PRICE_RE})
        temp['sellingPrice'] = self._parse_price(price_tag.string)

        for sellerInfo in data.findAll("div", {"class": self._SELLER_INFO_RE}):
            temp['sellerName'] = sellerInfo.find('a').string

        actions = data.find("div", {"class": "fk-text-right"})
        if actions is None:
            return None

        for child in actions:
            try:
                inputs = child.findAll('input', {'type': 'submit'})
            except AttributeError:
                # Plain text nodes have no findAll.
                continue
            try:
                button = inputs[0]
                buy_trend_raw = button['data-lst-buytrend']
                listing_metrics = button['data-listing-metrics']
            except (IndexError, KeyError, TypeError):
                # No submit input here, or it lacks the data attributes.
                continue

            temp['buyTrend'] = self._parse_buy_trend(buy_trend_raw)
            dataMetric = listing_metrics.split(';')
            temp['sellerCode'] = dataMetric[0]
            temp['sellingPriceMetric'] = float(dataMetric[1])
            if 'shippingTime' not in temp:
                print("Populating shipping time from metrics")
                temp['shippingTime'] = dataMetric[3]
            temp['sellerScore'] = int(dataMetric[4])
            # The first usable input describes the seller; report it once.
            return temp
        return None

    def _parse_price(self, text):
        """Convert a price label such as ``'Rs. 499'`` to a float."""
        cleaned = text.strip().strip('Rs.').strip()
        # Drop thousands separators, which float() does not accept.
        return float(cleaned.replace(',', ''))

    def _parse_buy_trend(self, raw):
        """Return the text before the 'NWSR' (else 'WSR') marker, minus '_'.

        Raises:
            ValueError: when neither marker occurs in ``raw``.
        """
        # 'NWSR' must be tried first because it contains 'WSR'.
        for marker in ('NWSR', 'WSR'):
            position = raw.find(marker)
            if position != -1:
                return raw[:position].replace('_', '')
        raise ValueError("substring not found")
147
 
148
if __name__ == '__main__':
    # Manual smoke test: fetch one product's seller listing and print
    # whatever the scraper returns for it.
    scraper = FlipkartScraper()
    sellers = scraper.read('http://www.flipkart.com/ps/MOBDYFURT9PKAPSX')
    print(sellers)