Subversion Repositories SmartDukaan

Rev

Rev 12256 | Rev 12765 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
10503 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import re
11668 kshitij.so 4
from sys import exit
10503 kshitij.so 5
 
6
class FlipkartScraper:
    """Download a Flipkart seller-listing page and extract per-seller offers.

    Typical use is ``FlipkartScraper().read(url)``, which fetches the page,
    parses it with BeautifulSoup and returns the list built by ``scrape``.

    ``count_trials`` (HTTP failures) and ``redirectCount`` (responses whose
    final URL differs from the requested one) are only initialised in
    ``__init__`` and never reset, so both limits apply over the whole
    lifetime of an instance, not per URL.
    """

    # CSS class strings of the seller rows; odd rows are reported before
    # even rows, so the result is not in page order.
    _SELLER_ROW_CLASSES = ("line seller-item odd ", "line seller-item even ")
    _DELIVERABLE_RE = re.compile('fk-deliverable.*')
    _BOLD_RE = re.compile('fk-bold')
    _PRICE_RE = re.compile('pxs-final-price.*')
    _SELLER_INFO_RE = re.compile(".*seller-info*")

    def __init__(self):
        self.count_trials = 0
        self.redirectCount = 0

    def read(self, url):
        """Fetch ``url`` and return the scraped seller list.

        An ``urllib2.HTTPError`` is retried while fewer than three failures
        have been counted on this instance; once that budget is spent the
        last ``HTTPError`` is re-raised (previously execution fell through
        and died on an unassigned ``redirect_url``).

        Returns whatever ``createData`` returns for the fetched page.
        """
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0')
        try:
            response = urllib2.urlopen(request)
            try:
                response_data = response.read()
                print("Fetched response from flipkart for %s" % url)
                # Final URL after any HTTP redirects were followed.
                redirect_url = response.url
            finally:
                # Release the connection even if reading fails.
                response.close()
        except urllib2.HTTPError as e:
            print('ERROR:  %s' % (e,))
            print('Retrying')
            self.count_trials += 1
            if self.count_trials < 3:
                return self.read(url)
            # Retry budget exhausted: surface the real HTTP error.
            raise

        self.response_data = response_data
        return self.createData(url, redirect_url)

    def scrapeRedirectedPage(self, soup, redirect_url):
        """Debug helper: print the seller table rows of a redirected page.

        Only prints; returns ``None``. Returns early when the seller table
        div is not present in ``soup``.
        """
        print(soup)
        print(redirect_url)
        t = soup.find("div", {"class": "seller-table fk-user-select-none line"})
        print(t)
        if t is None:
            return
        table_rows = t.findAll("tr", {"class": re.compile('t-row.*')})
        print(table_rows)
        for x in table_rows:
            print(x)

    def createData(self, url, redirect_url):
        """Parse ``self.response_data`` and scrape it, or re-fetch on redirect.

        The raw bytes are decoded as UTF-8 and parsed into ``self.soup``;
        ``self.response_data`` is then cleared. When the response came from
        ``url`` itself the soup is scraped. Otherwise the redirect counter
        is incremented and ``url`` is fetched again.

        Raises:
            RuntimeError: when more than four redirected responses have
                been counted on this instance.
        """
        print("Creating soup from flipkart data for %s" % url)
        print(redirect_url)
        self.soup = BeautifulSoup(self.response_data.decode("utf-8"),
                                  convertEntities=BeautifulSoup.HTML_ENTITIES)
        # Drop the raw payload so only the parsed tree is kept alive.
        self.response_data = None
        print("Soup created from flipkart data for %s" % url)
        if url == redirect_url:
            return self.scrape(self.soup, url)

        print(self.redirectCount)
        self.redirectCount += 1
        if self.redirectCount > 4:
            # A bare ``raise`` has no active exception here; be explicit.
            raise RuntimeError(
                "Too many redirects for %s (last redirected to %s)"
                % (url, redirect_url))
        return self.read(url)

    def scrape(self, soup, url):
        """Return a list of seller dicts extracted from ``soup``.

        Each dict carries ``sellingPrice``, ``buyTrend``, ``sellerCode``,
        ``sellingPriceMetric``, ``shippingTime`` and ``sellerScore``, plus
        ``sellerName`` when a seller-info div is present. All odd seller
        rows come first, followed by all even rows. ``url`` is used for
        log output only.
        """
        print("Inside json creator for %s" % url)
        info = []
        for row_class in self._SELLER_ROW_CLASSES:
            for data in soup.findAll("div", {"class": row_class}):
                info.extend(self._parseSeller(data))
        print("Returning Json response from flipkart for %s" % url)
        return info

    def _parseShippingTime(self, data):
        """Return the shipping range shown in the row (e.g. '3-4') or None."""
        try:
            businessDays = data.find('span', attrs={'class': self._DELIVERABLE_RE})
            text = businessDays.find('span', attrs={'class': self._BOLD_RE}).string
            return text.replace('to', '').replace('business days.', '').strip().replace('  ', '-')
        except (AttributeError, TypeError):
            # Markup absent or without text: the caller falls back to the
            # value embedded in the listing metrics.
            return None

    def _parseBuyTrend(self, inputTags):
        """Return the text before the 'NWSR' (else 'WSR') marker, sans '_'.

        Raises ValueError when neither marker occurs in ``inputTags``.
        """
        try:
            cut = inputTags.index('NWSR')
        except ValueError:
            cut = inputTags.index('WSR')
        return inputTags[0:cut].replace('_', '')

    def _parseSeller(self, data):
        """Build the seller dict for one row; return it once per buy button.

        The returned list holds the same dict object once for every child
        of the ``fk-text-right`` div that exposes a submit input with the
        expected data attributes, and is empty when there is none.
        """
        temp = {}
        shippingTime = self._parseShippingTime(data)
        if shippingTime is not None:
            temp['shippingTime'] = shippingTime

        price = data.find('span', attrs={'class': self._PRICE_RE}).string.strip('Rs.').strip()
        temp['sellingPrice'] = float(price)

        for sellerInfo in data.findAll("div", {"class": self._SELLER_INFO_RE}):
            temp['sellerName'] = sellerInfo.find('a').string

        records = []
        container = data.find("div", {"class": "fk-text-right"})
        if container is None:
            return records

        for metrics in container:
            try:
                metric = metrics.findAll('input', {'type': 'submit'})
            except AttributeError:
                # Child without findAll (e.g. a text node).
                continue
            try:
                inputTags = metric[0]['data-lst-buytrend']
            except (TypeError, IndexError, KeyError):
                # No submit button here, or it lacks the trend attribute.
                continue
            dataMetrics = metric[0]['data-listing-metrics']

            temp['buyTrend'] = self._parseBuyTrend(inputTags)
            # ';'-separated: [0] seller code, [1] price, [3] shipping time,
            # [4] seller score.
            dataMetric = dataMetrics.split(';')
            temp['sellerCode'] = dataMetric[0]
            temp['sellingPriceMetric'] = float(dataMetric[1])
            if 'shippingTime' not in temp:
                print("Populating shipping time from metrics")
                temp['shippingTime'] = dataMetric[3]
            temp['sellerScore'] = int(dataMetric[4])
            records.append(temp)
        return records
143
 
144
if __name__ == '__main__':
    # Manual smoke test: fetch one Flipkart listing page over the network
    # and print the list of seller dicts returned by the scraper.
    flipkart = FlipkartScraper()
    print(flipkart.read('http://www.flipkart.com/ps/MOBDYFURT9PKAPSX'))