Subversion Repositories SmartDukaan

Rev

Rev 12764 | Rev 12766 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
10503 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import re
11668 kshitij.so 4
from sys import exit
10503 kshitij.so 5
 
6
class FlipkartScraper:
    """Download a Flipkart seller-listing page and extract one dict per offer.

    Typical use is ``FlipkartScraper().read(url)``: the page is fetched,
    parsed with BeautifulSoup and handed to ``scrape``, whose list of offer
    dicts is returned.

    Attributes:
        count_trials: HTTP errors seen so far for the fetch in progress.
        redirectCount: redirected responses seen so far for the fetch in
            progress.
        response_data: raw body of the latest response; set to None once it
            has been parsed.
        soup: BeautifulSoup tree of the latest parsed page (set by
            ``createData``).
    """

    # Budgets; the values match the limits that used to be hard-coded.
    MAX_FETCH_ATTEMPTS = 3
    MAX_REDIRECTS = 4

    USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0'

    # Seller rows are collected in this order (all odd rows, then all even).
    _SELLER_ROW_CLASSES = ("line seller-item odd ", "line seller-item even ")

    # Compiled once instead of once per seller row.
    _DELIVERABLE_CLASS = re.compile('fk-deliverable.*')
    _BOLD_CLASS = re.compile('fk-bold')
    _PRICE_CLASS = re.compile('pxs-final-price.*')
    _SELLER_INFO_CLASS = re.compile(".*seller-info*")

    def __init__(self):
        self.count_trials = 0
        self.redirectCount = 0

    def read(self, url):
        """Fetch *url* and return the offers scraped from the response.

        An ``urllib2.HTTPError`` is retried until ``MAX_FETCH_ATTEMPTS``
        attempts have been made.

        Args:
            url: address of the seller-listing page.

        Returns:
            The list produced by ``scrape`` (via ``createData``).

        Raises:
            urllib2.HTTPError: the last error, once the attempts are used up.
                (Previously this path failed with an UnboundLocalError.)
            RuntimeError: from ``createData`` when the response keeps being
                redirected.
        """
        # Diagnostic output only: the IP-substituted form is never requested.
        print(url.replace('http://www.flipkart.com', '163.53.77.21'))
        print(url)

        request = urllib2.Request(url)
        request.add_header('User-Agent', self.USER_AGENT)

        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            print('ERROR:  %s' % (e,))
            self.count_trials += 1
            if self.count_trials < self.MAX_FETCH_ATTEMPTS:
                print('Retrying')
                return self.read(url)
            # Out of attempts: reset so the instance can be reused, then
            # surface the real HTTP error to the caller.
            self.count_trials = 0
            raise

        self.count_trials = 0
        try:
            self.response_data = response.read()
            redirect_url = response.url
        finally:
            response.close()
        print("Fetched response from flipkart for %s" % (url))

        return self.createData(url, redirect_url)

    def scrapeRedirectedPage(self, soup, redirect_url):
        """Debugging aid: print the seller-table rows of a redirected page.

        Not called from anywhere inside this class. Prints its findings and
        returns None.
        """
        print(soup)
        print(redirect_url)
        t = soup.find("div", {"class": "seller-table fk-user-select-none line"})
        print(t)
        if t is None:
            # No seller table on this page; nothing more to dump.
            return
        table_rows = t.findAll("tr", {"class": re.compile('t-row.*')})
        print(table_rows)
        for row in table_rows:
            print(row)

    def createData(self, url, redirect_url):
        """Parse ``self.response_data`` and scrape it, or re-fetch on redirect.

        Args:
            url: the address that was requested.
            redirect_url: the address the response actually came from.

        Returns:
            The list produced by ``scrape`` when the response was not
            redirected; otherwise the result of fetching *url* again.

        Raises:
            RuntimeError: when more than ``MAX_REDIRECTS`` consecutive
                responses for *url* were redirected.
        """
        print("Creating soup from flipkart data for %s" % (url))
        print(redirect_url)
        # The decoded page is not bound to a local, so only the soup keeps it
        # alive while scraping.
        self.soup = BeautifulSoup(self.response_data.decode("utf-8"),
                                  convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.response_data = None
        print("Soup created from flipkart data for %s" % (url))

        if url == redirect_url:
            self.redirectCount = 0
            return self.scrape(self.soup, url)

        print(self.redirectCount)
        self.redirectCount += 1
        if self.redirectCount > self.MAX_REDIRECTS:
            self.redirectCount = 0
            raise RuntimeError(
                "Too many redirects for %s (last redirect: %s)" % (url, redirect_url))
        # NOTE(review): the original URL is requested again, not redirect_url;
        # presumably the redirect is expected to be transient -- confirm.
        return self.read(url)

    def scrape(self, soup, url):
        """Build the list of seller offers found in *soup*.

        Args:
            soup: parsed page exposing the BeautifulSoup ``findAll`` API.
            url: page address, used only in log output.

        Returns:
            A list of dicts. Each dict carries ``sellingPrice``, ``buyTrend``,
            ``sellerCode``, ``sellingPriceMetric``, ``shippingTime`` and
            ``sellerScore``, plus ``sellerName`` when a seller-info block with
            a link is present. Offers from odd rows come before offers from
            even rows.
        """
        print("Inside json creator for %s" % (url))
        info = []
        for row_class in self._SELLER_ROW_CLASSES:
            for data in soup.findAll("div", {"class": row_class}):
                info.extend(self._scrape_seller(data, url))
        print("Returning Json response from flipkart for %s" % (url))
        return info

    def _scrape_seller(self, data, url):
        """Return the offer entries contributed by one seller row."""
        offer = {}

        shipping_time = self._parse_shipping_time(data)
        if shipping_time is not None:
            offer['shippingTime'] = shipping_time

        price_tag = data.find('span', attrs={'class': self._PRICE_CLASS})
        offer['sellingPrice'] = self._parse_price(price_tag.string)

        for seller_info in data.findAll("div", {"class": self._SELLER_INFO_CLASS}):
            offer['sellerName'] = seller_info.find('a').string

        container = data.find("div", {"class": "fk-text-right"})
        if container is None:
            # Without the metrics block the row never produced an entry;
            # skip it instead of failing the whole page.
            print("No fk-text-right block for a seller on %s; skipping" % (url))
            return []

        entries = []
        for child in container:
            try:
                submit = child.findAll('input', {'type': 'submit'})[0]
                buy_trend_raw = submit['data-lst-buytrend']
                listing_raw = submit['data-listing-metrics']
            except (AttributeError, TypeError, KeyError, IndexError):
                # Text nodes and tags without the annotated submit button
                # carry no metrics.
                continue

            offer['buyTrend'] = self._parse_buy_trend(buy_trend_raw)
            metrics = self._parse_listing_metrics(listing_raw)
            offer['sellerCode'] = metrics['sellerCode']
            offer['sellingPriceMetric'] = metrics['sellingPriceMetric']
            if 'shippingTime' not in offer:
                print("Populating shipping time from metrics")
                offer['shippingTime'] = metrics['shippingTime']
            offer['sellerScore'] = metrics['sellerScore']
            # One entry per qualifying child, as before (normally just one).
            entries.append(offer)
        return entries

    def _parse_shipping_time(self, data):
        """Return the shipping-time label of a seller row, or None if absent."""
        business_days = data.find('span', attrs={'class': self._DELIVERABLE_CLASS})
        try:
            label = business_days.find('span', attrs={'class': self._BOLD_CLASS}).string
            return (label.replace('to', '')
                         .replace('business days.', '')
                         .strip()
                         .replace('  ', '-'))
        except (AttributeError, TypeError):
            # Missing span or empty text: the caller falls back to the
            # value carried in the listing metrics.
            return None

    @staticmethod
    def _parse_price(text):
        """Convert a price label such as ``'Rs. 1,499'`` to a float."""
        return float(text.strip('Rs.').replace(',', '').strip())

    @staticmethod
    def _parse_buy_trend(raw):
        """Return the part of *raw* before the 'NWSR' (else 'WSR') marker,
        with underscores removed.

        Raises:
            ValueError: when neither marker occurs in *raw*.
        """
        cut = raw.find('NWSR')
        if cut == -1:
            cut = raw.index('WSR')
        return raw[:cut].replace('_', '')

    @staticmethod
    def _parse_listing_metrics(raw):
        """Split a ';'-separated metrics string into named fields.

        Field 0 is the seller code, field 1 the price, field 3 the shipping
        time and field 4 the seller score.
        """
        fields = raw.split(';')
        return {
            'sellerCode': fields[0],
            'sellingPriceMetric': float(fields[1]),
            'shippingTime': fields[3],
            'sellerScore': int(fields[4]),
        }
145
 
146
if __name__ == '__main__':
    # Manual smoke run: scrape one sample listing page and print the offers.
    sample_url = 'http://www.flipkart.com/ps/MOBDYFURT9PKAPSX'
    offers = FlipkartScraper().read(sample_url)
    print(offers)