Subversion Repositories SmartDukaan

Rev

Rev 12766 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
10503 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup
3
import re
11668 kshitij.so 4
from sys import exit
10503 kshitij.so 5
 
6
class FlipkartScraper:
7
    def __init__(self):
8
        self.count_trials = 0
12764 kshitij.so 9
        self.redirectCount = 0
10503 kshitij.so 10
 
11
    def read(self, url):
12766 kshitij.so 12
        url = url.replace('www.flipkart.com','163.53.77.21')
12765 kshitij.so 13
        print url
10503 kshitij.so 14
        request = urllib2.Request(url)
12821 kshitij.so 15
        request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
16
        #request.add_header('Accept-Charset','ISO-8859-1,utf-8;q=0.7,*;q=0.3')
17
        #request.add_header('Accept-Encoding','gzip,deflate,sdch')
18
        request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
12764 kshitij.so 19
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0')
12821 kshitij.so 20
        request.add_header('Connection','keep-alive')
21
        request.add_header('Cookie','T=TI141106533261202044684051011971166779542511900764606324691282640130; __gads=ID=683ebf052dfc3143:T=1411293573:S=ALNI_MZ_Ii5vGWTfpp24h4M8eqj95_ctPA; __sonar=7756033766217071307; buyer=0; is_loggedin=1; km_lv=x; _ga=GA1.2.1763496909.1411627333; kvcd=1411645515976; km_ai=m2z93iskuj81qiid; km_ni=m2z93iskuj81qiid; TGSRC=semcmpid%3Asem_8024046704_brand_goog; GOOGSRC=semcmpid%3Asem_8024046704_brand_goog; currentSession=present; sessionCount=3; prd_day=6|1411762819830; visitCount=7; _we_wk_ss_lsf_=true; FK-CMP-DATA=; s_ppv=42; km_uq=; Tkt=67af0938; SN=2.VI45A1DC8A40884B39A24FBA0584587E3C.SI737D7515E5C94593A5DD0F9D1CFDCD20.VS141165407206939742793.1411654071; VID=2.VI45A1DC8A40884B39A24FBA0584587E3C.1411654071.VS141165407206939742793; NSID=2.SI737D7515E5C94593A5DD0F9D1CFDCD20.1411654071.VI45A1DC8A40884B39A24FBA0584587E3C; __utma=19769839.146415981.1411293538.1411647571.1411654082.5; __utmb=19769839.3.8.1411654082; __utmc=19769839; __utmz=19769839.1411647571.4.4.utmgclid=CMu2ifys_MACFQyTjgodWnMAwQ|utmccn=(not%20set)|utmcmd=(not%20set)|utmctr=(not%20provided); s_cc=true; gpv_pn=SellerListing%3AMobile%3AKarbonn%20K105s; gpv_pn_t=no%20value; s_sq=%5B%5BB%5D%5D')
22
        request.add_header('Host','www.flipkart.com')
23
        request.add_header('User-Agent','Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36')
10503 kshitij.so 24
        response_data = ""
12766 kshitij.so 25
        redirect_url = ""
10503 kshitij.so 26
        try:
12764 kshitij.so 27
            response = urllib2.urlopen(request)
28
            response_data = response.read()
12199 kshitij.so 29
            print "Fetched response from flipkart for %s" %(url)
12764 kshitij.so 30
            redirect_url = response.url
10503 kshitij.so 31
 
12821 kshitij.so 32
        except Exception as e:
10503 kshitij.so 33
            print 'ERROR: ', e
34
            print 'Retrying'
35
            self.count_trials += 1
36
 
37
            if self.count_trials < 3:
38
                return self.read(url)
39
 
40
        self.response_data=response_data
12764 kshitij.so 41
        return self.createData(url,redirect_url)
12212 kshitij.so 42
 
12764 kshitij.so 43
    def scrapeRedirectedPage(self,soup,redirect_url):
44
        print soup
45
        print redirect_url
46
        t = soup.find("div" , {"class" : "seller-table fk-user-select-none line"})
47
        print t
48
        table_rows = t.findAll("tr" , {"class" : re.compile('t-row.*')})
49
        print table_rows
50
        for x in table_rows:
51
            print x
52
 
53
    def createData(self,url, redirect_url):
12215 kshitij.so 54
        print "Creating soup from flipkart data for %s" %(url)
12766 kshitij.so 55
        redirect_url = redirect_url.replace('www.flipkart.com','163.53.77.21')
56
        print "Redirect url is %s"%(redirect_url)
10503 kshitij.so 57
        page=self.response_data.decode("utf-8")
12212 kshitij.so 58
        self.soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
11967 kshitij.so 59
        page = None
60
        self.response_data = None
12200 kshitij.so 61
        print "Soup created from flipkart data for %s" %(url)
12764 kshitij.so 62
        if (url==redirect_url):
63
            return self.scrape(self.soup,url)
64
        else:
65
            print self.redirectCount
66
            self.redirectCount+=1
12766 kshitij.so 67
            if self.redirectCount >5:
12764 kshitij.so 68
                raise
69
            return self.read(url)
70
 
71
 
12212 kshitij.so 72
 
73
 
74
    def scrape(self,soup,url):
12215 kshitij.so 75
        print "Inside json creator for %s" %(url)
10503 kshitij.so 76
        info = []
77
        oddSeller = soup.findAll("div" , {"class" : "line seller-item odd "})
78
        for data in oddSeller:
79
            temp={}
11668 kshitij.so 80
            try:
81
                businessDays = data.find('span', attrs={'class' : re.compile('fk-deliverable.*')})
82
                shippingTime = businessDays.find('span', attrs={'class' : re.compile('fk-bold')}).string.replace('to','').replace('business days.','').strip().replace('  ','-')
83
                temp['shippingTime']=shippingTime
84
            except:
85
                pass
10503 kshitij.so 86
            price = data.find('span', attrs={'class' : re.compile('pxs-final-price.*')}).string.strip('Rs.').strip()
87
            temp['sellingPrice']=float(price)
88
            for sellerInfo in data.findAll("div",{"class":re.compile(".*seller-info*")}):
89
                sellerName = sellerInfo.find('a').string
90
                temp['sellerName'] = sellerName
91
            for metrics in data.find("div",{"class":"fk-text-right"}):
92
                try:
11217 kshitij.so 93
                    metric = metrics.findAll('input', {'type': 'submit'})
94
                except AttributeError:
95
                    continue
96
                try:
97
                    inputTags = metric[0]['data-lst-buytrend']
10503 kshitij.so 98
                except TypeError:
99
                    continue
11217 kshitij.so 100
                dataMetrics = metric[0]['data-listing-metrics']
10503 kshitij.so 101
                try:
102
                    buyTrend = inputTags[0:str(inputTags).index('NWSR')].replace('_','')
103
                except ValueError:
104
                    buyTrend = inputTags[0:str(inputTags).index('WSR')].replace('_','')
105
                temp['buyTrend']=buyTrend
106
                dataMetric = dataMetrics.split(';')
107
                sellerCode = dataMetric[0]
108
                temp['sellerCode']=sellerCode
109
                temp['sellingPriceMetric'] = float(dataMetric[1])
11668 kshitij.so 110
                if not temp.has_key('shippingTime'):
111
                    print "Populating shipping time from metrics"
112
                    temp['shippingTime'] = dataMetric[3]
10503 kshitij.so 113
                temp['sellerScore'] = int(dataMetric[4])
114
                info.append(temp)
115
        evenSeller = soup.findAll("div" , {"class" : "line seller-item even "})
116
        for data in evenSeller:
117
            temp={}
118
            price = data.find('span', attrs={'class' : re.compile('pxs-final-price.*')}).string.strip('Rs.')
11668 kshitij.so 119
            try:
120
                businessDays = data.find('span', attrs={'class' : re.compile('fk-deliverable.*')})
121
                shippingTime = businessDays.find('span', attrs={'class' : re.compile('fk-bold')}).string.replace('to','').replace('business days.','').strip().replace('  ','-')
122
                temp['shippingTime']=shippingTime
123
            except:
124
                pass
10503 kshitij.so 125
            temp['sellingPrice']=float(price)
126
            for sellerInfo in data.findAll("div",{"class":re.compile(".*seller-info*")}):
127
                sellerName = sellerInfo.find('a').string
128
                temp['sellerName'] = sellerName
129
            for metrics in data.find("div",{"class":"fk-text-right"}):
130
                try:
11217 kshitij.so 131
                    metric = metrics.findAll('input', {'type': 'submit'})
132
                except AttributeError:
133
                    continue
134
                try:
135
                    inputTags = metric[0]['data-lst-buytrend']
10503 kshitij.so 136
                except TypeError:
137
                    continue
11217 kshitij.so 138
                dataMetrics = metric[0]['data-listing-metrics']
10503 kshitij.so 139
                try:
140
                    buyTrend = inputTags[0:str(inputTags).index('NWSR')].replace('_','')
141
                except ValueError:
142
                    buyTrend = inputTags[0:str(inputTags).index('WSR')].replace('_','')
143
                temp['buyTrend']=buyTrend
144
                dataMetric = dataMetrics.split(';')
145
                temp['sellerCode'] = dataMetric[0] 
146
                temp['sellingPriceMetric'] = float(dataMetric[1])
11668 kshitij.so 147
                if not temp.has_key('shippingTime'):
148
                    print "Populating shipping time from metrics"
149
                    temp['shippingTime'] = dataMetric[3]
10503 kshitij.so 150
                temp['sellerScore'] = int(dataMetric[4])
151
                info.append(temp)
12821 kshitij.so 152
        print info
12200 kshitij.so 153
        print "Returning Json response from flipkart for %s" %(url)
10503 kshitij.so 154
        return info
155
 
156
if __name__ == '__main__':
    # Ad-hoc smoke test: scrape one known product listing page.
    FlipkartScraper().read('http://www.flipkart.com/ps/MOBDY45GPWHXH9UY')