Subversion Repositories SmartDukaan

Rev

Rev 14157 | Rev 14175 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 14157 Rev 14168
Line 7... Line 7...
7
    def __init__(self):
7
    def __init__(self):
8
        self.count_trials = 0
8
        self.count_trials = 0
9
        self.redirectCount = 0
9
        self.redirectCount = 0
10
    
10
    
11
    def read(self, url):
11
    def read(self, url):
12
        #url = url.replace('www.flipkart.com','163.53.77.21')
-
 
13
        url = url.replace('www.flipkart.com','163.53.76.55')
-
 
14
        print url
-
 
15
        request = urllib2.Request(url)
12
        request = urllib2.Request(url)
16
        request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
13
        request.add_header('Accept','text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
17
        #request.add_header('Accept-Charset','ISO-8859-1,utf-8;q=0.7,*;q=0.3')
-
 
18
        #request.add_header('Accept-Encoding','gzip,deflate,sdch')
-
 
19
        request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
14
        request.add_header('Accept-Language','en-US,en;q=0.8,hi;q=0.6')
20
        request.add_header('Connection','keep-alive')
15
        request.add_header('Connection','keep-alive')
21
        request.add_header('Cookie','T=TI141257426738726661427143281839817329423126740566618323641725716448; __sonar=7237334677420142002; __gads=ID=c8b82101a0e4f451:T=1412574724:S=ALNI_MbPMbEOZj2nAGjM54z8ZHFMqwTOTQ; FK-CMP-DATA=; SN=2.VI11FB3FB6ED9D4693A796AB8C965B3417.SI802C325AC43444858830E870C4FD3324.VS141257426735693951472.1412576209; VID=2.VI11FB3FB6ED9D4693A796AB8C965B3417.1412576209.VS141257426735693951472; NSID=2.SI802C325AC43444858830E870C4FD3324.1412576209.VI11FB3FB6ED9D4693A796AB8C965B3417; __utma=19769839.709301254.1412574234.1412574234.1412574234.1; __utmb=19769839.23.10.1412574234; __utmc=19769839; __utmz=19769839.1412574234.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_cc=true; gpv_pn=SellerListing%3AMobile%3AMicromax%20Canvas%20Fire%20A093; gpv_pn_t=no%20value; s_sq=%5B%5BB%5D%5D; s_ppv=36')
16
        request.add_header('Cookie','T=TI141257426738726661427143281839817329423126740566618323641725716448; __sonar=7237334677420142002; __gads=ID=c8b82101a0e4f451:T=1412574724:S=ALNI_MbPMbEOZj2nAGjM54z8ZHFMqwTOTQ; FK-CMP-DATA=; SN=2.VI11FB3FB6ED9D4693A796AB8C965B3417.SI802C325AC43444858830E870C4FD3324.VS141257426735693951472.1412576209; VID=2.VI11FB3FB6ED9D4693A796AB8C965B3417.1412576209.VS141257426735693951472; NSID=2.SI802C325AC43444858830E870C4FD3324.1412576209.VI11FB3FB6ED9D4693A796AB8C965B3417; __utma=19769839.709301254.1412574234.1412574234.1412574234.1; __utmb=19769839.23.10.1412574234; __utmc=19769839; __utmz=19769839.1412574234.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); s_cc=true; gpv_pn=SellerListing%3AMobile%3AMicromax%20Canvas%20Fire%20A093; gpv_pn_t=no%20value; s_sq=%5B%5BB%5D%5D; s_ppv=36')
22
        request.add_header('Cache-Control','max-age=0')
17
        request.add_header('Cache-Control','max-age=0')
23
        request.add_header('Host','www.flipkart.com')
18
        request.add_header('Host','www.flipkart.com')
Line 38... Line 33...
38
 
33
 
39
            if self.count_trials < 3:
34
            if self.count_trials < 3:
40
                return self.read(url)
35
                return self.read(url)
41
 
36
 
42
        self.response_data=response_data
37
        self.response_data=response_data
43
        return self.createData(url,redirect_url)
38
        return self.createSoup(url)
44
    
39
    
45
    def scrapeRedirectedPage(self,soup,redirect_url):
40
#    def scrapeRedirectedPage(self,soup,redirect_url):
46
        print soup
41
#        print soup
47
        print redirect_url
42
#        print redirect_url
48
        t = soup.find("div" , {"class" : "seller-table fk-user-select-none line"})
43
#        t = soup.find("div" , {"class" : "seller-table fk-user-select-none line"})
49
        print t
44
#        print t
50
        table_rows = t.findAll("tr" , {"class" : re.compile('t-row.*')})
45
#        table_rows = t.findAll("tr" , {"class" : re.compile('t-row.*')})
51
        print table_rows
46
#        print table_rows
52
        for x in table_rows:
47
#        for x in table_rows:
53
            print x
48
#            print x
54
    
49
#    
55
    def createData(self,url, redirect_url):
50
    def createSoup(self, url):
56
        print "Creating soup from flipkart data for %s" %(url)
51
        print "Creating soup from flipkart data for %s" %(url)
57
        #redirect_url = redirect_url.replace('www.flipkart.com','163.53.77.21')
-
 
58
        print "Redirect url is %s"%(redirect_url)
-
 
59
        page=self.response_data.decode("utf-8")
52
        page=self.response_data.decode("utf-8")
60
        self.soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
53
        self.soup = BeautifulSoup(page,convertEntities=BeautifulSoup.HTML_ENTITIES)
61
        page = None
54
        page = None
62
        self.response_data = None
55
        self.response_data = None
63
        print "Soup created from flipkart data for %s" %(url)
56
        print "Soup created from flipkart data for %s" %(url)
64
        if (url==redirect_url):
-
 
65
            return self.scrape(self.soup,url)
57
        return self.scrape(self.soup,url)
66
        else:
-
 
67
            print self.redirectCount
-
 
68
            self.redirectCount+=1
-
 
69
            if self.redirectCount >5:
-
 
70
                raise
-
 
71
            return self.read(url)
-
 
72
            
-
 
73
            
-
 
74
    
-
 
75
    
58
    
76
    def scrape(self,soup,url):
59
    def scrape(self,soup,url):
77
        print "Inside json creator for %s" %(url)
60
        print "Inside json creator for %s" %(url)
78
        info = []
61
        info = []
79
        oddSeller = soup.findAll("div" , {"class" : "line seller-item odd "})
62
        oddSeller = soup.findAll("div" , {"class" : "line seller-item odd "})