Subversion Repositories SmartDukaan

Rev

Rev 12275 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 12275 Rev 15483
Line 1... Line 1...
1
import urllib2
1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
2
from BeautifulSoup import BeautifulSoup, NavigableString
-
 
3
from dtr.utils.utils import fetchResponseUsingProxy
3
import re
4
import re
4
import sys
5
import sys
5
 
6
 
6
invalid_tags = ['b', 'i', 'u']
7
invalid_tags = ['b', 'i', 'u']
7
bestSellers = []
8
bestSellers = []
Line 21... Line 22...
21
            tag.replaceWith(s)
22
            tag.replaceWith(s)
22
 
23
 
23
    return soup
24
    return soup
24
 
25
 
25
class AmazonScraper:
26
class AmazonScraper:
26
    def __init__(self):
27
    def __init__(self, livePricing=None):
27
        self.count_trials = 0
28
        self.count_trials = 0
-
 
29
        self.livePricing = livePricing
28
    
30
    
29
    def read(self, url, findStore):
31
    def read(self, url, findStore):
30
        request = urllib2.Request(url)
-
 
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
-
 
32
        opener = urllib2.build_opener()
-
 
33
        response_data = ""
32
        response_data = ""
34
        self.findStore = findStore
33
        self.findStore = findStore
35
        try:
34
        try:
36
            response_data = opener.open(request).read()
35
            response_data = fetchResponseUsingProxy(url,livePricing=self.livePricing)
37
            
-
 
38
        except urllib2.HTTPError as e:
36
        except Exception as e:
39
            print 'ERROR: ', e
37
            print 'ERROR: ', e
40
            print 'Retrying'
38
            print 'Retrying'
41
            self.count_trials += 1
39
            self.count_trials += 1
42
            
40
            
43
            if self.count_trials < 3:
41
            if self.count_trials < 5:
44
                return self.read(url)
42
                return self.read(url)
45
        
43
        
46
        self.response_data=response_data
44
        self.response_data=response_data
-
 
45
        
-
 
46
        if "Server Busy" in self.response_data:
-
 
47
            print "Captcha page, lets try again."
-
 
48
            self.count_trials += 1
-
 
49
            return self.read(url)
-
 
50
        return self.createData()
47
    
51
    
48
    def createData(self):
52
    def createData(self):
49
        self.soup = strip_tags(self.response_data,invalid_tags)
53
        self.soup = strip_tags(self.response_data,invalid_tags)
50
        self.response_data =None
54
        self.response_data =None
51
        return self.scrape(self.soup)
55
        return self.scrape(self.soup)
Line 93... Line 97...
93
            print "Rating info ",ratingColumn
97
            print "Rating info ",ratingColumn
94
            print "***********************"
98
            print "***********************"
95
            return unitCost+shippingCost,store
99
            return unitCost+shippingCost,store
96
    
100
    
97
    def findStoreFront(self,storeUrl):
101
    def findStoreFront(self,storeUrl):
98
        request = urllib2.Request(storeUrl)
-
 
99
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
-
 
100
        opener = urllib2.build_opener()
-
 
101
        response_data = ""
-
 
102
        try:
102
        try:
103
            response_data = opener.open(request).read()
103
            response_data = fetchResponseUsingProxy(storeUrl,livePricing=None)
104
            
104
        except:
105
        except urllib2.HTTPError as e:
-
 
106
            print 'ERROR: ', e
-
 
107
            print 'Retrying'
-
 
108
            self.count_trials += 1
-
 
109
            
-
 
110
            if self.count_trials < 3:
-
 
111
                return ""
105
            return ""
112
        soup = strip_tags(response_data,invalid_tags)
106
        soup = strip_tags(response_data,invalid_tags)
113
        response_data =None
107
        response_data =None
114
        return soup.title.string
108
        return soup.title.string
115
            
109
            
116
 
110
 
117
if __name__ == '__main__':
111
if __name__ == '__main__':
118
    scraper = AmazonScraper()
112
    scraper = AmazonScraper()
119
    scraper.read('http://www.amazon.in/gp/offer-listing/B006PB44NM/ref=olp_sort_ps',True)
113
    print scraper.read('http://www.amazon.in/gp/offer-listing/B006PB44NM/ref=olp_sort_ps',True)
120
    print scraper.createData()
-
 
121
    
114
    
122
115