Subversion Repositories SmartDukaan

Rev

Rev 12198 | Rev 12275 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 12198 Rev 12256
Line 24... Line 24...
24
 
24
 
25
class AmazonScraper:
25
class AmazonScraper:
26
    def __init__(self):
26
    def __init__(self):
27
        self.count_trials = 0
27
        self.count_trials = 0
28
    
28
    
29
    def read(self, url):
29
    def read(self, url, findStore):
30
        request = urllib2.Request(url)
30
        request = urllib2.Request(url)
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
31
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
32
        opener = urllib2.build_opener()
32
        opener = urllib2.build_opener()
33
        response_data = ""
33
        response_data = ""
-
 
34
        self.findStore = findStore
34
        try:
35
        try:
35
            response_data = opener.open(request).read()
36
            response_data = opener.open(request).read()
36
            
37
            
37
        except urllib2.HTTPError as e:
38
        except urllib2.HTTPError as e:
38
            print 'ERROR: ', e
39
            print 'ERROR: ', e
Line 64... Line 65...
64
            else:
65
            else:
65
                print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
66
                print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
66
                shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
67
                shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
67
            
68
            
68
            sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
69
            sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
-
 
70
            store=""
-
 
71
            if self.findStore:
69
            print "Seller info ",sellerColumn
72
                print "Seller info ",sellerColumn
-
 
73
                x = sellerColumn.find('a')['href']
-
 
74
                print "&&&&"
-
 
75
                storeUrl = x
-
 
76
                store = self.findStoreFront(storeUrl)
-
 
77
                try:
-
 
78
                    ind = store.index("@ Amazon.in")
-
 
79
                    store = store[0:ind].strip()
-
 
80
                except:
-
 
81
                    try:
-
 
82
                        ind = store.split(":")
-
 
83
                        store = ind[1].strip()
-
 
84
                    except:
-
 
85
                        store =""
70
            ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]
86
            ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]
71
            print "Rating info ",ratingColumn
87
            print "Rating info ",ratingColumn
72
            print "***********************"
88
            print "***********************"
73
            return unitCost+shippingCost
89
            return unitCost+shippingCost,store
74
    
90
    
75
    def getBestSellers(self,soup):
91
    def findStoreFront(self,storeUrl):
76
        global bestSellers
92
        request = urllib2.Request(storeUrl)
77
        bestSellerData = soup.findAll("div" , {"class" : "zg_itemImmersion"})
93
        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
78
        for data in bestSellerData:
94
        opener = urllib2.build_opener()
79
            temp = {}
95
        response_data = ""
80
            rankVal = data.find('span', attrs={'class' : 'zg_rankNumber'}).text
-
 
81
            print "Rank = ",rankVal.lstrip()
96
        try:
82
            productUrl = data.find('a')['href']
97
            response_data = opener.open(request).read()
83
            print "Product URL = ",productUrl.lstrip().replace("\n","")
-
 
84
            productUrl = productUrl.replace("http://www.amazon.in/","").lstrip()
-
 
85
            ind = productUrl.rindex("/dp/")
98
            
86
            productName = productUrl[0:productUrl.rindex("/dp/")]
99
        except urllib2.HTTPError as e:
87
            print "Product Name = ",productName
100
            print 'ERROR: ', e
88
            asin = productUrl[ind+4: productUrl.rindex("/ref=")]
101
            print 'Retrying'
89
            print "Asin = ",asin
102
            self.count_trials += 1
90
            print "**********************"
103
            
91
            temp['Rank'] = rankVal.lstrip().replace(".","")
104
            if self.count_trials < 3:
92
            temp['Url'] = productUrl.lstrip().replace("\n","")
105
                return ""
93
            temp['Product Name'] = productUrl[0:productUrl.rindex("/dp/")]
106
        soup = strip_tags(response_data,invalid_tags)
94
            temp['Asin'] = productUrl[ind+4: productUrl.rindex("/ref=")]
107
        response_data =None
95
            bestSellers.append(temp)
108
        return soup.title.string
96
            
109
            
97
 
110
 
98
if __name__ == '__main__':
111
if __name__ == '__main__':
99
    scraper = AmazonScraper()
112
    scraper = AmazonScraper()
100
    scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps')
113
    scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps',True)
101
    print scraper.createData()
114
    print scraper.createData()
102
    
115
    
103
116