WebSVN – SmartDukaan – Diff – /trunk/PyProj/src/shop2020/model/v1/catalog/script/AmazonScraper.py

 class AmazonScraper:
     def __init__(self):
         self.count_trials = 0
-    def read(self, url):
+    def read(self, url, findStore):
         request = urllib2.Request(url)
         request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
         opener = urllib2.build_opener()
         response_data = ""
+        self.findStore = findStore
         try:
             response_data = opener.open(request).read()
         except urllib2.HTTPError as e:
             print 'ERROR: ', e
             else:
                 print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
                 shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
             sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
+            store=""
+            if self.findStore:
-            print "Seller info ",sellerColumn
+                print "Seller info ",sellerColumn
+                x = sellerColumn.find('a')['href']
+                print "&&&&"
+                storeUrl = x
+                store = self.findStoreFront(storeUrl)
+                try:
+                    ind = store.index("@ Amazon.in")
+                    store = store[0:ind].strip()
+                except:
+                    try:
+                        ind = store.split(":")
+                        store = ind[1].strip()
+                    except:
+                        store =""
             ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]
             print "Rating info ",ratingColumn
             print "***********************"
-            return unitCost+shippingCost
+            return unitCost+shippingCost,store
-    def getBestSellers(self,soup):
+    def findStoreFront(self,storeUrl):
-        global bestSellers
+        request = urllib2.Request(storeUrl)
-        bestSellerData = soup.findAll("div" , {"class" : "zg_itemImmersion"})
+        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
-        for data in bestSellerData:
+        opener = urllib2.build_opener()
-            temp = {}
+        response_data = ""
-            rankVal = data.find('span', attrs={'class' : 'zg_rankNumber'}).text
-            print "Rank = ",rankVal.lstrip()
+        try:
-            productUrl = data.find('a')['href']
+            response_data = opener.open(request).read()
-            print "Product URL = ",productUrl.lstrip().replace("\n","")
-            productUrl = productUrl.replace("http://www.amazon.in/","").lstrip()
-            ind = productUrl.rindex("/dp/")
-            productName = productUrl[0:productUrl.rindex("/dp/")]
+        except urllib2.HTTPError as e:
-            print "Product Name = ",productName
+            print 'ERROR: ', e
-            asin = productUrl[ind+4: productUrl.rindex("/ref=")]
+            print 'Retrying'
-            print "Asin = ",asin
+            self.count_trials += 1
-            print "**********************"
-            temp['Rank'] = rankVal.lstrip().replace(".","")
+            if self.count_trials < 3:
-            temp['Url'] = productUrl.lstrip().replace("\n","")
+                return ""
-            temp['Product Name'] = productUrl[0:productUrl.rindex("/dp/")]
+        soup = strip_tags(response_data,invalid_tags)
-            temp['Asin'] = productUrl[ind+4: productUrl.rindex("/ref=")]
+        response_data =None
-            bestSellers.append(temp)
+        return soup.title.string
 if __name__ == '__main__':
     scraper = AmazonScraper()
-    scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps')
+    scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps',True)
     print scraper.createData()

Subversion Repositories SmartDukaan

(root)/trunk/PyProj/src/shop2020/model/v1/catalog/script/AmazonScraper.py – Rev 12198 → 12256