WebSVN – SmartDukaan – Diff – /trunk/PyProj/src/shop2020/model/v1/catalog/script/AmazonScraper.py

 import urllib2
 from BeautifulSoup import BeautifulSoup, NavigableString
+from dtr.utils.utils import fetchResponseUsingProxy
 import re
 import sys
 # Tag names handed to strip_tags() each time a fetched page is parsed below.
 invalid_tags = ['b', 'i', 'u']
 # Module-level list; nothing in the visible part of this file reads or writes it.
 bestSellers = []
             tag.replaceWith(s)
     return soup
 class AmazonScraper:
-    def __init__(self):
+    def __init__(self, livePricing=None):
         # count_trials is the retry counter that read() increments on each failed fetch.
         self.count_trials = 0
         # livePricing is stored untouched; read() forwards it to fetchResponseUsingProxy().
+        self.livePricing = livePricing
     def read(self, url, findStore):
-        request = urllib2.Request(url)
-        request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
-        opener = urllib2.build_opener()
         response_data = ""
         self.findStore = findStore
         try:
-            response_data = opener.open(request).read()
+            response_data = fetchResponseUsingProxy(url,livePricing=self.livePricing)
-        except urllib2.HTTPError as e:
+        except Exception as e:
             print 'ERROR: ', e
             print 'Retrying'
             self.count_trials += 1
-            if self.count_trials < 3:
+            if self.count_trials < 5:
                 return self.read(url)
         self.response_data=response_data
+        if "Server Busy" in self.response_data:
+            print "Captcha page, lets try again."
+            self.count_trials += 1
+            return self.read(url)
+        return self.createData()
     def createData(self):
         """Parse the stored response and return the scrape() result.

         The raw response is run through strip_tags() with ``invalid_tags``,
         the parsed tree is kept on ``self.soup``, and ``self.response_data``
         is cleared before scraping.
         """
         parsed = strip_tags(self.response_data, invalid_tags)
         self.soup = parsed
         self.response_data = None
         return self.scrape(parsed)
             print "Rating info ",ratingColumn
             print "***********************"
             return unitCost+shippingCost,store
     def findStoreFront(self,storeUrl):
         """Return the <title> text of the page at ``storeUrl``.

         storeUrl -- address of the page to download; it is fetched with
                     ``livePricing=None``.

         Returns "" when the fetch raises or when the parsed page has no
         <title> element; otherwise returns ``soup.title.string``.
         """
         try:
             response_data = fetchResponseUsingProxy(storeUrl,livePricing=None)
         except Exception:
             # Best-effort lookup: a failed fetch yields an empty name.
             # Exception (not a bare except) lets KeyboardInterrupt and
             # SystemExit propagate.
             return ""
         soup = strip_tags(response_data,invalid_tags)
         title = soup.title
         if title is None:
             # No <title> tag in the page: report it like a failed fetch
             # instead of raising AttributeError on None.string.
             return ""
         return title.string
 if __name__ == '__main__':
     # Manual run: scrape one hard-coded offer-listing URL with findStore=True.
     # In the newer revision read() itself returns the createData() result, so that result is printed directly.
     scraper = AmazonScraper()
-    scraper.read('http://www.amazon.in/gp/offer-listing/B006PB44NM/ref=olp_sort_ps',True)
+    print scraper.read('http://www.amazon.in/gp/offer-listing/B006PB44NM/ref=olp_sort_ps',True)
-    print scraper.createData()

Subversion Repositories SmartDukaan

(root)/trunk/PyProj/src/shop2020/model/v1/catalog/script/AmazonScraper.py – Rev 12275 → 15483