Subversion Repositories SmartDukaan


Rev 4199 → Rev 5291

@@ Line 4 @@
 @author: Varun Gupta
 '''
 from BeautifulSoup import BeautifulSoup
 from BaseScraper import BaseScraper
 from Utils import removePriceFormatting
+import json
 
 class LetsBuyScraper(BaseScraper):
     
+    pageCount = {}
+    
     def __init__(self):
         BaseScraper.__init__(self)
         self.url = None
         self.id = None
+        self.currentPage = None
+        self.category = None
     
     def setUrl(self, url):
         self.url = url
+        
+        for params in url.split('?')[1].split('&'):
+            paramName = params.split('=')[0].strip()
+            
+            if paramName == 'pg':
+                self.currentPage = int(params.split('=')[1])
+            
+            elif paramName == 'c':
+                self.category = params.split('=')[1]
+        
+        if self.currentPage is None:
+            self.currentPage = 1
     
     def scrape(self):
-        html = BaseScraper.read(self, self.url)
-        self.soup = BeautifulSoup(html)
+        str = BaseScraper.read(self, self.url)
+        self.json = json.loads(str)
+        self.setPageCount()
     
     def getPhones(self):
-        phone_prices = []
-
-        for div in self.soup.findAll('div', {'class': "detailbox"}):
-            name_tag = div('h2')[0]('a')[0]
-            name = name_tag.string.strip()
-            price = removePriceFormatting(div.findAll('span', {'class': "text12_stb"})[0].string.strip())
-            url = str(name_tag['href'])
-            try:
-                phone_prices.append({
-                        "name": str(name), 
-                        "price": str(price),
-                        'source': 'letsbuy', 
-                        "in_stock": 1, 
-                        "product_url": str(url)
-                    })
-            except UnicodeEncodeError as e:
-                print 'Unicode Error', e, name
-                name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
-                print name_ascii
-                phone_prices.append({
-                        "name": str(name_ascii), 
-                        "price": str(price),
-                        'source': 'letsbuy', 
-                        "in_stock": 1, 
-                        "product_url": str(url)
-                    })
-            
-        return phone_prices
+        phones = []
+        for product in self.json['result']:
+            phones.append({
+                        'name': str(product['products_name']),
+                        'price': product['products_price'],
+                        'source': 'letsbuy',
+                        'product_url': str(product['url']),
+                        'in_stock': int(product['product_status'])
+                    })
+        return phones
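The rewritten scrape() stops building a BeautifulSoup tree and hands the raw response to json.loads, so getPhones() can walk self.json['result'] directly. A minimal sketch of the payload shape those accessors imply, with field names taken exactly from the diff; every sample value below is invented for illustration:

import json

# Hypothetical filterResult response: 'result' holds the product dicts and
# 'resultCount' maps '0' to the total match count, per the keys read above.
sample = json.loads('''{
    "result": [{"products_name": "Samsung Galaxy Pop S5570",
                "products_price": "8499",
                "url": "http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143",
                "product_status": "1"}],
    "resultCount": {"0": "384"}
}''')

for product in sample['result']:
    print product['products_name'], product['products_price']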
@@ Line 53 @@
     
     def getNextUrl(self):
-        next_url = None
-        
-        for anchor in self.soup.findAll('a'):
-            try:
-                if anchor['title'].strip() == "Next Page":
-                    next_url = anchor['href'].strip()
-            except KeyError:
-                pass
-        
-        return next_url
+        
+        if self.currentPage < LetsBuyScraper.pageCount[self.category]:
+            return 'http://www.letsbuy.com/filterResult?c=%s&pp=192&pg=%s' % (self.category, self.currentPage + 1)
+        else: 
+            return None
 
+    def setPageCount(self):
+        if LetsBuyScraper.pageCount is None or self.category not in LetsBuyScraper.pageCount:
+            resultCount = int(self.json['resultCount']['0'])
+            LetsBuyScraper.pageCount[self.category] = 1 + int(resultCount / 192)
+    
     def getDataFromProductPage(self, url):
         html = BaseScraper.read(self, url)
         soup = BeautifulSoup(html)
         name = soup.find('h1', {'class': 'prod_name'}).string.strip()
         price = removePriceFormatting(soup.find('span',{'class': 'offer_price'}).string.strip())
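setPageCount() caches one total per category at 192 items per page (the pp=192 in every filterResult URL), and getNextUrl() bumps pg until that total is reached. A quick check of the arithmetic, assuming a hypothetical count of 384 products:

# 1 + int(384 / 192) = 3, although 384 items fill exactly 2 pages of 192;
# the formula rounds up even on exact multiples, so the last page a crawl
# requests may come back empty.
resultCount = 384
pageCount = 1 + int(resultCount / 192)
category, currentPage = '254_88', 1
while currentPage < pageCount:
    print 'http://www.letsbuy.com/filterResult?c=%s&pp=192&pg=%s' % (category, currentPage + 1)
    currentPage += 1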
@@ Line 79 @@
         return data
 
 
 if __name__ == '__main__':
     s = LetsBuyScraper()
-    print s.getDataFromProductPage('http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143')
+#    print s.getDataFromProductPage('http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143')
+#    s.setUrl('http://www.letsbuy.com/mobile-phones-mobiles-c-254_88')
+    s.setUrl('http://www.letsbuy.com/filterResult?c=254_88&pp=192&pg=7')
+    s.scrape()
     
-#    s.setUrl('http://www.letsbuy.com/mobile-phones-mobiles-c-254_88?perpage=192')
-#    s.scrape()
-#    phones = s.getPhones()
-#    print phones
-#    print s.getNextUrl()
+    print s.getPhones()
+    print s.getNextUrl()
 
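The new __main__ block fetches a single page (pg=7). Chaining the same calls until getNextUrl() returns None gives a whole-category crawl; a sketch using only methods shown in this diff:

s = LetsBuyScraper()
url = 'http://www.letsbuy.com/filterResult?c=254_88&pp=192&pg=1'
all_phones = []
while url is not None:
    s.setUrl(url)                    # re-parses c= and pg= from the query string
    s.scrape()                       # fetch, json.loads, cache the page count
    all_phones.extend(s.getPhones())
    url = s.getNextUrl()
print '%d phones scraped' % len(all_phones)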