Subversion Repositories SmartDukaan

Rev

Rev 5291 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 24-May-2012

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting
import json

class SnapdealScraper(BaseScraper):
    """Scraper for Snapdeal's JSON product-search endpoint and product pages.

    Class attributes:
        pageCount: maps a category (the string taken from the search URL)
            to the number of result pages for it. It lives on the class,
            so every instance shares it; setPageCount() fills it in.
        productCountPerScraping: number of products requested per search
            call, i.e. the page size used in the generated URLs.
    """

    pageCount = {}
    productCountPerScraping = 20

    def __init__(self):
        BaseScraper.__init__(self)
        self.url = None
        self.id = None
        self.currentPage = None
        self.category = None

    def setUrl(self, url):
        """Store the search URL and derive the category and page number.

        The URL is split on '/': chunk 7 is taken as the category and
        chunk 8 as the offset of the first product. For example
        http://www.snapdeal.com/json/product/get/search/175/480/20?...
        yields category '175' and offset 480 (page 25 at 20 per page).

        Raises:
            IndexError: if the URL has fewer than nine '/'-separated chunks.
            ValueError: if the offset chunk is not an integer.
        """
        self.url = url
        urlChunks = url.split('/')

        self.category = urlChunks[7]
        offset = int(urlChunks[8])
        # Floor division keeps the 1-based page number an int regardless of
        # whether '/' means integer or true division in the interpreter.
        self.currentPage = 1 + offset // SnapdealScraper.productCountPerScraping

    def scrape(self):
        """Fetch self.url, parse the body as JSON into self.json and record
        the page count for the current category."""
        response = BaseScraper.read(self, self.url)
        self.json = json.loads(response)
        self.setPageCount()

    def getPhones(self):
        """Return one dict per product found in the last scraped response.

        Each dict has the keys 'name', 'price', 'source', 'product_url' and
        'in_stock' (1 when the product is not flagged 'soldOut', else 0).
        scrape() must have been called first so that self.json exists.
        """
        # NOTE(review): str() on a non-ASCII unicode value raises
        # UnicodeEncodeError under Python 2 - confirm product names and
        # URLs are always ASCII, or switch to an explicit encode.
        return [
            {
                'name': str(product['name']),
                'price': product['voucherPrice'],
                'source': 'snapdeal',
                'product_url': str(product['pageUrl']),
                'in_stock': int(not bool(product['soldOut']))
            }
            for product in self.json['productResponseDTO']['productDtos']
        ]

    def getNextUrl(self):
        """Return the search URL of the next result page, or None when the
        current page is the last one for this category.

        Raises:
            KeyError: if no page count has been recorded for the category
                (i.e. scrape()/setPageCount() has not run for it yet).
        """
        if self.currentPage < SnapdealScraper.pageCount[self.category]:
            # currentPage is 1-based, so currentPage * pageSize is the
            # offset of the first product on the following page.
            return 'http://www.snapdeal.com/json/product/get/search/%s/%s/%s?q=&sort=plrty&keyword=' % (self.category, self.currentPage * SnapdealScraper.productCountPerScraping, SnapdealScraper.productCountPerScraping)
        else:
            return None

    def setPageCount(self):
        """Record the number of result pages for the current category.

        The count is computed only once per category, from the
        'numberFound' field of the last scraped response, and cached in
        the class-level pageCount dict.
        """
        if SnapdealScraper.pageCount is None:
            # Re-create the cache instead of failing on the assignment below.
            SnapdealScraper.pageCount = {}

        if self.category not in SnapdealScraper.pageCount:
            resultCount = int(self.json['productResponseDTO']['numberFound'])
            pageSize = SnapdealScraper.productCountPerScraping
            # Ceiling division: an exact multiple of the page size must not
            # produce an extra, empty trailing page (40 results -> 2 pages).
            SnapdealScraper.pageCount[self.category] = (resultCount + pageSize - 1) // pageSize

    def getDataFromProductPage(self, url):
        """Scrape name and price from a single product page.

        Args:
            url: address of the product's HTML page.

        Returns:
            A dict with the keys 'product_url', 'source', 'price',
            'in_stock' and 'name'. 'in_stock' is always reported as 1.

        Raises:
            AttributeError: if the expected title or price element is not
                present in the page (find() returned None).
        """
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)

        titleBlock = soup.find('div', {'class': 'prodtitle-head'})
        name = str(titleBlock.find('h1').string.strip())

        priceTag = soup.find('span', {'id': 'selling-price-id'})
        price = removePriceFormatting(str(priceTag.string.strip()))

        data = {
            "product_url": str(url),
            "source": "snapdeal",
            "price": price,
            "in_stock": 1,
            "name": name
        }
        return data

if __name__ == '__main__':
    # Manual smoke test: scrape one search-result page and show the products
    # found plus the URL of the following page.
    scraper = SnapdealScraper()
    scraper.setUrl('http://www.snapdeal.com/json/product/get/search/175/480/20?q=&sort=plrty&keyword=')
    scraper.scrape()
    # A single parenthesised argument prints exactly like the bare
    # Python 2 print statement.
    print(scraper.getPhones())
    print(scraper.getNextUrl())
    #print(scraper.getDataFromProductPage('http://www.snapdeal.com/product/mobiles-mobile-phones/sony-ericsson-xperia-mini-pro-sk17i-black-10052?pos=31;493'))