Subversion Repositories SmartDukaan

Rev

Rev 5291 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
5291 varun.gupt 1
'''
2
Created on 24-May-2012
3
 
4
@author: Varun Gupta
5
'''
6
from BeautifulSoup import BeautifulSoup
7
from BaseScraper import BaseScraper
8
from Utils import removePriceFormatting
9
import json
10
 
11
class SnapdealScraper(BaseScraper):
    '''
    Scraper for Snapdeal's JSON product-search URLs and HTML product pages.

    Usage, as exercised by this file's __main__ block: call setUrl(), then
    scrape(), then getPhones() and getNextUrl().
    '''

    # Number of result pages per category, keyed by the category segment of the
    # search URL. Class-level, so it is shared by all instances; setPageCount()
    # fills in each category once.
    pageCount = {}
    # Page size: used to turn result offsets into page numbers and as the
    # result count requested in the URLs built by getNextUrl().
    productCountPerScraping = 20

    def __init__(self):
        BaseScraper.__init__(self)
        self.url = None
        self.id = None
        self.currentPage = None
        self.category = None
        # Parsed search response; populated by scrape().
        self.json = None

    @staticmethod
    def _toStr(value):
        '''
        Return value as a native str without failing on non-ASCII text.

        On Python 2, str() of a unicode object holding non-ASCII characters
        raises UnicodeEncodeError; in that case the UTF-8 encoded byte string
        is returned instead. Values that str() can convert are unaffected.
        '''
        try:
            return str(value)
        except UnicodeEncodeError:
            return value.encode('utf-8')

    def setUrl(self, url):
        '''
        Store the search URL and derive the category and page number from it.

        The URL is split on '/': chunk 7 is taken as the category and chunk 8
        as the result offset, i.e. URLs shaped like
        http://www.snapdeal.com/json/product/get/search/<category>/<offset>/<count>?...
        Raises IndexError or ValueError when the URL does not have that shape.
        '''
        self.url = url
        urlChunks = url.split('/')

        self.category = urlChunks[7]
        offset = int(urlChunks[8])
        # Floor division keeps the page number an integer whether or not true
        # division is in effect.
        self.currentPage = 1 + offset // SnapdealScraper.productCountPerScraping

    def scrape(self):
        '''
        Read self.url via BaseScraper.read, parse the result as JSON into
        self.json and record the category's page count.
        '''
        response = BaseScraper.read(self, self.url)
        self.json = json.loads(response)
        self.setPageCount()

    def getPhones(self):
        '''
        Return one dict per product in the last scraped response, with the keys
        name, price, source, product_url and in_stock (1 unless sold out).
        '''
        return [
            {
                'name': self._toStr(product['name']),
                'price': product['voucherPrice'],
                'source': 'snapdeal',
                'product_url': self._toStr(product['pageUrl']),
                'in_stock': int(not product['soldOut']),
            }
            for product in self.json['productResponseDTO']['productDtos']
        ]

    def getNextUrl(self):
        '''
        Return the search URL for the page after the current one, or None when
        the current page is the last.

        Raises KeyError if no page count has been recorded for the category
        yet (setPageCount() records it, and scrape() calls that).
        '''
        pageSize = SnapdealScraper.productCountPerScraping
        if self.currentPage < SnapdealScraper.pageCount[self.category]:
            # Pages are 1-based, so currentPage * pageSize is the offset of the
            # first product on the following page.
            return 'http://www.snapdeal.com/json/product/get/search/%s/%s/%s?q=&sort=plrty&keyword=' % (
                self.category, self.currentPage * pageSize, pageSize)
        return None

    def setPageCount(self):
        '''
        Record the number of result pages for the current category, unless a
        count for that category is already stored.
        '''
        if self.category in SnapdealScraper.pageCount:
            return
        resultCount = int(self.json['productResponseDTO']['numberFound'])
        pageSize = SnapdealScraper.productCountPerScraping
        # Ceiling division: a total that is an exact multiple of the page size
        # must not be counted as having one extra (empty) page.
        SnapdealScraper.pageCount[self.category] = (resultCount + pageSize - 1) // pageSize

    def getDataFromProductPage(self, url):
        '''
        Read a product page via BaseScraper.read and return a dict with its
        name and price, taken from the h1 inside div.prodtitle-head and from
        span#selling-price-id respectively.

        in_stock is always reported as 1. The price text is passed through
        removePriceFormatting (presumably strips currency formatting; see Utils).
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        titleText = soup.find('div', {'class': 'prodtitle-head'}).find('h1').string.strip()
        name = self._toStr(titleText)
        priceText = soup.find('span', {'id': 'selling-price-id'}).string.strip()
        price = removePriceFormatting(self._toStr(priceText))

        return {
            "product_url": self._toStr(url),
            "source": "snapdeal",
            "price": price,
            "in_stock": 1,
            "name": name,
        }
73
 
74
if __name__ == '__main__':
    # Manual smoke test: scrape one search-result page (category 175,
    # offset 480, 20 results) and show the parsed products and the next URL.
    scraper = SnapdealScraper()
    scraper.setUrl('http://www.snapdeal.com/json/product/get/search/175/480/20?q=&sort=plrty&keyword=')
    scraper.scrape()
    # Single-argument print(...) parses and prints the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print(scraper.getPhones())
    print(scraper.getNextUrl())
    #print(scraper.getDataFromProductPage('http://www.snapdeal.com/product/mobiles-mobile-phones/sony-ericsson-xperia-mini-pro-sk17i-black-10052?pos=31;493'))