Subversion Repositories SmartDukaan


Rev 4199 → Rev 5291

@@ Line 4 @@
 @author: Varun Gupta
 '''
 from BeautifulSoup import BeautifulSoup
 from BaseScraper import BaseScraper
 from Utils import removePriceFormatting
+import json
 
 class LetsBuyScraper(BaseScraper):
     
+    pageCount = {}
+    
     def __init__(self):
         BaseScraper.__init__(self)
         self.url = None
         self.id = None
+        self.currentPage = None
+        self.category = None
     
     def setUrl(self, url):
         self.url = url
+        
+        for params in url.split('?')[1].split('&'):
+            paramName = params.split('=')[0].strip()
+            
+            if paramName == 'pg':
+                self.currentPage = int(params.split('=')[1])
+            
+            elif paramName == 'c':
+                self.category = params.split('=')[1]
+        
+        if self.currentPage is None:
+            self.currentPage = 1
     
     def scrape(self):
-        html = BaseScraper.read(self, self.url)
-        self.soup = BeautifulSoup(html)
+        str = BaseScraper.read(self, self.url)
+        self.json = json.loads(str)
+        self.setPageCount()
     
     def getPhones(self):
-        phone_prices = []
-
-        for div in self.soup.findAll('div', {'class': "detailbox"}):
-            name_tag = div('h2')[0]('a')[0]
-            name = name_tag.string.strip()
-            price = removePriceFormatting(div.findAll('span', {'class': "text12_stb"})[0].string.strip())
-            url = str(name_tag['href'])
-            try:
-                phone_prices.append({
-                        "name": str(name), 
-                        "price": str(price),
-                        'source': 'letsbuy', 
-                        "in_stock": 1, 
-                        "product_url": str(url)
-                    })
-            except UnicodeEncodeError as e:
-                print 'Unicode Error', e, name
-                name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
-                print name_ascii
-                phone_prices.append({
-                        "name": str(name_ascii), 
-                        "price": str(price),
-                        'source': 'letsbuy', 
-                        "in_stock": 1, 
-                        "product_url": str(url)
-                    })
-            
-        return phone_prices
+        phones = []
+        for product in self.json['result']:
+            phones.append({
+                        'name': str(product['products_name']),
+                        'price': product['products_price'],
+                        'source': 'letsbuy',
+                        'product_url': str(product['url']),
+                        'in_stock': int(product['product_status'])
+                    })
+        return phones
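The rewritten scrape() stops building a BeautifulSoup tree and hands the raw response to json.loads, so getPhones() can walk self.json['result'] directly. A minimal sketch of the payload shape those accessors imply, with field names taken exactly from the diff; every sample value below is invented for illustration:

import json

# Hypothetical filterResult response: 'result' holds the product dicts and
# 'resultCount' maps '0' to the total match count, per the keys read above.
sample = json.loads('''{
    "result": [{"products_name": "Samsung Galaxy Pop S5570",
                "products_price": "8499",
                "url": "http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143",
                "product_status": "1"}],
    "resultCount": {"0": "384"}
}''')

for product in sample['result']:
    print product['products_name'], product['products_price']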
@@ Line 53 @@
     
     def getNextUrl(self):
-        next_url = None
-        
-        for anchor in self.soup.findAll('a'):
-            try:
-                if anchor['title'].strip() == "Next Page":
-                    next_url = anchor['href'].strip()
-            except KeyError:
-                pass
-        
-        return next_url
+        
+        if self.currentPage < LetsBuyScraper.pageCount[self.category]:
+            return 'http://www.letsbuy.com/filterResult?c=%s&pp=192&pg=%s' % (self.category, self.currentPage + 1)
+        else: 
+            return None
 
+    def setPageCount(self):
+        if LetsBuyScraper.pageCount is None or self.category not in LetsBuyScraper.pageCount:
+            resultCount = int(self.json['resultCount']['0'])
+            LetsBuyScraper.pageCount[self.category] = 1 + int(resultCount / 192)
+    
     def getDataFromProductPage(self, url):
         html = BaseScraper.read(self, url)
         soup = BeautifulSoup(html)
         name = soup.find('h1', {'class': 'prod_name'}).string.strip()
         price = removePriceFormatting(soup.find('span',{'class': 'offer_price'}).string.strip())
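setPageCount() caches one total per category at 192 items per page (the pp=192 in every filterResult URL), and getNextUrl() bumps pg until that total is reached. A quick check of the arithmetic, assuming a hypothetical count of 384 products:

# 1 + int(384 / 192) = 3, although 384 items fill exactly 2 pages of 192;
# the formula rounds up even on exact multiples, so the last page a crawl
# requests may come back empty.
resultCount = 384
pageCount = 1 + int(resultCount / 192)
category, currentPage = '254_88', 1
while currentPage < pageCount:
    print 'http://www.letsbuy.com/filterResult?c=%s&pp=192&pg=%s' % (category, currentPage + 1)
    currentPage += 1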
@@ Line 79 @@
         return data
 
 
 if __name__ == '__main__':
     s = LetsBuyScraper()
-    print s.getDataFromProductPage('http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143')
+#    print s.getDataFromProductPage('http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143')
+#    s.setUrl('http://www.letsbuy.com/mobile-phones-mobiles-c-254_88')
+    s.setUrl('http://www.letsbuy.com/filterResult?c=254_88&pp=192&pg=7')
+    s.scrape()
     
-#    s.setUrl('http://www.letsbuy.com/mobile-phones-mobiles-c-254_88?perpage=192')
-#    s.scrape()
-#    phones = s.getPhones()
-#    print phones
-#    print s.getNextUrl()
+    print s.getPhones()
+    print s.getNextUrl()
 
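The new __main__ block fetches a single page (pg=7). Chaining the same calls until getNextUrl() returns None gives a whole-category crawl; a sketch using only methods shown in this diff:

s = LetsBuyScraper()
url = 'http://www.letsbuy.com/filterResult?c=254_88&pp=192&pg=1'
all_phones = []
while url is not None:
    s.setUrl(url)                    # re-parses c= and pg= from the query string
    s.scrape()                       # fetch, json.loads, cache the page count
    all_phones.extend(s.getPhones())
    url = s.getNextUrl()
print '%d phones scraped' % len(all_phones)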