Subversion Repositories SmartDukaan

Rev

Rev 5291 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
3232 varun.gupt 1
'''
2
Created on 24-Aug-2011
3
 
4
@author: Varun Gupta
5
'''
6
from BeautifulSoup import BeautifulSoup
4039 varun.gupt 7
from BaseScraper import BaseScraper
4198 varun.gupt 8
from Utils import removePriceFormatting
3232 varun.gupt 9
 
4039 varun.gupt 10
class InfibeamScraper(BaseScraper):
    '''
    Scraper for Infibeam search-result listings and single product pages.

    Typical use: call setUrl() with a listing URL, then scrape() to fetch
    and parse it, then getPhones() / getNextUrl() to read the parsed page.
    getDataFromProductPage() is independent of that state and fetches the
    URL it is given.
    '''

    # Divisor used to turn the "last item shown" counter into a page number.
    # NOTE(review): presumably the number of results Infibeam shows per
    # listing page -- confirm against the live site.
    RESULTS_PER_PAGE = 20

    def __init__(self):
        self.url = None
        self.id = None

    def setUrl(self, url):
        '''Remember the listing URL that scrape() will fetch.'''
        self.url = url

    def scrape(self):
        '''Fetch self.url through BaseScraper.read and parse it into self.soup.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def _toAscii(self, text):
        '''
        Return text as a byte string, degrading gracefully on non-ASCII input.

        str() on a unicode value that holds non-ASCII characters raises
        UnicodeEncodeError under Python 2. In that case the problem is
        reported on stdout and every character with ord >= 128 is replaced
        by a single space before converting.
        '''
        try:
            return str(text)
        except UnicodeEncodeError as e:
            # Single-argument print so the line parses (and prints the same
            # text) under both the print statement and the print function.
            print('Unicode Error %s %s' % (str(e), text))
            ascii_text = "".join([char if ord(char) < 128 else " " for char in text])
            print(ascii_text)
            return str(ascii_text)

    def getPhones(self):
        '''
        Extract every product listed on the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'source'
        (always 'infibeam'), 'in_stock' (always 1 -- stock is not read from
        the listing) and 'product_url'.

        Raises IndexError when the page contains no result list with the
        expected CSS class.
        '''
        phone_prices = []

        # Listing pages whose URL contains 'Cameras' use a different
        # CSS class on the result list than all other listings.
        if self.url.find('Cameras') != -1:
            ulTagClass = 'srch_result landscape'
        else:
            ulTagClass = 'srch_result portrait'

        result_list = self.soup.findAll('ul', {'class': ulTagClass})[0]

        for li in result_list.findAll('li'):
            name = li.find('span', {'class': 'title'}).contents[1].strip()

            try:
                price = li.find('div', {'class': 'price'}).find('span', {'class': 'normal'}).string
            except (IndexError, AttributeError):
                # No <div class="price"> / <span class="normal"> pair:
                # fall back to the last node of <span class="price">.
                price = removePriceFormatting(li.find('span', {'class': 'price'}).contents[-1].strip())

            url = li.findAll('a')[0]['href']

            # Each field is coerced on its own, so a non-ASCII character in
            # one of them can no longer abort the whole item.
            phone_prices.append({
                'name': self._toAscii(name),
                'price': removePriceFormatting(self._toAscii(price)),
                'source': 'infibeam',
                'in_stock': 1,
                'product_url': self._toAscii(url),
            })

        return phone_prices

    def getNextUrl(self):
        '''
        Return the URL of the next listing page, or None on the last page.

        The results summary holds two <b> tags: the first is read as a
        "<first>-<last>" range of the items shown, the second as the total
        number of products.
        '''
        b = self.soup.findAll('div', {'class': 'resultsSummary'})[0].findAll('b')
        current_max = int(b[0].string.split('-')[1])
        total_products = int(b[1].string)

        if current_max >= total_products:
            return None

        # Floor division keeps the page number an integer on any Python.
        next_page = 'page=%d' % (1 + current_max // self.RESULTS_PER_PAGE)

        urlDivider = self.url.find('page=')
        if urlDivider == -1:
            # No page parameter yet: append one. Slicing with -1 here would
            # silently drop the last character of the URL.
            if self.url.endswith('?') or self.url.endswith('&'):
                separator = ''
            elif '?' in self.url:
                separator = '&'
            else:
                separator = '?'
            return self.url + separator + next_page

        # Everything from the existing 'page=' onwards is replaced.
        return self.url[:urlDivider] + next_page

    def getDataFromProductPage(self, url):
        '''
        Fetch a single product page and return its details.

        Returns a dict with the keys 'product_url', 'source' (always
        'infibeam'), 'price', 'in_stock' (1 only when the status text is
        exactly 'In Stock.', otherwise 0) and 'name'.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)

        title_tag = soup.find('div', {'id': 'ib_details'}).find('h1', {'class': "fn"})
        name = title_tag.find('span', {'class': "item"}).string.strip()

        price_tag = soup.find('div', {'id': 'priceDiv'}).find('span', {'class': 'infiPrice amount price'})
        price = removePriceFormatting(price_tag.string)

        in_stock = soup.find('div', {'id': 'colors'}).find('span', {'class': "status"}).string.strip()

        data = {
            "product_url": str(url),
            "source": "infibeam",
            "price": price,
            "in_stock": 1 if in_stock == 'In Stock.' else 0,
            "name": name
        }
        return data
89
 
3232 varun.gupt 90
if __name__ == '__main__':
    # Manual smoke test: scrape one listing page, then print the products
    # found on it followed by the URL of the next page.
    scraper = InfibeamScraper()

    # To try a single product page instead:
    # print(scraper.getDataFromProductPage('http://www.infibeam.com/Mobiles/i-HTC-EVO-3D-Android-Smartphone/P-E-M-HTC-EVO-3D.html?id=Black'))

    scraper.setUrl('http://www.infibeam.com/Mobiles/search?page=5')
    scraper.scrape()
    listing = scraper.getPhones()

    print(listing)
    print(scraper.getNextUrl())