Subversion Repositories SmartDukaan

Rev

Rev 5291 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4039 varun.gupt 1
'''
2
Created on 24-Aug-2011
3
 
4
@author: Varun Gupta
5
'''
6
 
7
from BeautifulSoup import BeautifulSoup
8
from BaseScraper import BaseScraper
4198 varun.gupt 9
from Utils import removePriceFormatting
4039 varun.gupt 10
 
11
class FlipcartScraper(BaseScraper):
    '''
    Scraper for Flipkart listing pages and single product pages.

    Listing workflow: setUrl(), then scrape(), then getPhones(), then
    getNextUrl(). getNextUrl() relies on the list stored by getPhones().
    '''

    # (URL fragment, listing path) pairs, checked in order; first match wins.
    _CATEGORY_PATHS = (
        ('/tablet', 'mobiles/tablet-20278/'),
        ('/all-camcorder', 'cameras/all-camcorder/'),
        ('/all-slr', 'cameras/all-slr/'),
        ('/all-point-shoot', 'cameras/all-point-shoot/'),
    )
    # Listing path used when none of the fragments above occur in the URL.
    _DEFAULT_CATEGORY_PATH = 'mobiles/all/'
    _SITE_ROOT = 'http://www.flipkart.com/'
    # Divisor used to turn the "items shown so far" counter into a page number.
    _PAGE_SIZE = 20

    def __init__(self):
        BaseScraper.__init__(self)
        self.url = None
        self.id = None
        # Populated by scrape() / getPhones(); defined here so the attributes
        # always exist on an instance.
        self.soup = None
        self.phones = None

    def setUrl(self, url):
        '''Set the listing URL that the next scrape() call will download.'''
        self.url = url

    def scrape(self):
        '''Download self.url, parse it into self.soup and reset self.phones.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
        self.phones = None

    def _asciiStr(self, text):
        '''
        Return text converted with str(); if that raises UnicodeEncodeError,
        replace every non-ASCII character with a space, report it, and
        return the cleaned value instead.
        '''
        try:
            return str(text)
        except UnicodeEncodeError as e:
            cleaned = "".join([char if ord(char) < 128 else " " for char in text])
            # Print only the cleaned text: echoing the original could raise
            # the very same encoding error on an ASCII-only stdout.
            print('Unicode Error %s %s' % (e, cleaned))
            return str(cleaned)

    def getPhones(self):
        '''
        Extract the products from the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'source',
        'product_url' and 'in_stock'; the same list is stored in self.phones.
        Product tiles without a title anchor, without a title/href attribute
        or without a readable price are skipped.
        '''
        phones = []
        for div in self.soup.findAll('div', {'class': 'fk-product-thumb fkp-medium'}):
            anchor = div.find('a', {'class': 'title tpadding5 fk-anchor-link'})
            if anchor is None:
                # find() returns None when the tile has no such anchor.
                continue

            title = anchor.get('title')
            href = anchor.get('href')
            if title is None or href is None:
                continue
            name = title.strip()
            product_url = href.strip()

            # Any <b> element inside the tile marks the product as out of stock.
            in_stock = 0 if div.findAll('b') else 1

            # The last span whose class contains 'price final-price' wins.
            price = None
            for span in div.findAll('span'):
                css_class = span.get('class')
                if not css_class or 'price final-price' not in css_class:
                    continue
                # .string is None when the span holds nested markup.
                if span.string is not None:
                    price = span.string.strip()

            if price is None:
                continue

            phones.append({
                'name': self._asciiStr(name),
                'price': removePriceFormatting(price),
                'source': 'flipkart',
                'product_url': self._asciiStr(product_url),
                'in_stock': in_stock
            })
        self.phones = phones
        return phones

    def getNextUrl(self):
        '''
        Return the URL of the next listing page, or None.

        None is returned when getPhones() found no products (or has not been
        called), when the results header or its counters cannot be read, or
        when the first counter has reached the second one.
        '''
        if not self.phones:
            return None

        header = self.soup.find('div', {'class': 'unit fk-lres-header-text'})
        if header is None:
            return None
        tab_info = header.findAll('b')

        try:
            # First <b> carries a <span> with the running counter, second <b>
            # the total (presumably "items shown so far" / "items overall").
            current_max = int(tab_info[0].find('span').string)
            total = int(tab_info[1].string)
        except (IndexError, AttributeError, TypeError, ValueError):
            return None

        if current_max >= total:
            return None

        category = self._DEFAULT_CATEGORY_PATH
        for fragment, path in self._CATEGORY_PATHS:
            if self.url.find(fragment) != -1:
                category = path
                break

        # Floor division keeps the page number an integer on every interpreter.
        return self._SITE_ROOT + category + str(1 + (current_max // self._PAGE_SIZE))

    def getDataFromProductPage(self, url):
        '''
        Download a single product page and return its details as a dict with
        the keys 'product_url', 'source', 'price', 'in_stock' and 'name'.

        Raises AttributeError if the name heading or the price span is not
        present in the page.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'itemprop': 'name'}).string.strip()
        # NOTE(review): the price is the raw third child node of the span and
        # is not passed through removePriceFormatting, unlike getPhones().
        price = soup.find('span', {'id': 'fk-mprod-our-id'}).contents[2]
        # Availability is not read from the page; the product is reported as
        # in stock. The previous comparison of this constant against
        # 'In Stock.' always produced 0.
        in_stock = 1

        data = {
            "product_url": str(url),
            "source": "flipkart",
            "price": price,
            "in_stock": in_stock,
            "name": name
        }
        return data
4039 varun.gupt 118
 
119
if __name__ == '__main__':
    # Manual smoke test: fetch one hard-coded product page and print the result.
    product_page = 'http://www.flipkart.com/samsung-wave-ii-s8530-mobile-phone/p/itmctnexz3gyjfac?pid=MOBCTXB47XCP7Z9X&ref=eca2ea19-cde2-4bfd-a3d8-15cf737c88d3'
    scraper = FlipcartScraper()
    print(scraper.getDataFromProductPage(product_page))
4198 varun.gupt 123
 
5291 varun.gupt 124
#    s.setUrl('http://www.flipkart.com/mobiles/all')
125
#    s.scrape()
126
#    phones = s.getPhones()
127
#    for p in phones: print p
128
#    print s.getNextUrl()