
'''
Created on 24-Aug-2011

@author: Varun Gupta
'''

from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting

class FlipcartScraper(BaseScraper):
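    """Scraper for Flipkart category listing pages; also fetches individual product pages."""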
    
    def __init__(self):
        BaseScraper.__init__(self)
        self.url = None
        self.id = None
    
    def setUrl(self, url):
        self.url = url
    
    def scrape(self):
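        # Download the listing page and parse it; getPhones() and getNextUrl() read from self.soup.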
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
        self.phones = None
    
    def getPhones(self):
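        # Walk every product thumbnail on the listing page and collect name, price, URL and stock status.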
        phones = []
        for div in self.soup.findAll('div', {'class': 'fk-product-thumb fkp-medium'}):
            try:
                anchor = div.find('a', {'class': 'title tpadding5 fk-anchor-link'})
                name = anchor['title'].strip()
                price = None
                product_url = anchor['href'].strip()
                # A <b> element inside the thumbnail (presumably the "out of stock" label) marks the item as unavailable.
                in_stock = 0 if len(div.findAll('b')) > 0 else 1
                
                # The selling price span carries the "price final-price" class; spans without a class
                # attribute raise KeyError and are simply skipped.
                for span in div.findAll('span'):
                    try:
                        if span['class'].find('price final-price') > -1:
                            price = span.string.strip()
                    except KeyError:
                        pass
                try:
                    if price is None:
                        continue
                    else:
                        phones.append({
                                'name': str(name), 
                                'price': removePriceFormatting(price),
                                'source': 'flipkart', 
                                'product_url': str(product_url), 
                                'in_stock': in_stock
                            })
                
                except UnboundLocalError as e:
                    print e, name
                    print div
                    
                except UnicodeEncodeError as e:
                    # str(name) fails on non-ASCII product names: replace the offending characters
                    # with spaces and store the ASCII-only name instead.
                    print 'Unicode Error', e, name
                    name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                    print name_ascii
                    phones.append({
                            "name": str(name_ascii),
                            "price": removePriceFormatting(price),
                            'source': 'flipkart',
                            "in_stock": in_stock,
                            "product_url": str(product_url)
                        })
            except KeyError:
                pass
        self.phones = phones
        return phones
    
    def getNextUrl(self):
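        # The listing header shows a "1 - N of M results" style counter; the first <b> appears to hold
        # the highest result index on the current page and the second the total number of results.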
        tab_info = self.soup.find('div', {'class': 'unit fk-lres-header-text'})('b')
        
        current_max = int(tab_info[0].find('span').string)
        total = int(tab_info[1].string)
        
        if len(self.phones) > 0:
            category = ''
            if self.url.find('/tablet') != -1:
                category = 'mobiles/tablet-20278/'
            elif self.url.find('/all-camcorder') != -1:
                category = 'cameras/all-camcorder/'
            elif self.url.find('/all-slr') != -1:
                category = 'cameras/all-slr/'
            elif self.url.find('/all-point-shoot') != -1:
                category = 'cameras/all-point-shoot/'
            else:
                category = 'mobiles/all/'
            
            base_url = 'http://www.flipkart.com/' + category
            #base_url = 'http://www.flipkart.com/mobiles/%s' % ('all/' if self.phones[0]['product_url'].find('/tablets/') == -1 else 'tablet-20278/')
            
            if current_max < total:
                # Listing pages show 20 results each, so the next page number is
                # 1 + current_max / 20 (integer division under Python 2).
                return base_url + str(1 + (current_max / 20))
            else:
                return None
        else:
            return None

    def getDataFromProductPage(self, url):
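        # Fetch a single product page and pull the name and price out of the product markup.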
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'itemprop': 'name'}).string.strip()
        price = soup.find('span', {'id': 'fk-mprod-our-id'}).contents[2]
        # Stock status is not parsed from the product page; the item is assumed to be in stock.
        in_stock = 1
        
        data = {
            "product_url": str(url), 
            "source": "flipkart", 
            "price": price, 
            "in_stock": in_stock, 
            "name": name
        }
        return data

if __name__ == '__main__':
    s = FlipcartScraper()
    data = s.getDataFromProductPage('http://www.flipkart.com/samsung-wave-ii-s8530-mobile-phone/p/itmctnexz3gyjfac?pid=MOBCTXB47XCP7Z9X&ref=eca2ea19-cde2-4bfd-a3d8-15cf737c88d3')
    print data
    
#    s.setUrl('http://www.flipkart.com/mobiles/all')
#    s.scrape()
#    phones = s.getPhones()
#    for p in phones: print p
#    print s.getNextUrl()