# WebSVN - SmartDukaan - /trunk/PriceComparisonFramework/src/Scrapers/InfibeamScraper.py

'''
Created on 24-Aug-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting

class InfibeamScraper(BaseScraper):
    '''
    Scraper for Infibeam search-result listings and single product pages.

    Listing workflow: setUrl() -> scrape() -> getPhones() / getNextUrl().
    getDataFromProductPage() is independent of that workflow and takes the
    product URL directly.
    '''

    # Divisor that turns the highest product index shown on a listing page
    # into a page number; presumably Infibeam's page size -- TODO confirm.
    PRODUCTS_PER_PAGE = 20

    def __init__(self):
        self.url = None
        self.id = None
        # Parsed listing page; populated by scrape().
        self.soup = None

    def setUrl(self, url):
        '''Remember the listing URL that scrape() will read.'''
        self.url = url

    def scrape(self):
        '''Read self.url through BaseScraper.read and parse it into self.soup.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def _listingPrice(self, li):
        '''
        Return the price found in one listing <li>.

        Prefers the 'normal' span inside the 'price' div. When that markup is
        missing, falls back to the last child of the 'price' span, already
        passed through removePriceFormatting.
        '''
        try:
            return li.find('div', {'class': 'price'}).find('span', {'class': 'normal'}).string
        except (IndexError, AttributeError):
            # find() returned None somewhere along the chain: use the
            # alternative price markup instead.
            fallback = li.find('span', {'class': 'price'}).contents[-1].strip()
            return removePriceFormatting(fallback)

    def _asciiName(self, name):
        '''
        Return name as a byte string.

        If the name cannot be encoded as ASCII, the problem is printed and
        every non-ASCII character is replaced by a space.
        '''
        try:
            return str(name)
        except UnicodeEncodeError as e:
            print('Unicode Error %s %s' % (e, name))
            name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
            print(name_ascii)
            return str(name_ascii)

    def getPhones(self):
        '''
        Extract the products listed on the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'source',
        'in_stock' and 'product_url'. 'in_stock' is always 1 here because the
        listing markup is not inspected for availability.

        Raises IndexError when the expected results <ul> is not on the page.
        '''
        # URLs containing 'Cameras' use the landscape list layout; every other
        # URL is treated as the portrait layout.
        if self.url.find('Cameras') != -1:
            ulTagClass = 'srch_result landscape'
        else:
            ulTagClass = 'srch_result portrait'

        phone_prices = []
        for li in self.soup.findAll('ul', {'class': ulTagClass})[0]('li'):
            name = li.find('span', {'class': 'title'}).contents[1].strip()
            price = self._listingPrice(li)
            url = li.findAll('a')[0]['href']

            phone_prices.append({
                    'name': self._asciiName(name),
                    # NOTE: on the fallback path the price has already been
                    # through removePriceFormatting once; it is applied again
                    # here exactly as the code always did.
                    'price': removePriceFormatting(str(price)),
                    'source': 'infibeam',
                    'in_stock': 1,
                    'product_url': str(url)
                })

        return phone_prices

    def getNextUrl(self):
        '''
        Return the URL of the next listing page, or None on the last page.

        The page number is derived from the two <b> tags of the
        'resultsSummary' div: the upper bound of the displayed range and the
        total product count.
        '''
        b = self.soup.findAll('div', {'class': 'resultsSummary'})[0].findAll('b')
        current_max = int(b[0].string.split('-')[1])
        total_products = int(b[1].string)

        if current_max >= total_products:
            return None

        next_page = 'page=%d' % (1 + current_max // self.PRODUCTS_PER_PAGE)

        urlDivider = self.url.find('page=')
        if urlDivider == -1:
            # No page parameter yet. Slicing with -1 would silently drop the
            # last character of the URL, so append the parameter instead.
            if self.url.endswith('?') or self.url.endswith('&'):
                separator = ''
            elif '?' in self.url:
                separator = '&'
            else:
                separator = '?'
            return self.url + separator + next_page

        # Everything from 'page=' onwards is replaced, as before.
        return self.url[:urlDivider] + next_page

    def getDataFromProductPage(self, url):
        '''
        Read a single product page and return its details.

        Returns a dict with the keys 'product_url', 'source', 'price',
        'in_stock' (1 only when the status text is exactly 'In Stock.') and
        'name'.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)

        details = soup.find('div', {'id': 'ib_details'})
        name = details.find('h1', {'class': "fn"}).find('span', {'class': "item"}).string.strip()

        price_span = soup.find('div', {'id': 'priceDiv'}).find('span', {'class': 'infiPrice amount price'})
        price = removePriceFormatting(price_span.string)

        in_stock = soup.find('div', {'id': 'colors'}).find('span', {'class': "status"}).string.strip()

        data = {
            "product_url": str(url),
            "source": "infibeam",
            "price": price,
            "in_stock": 1 if in_stock == 'In Stock.' else 0,
            "name": name
        }
        return data

if __name__ == '__main__':
    # Manual smoke test: scrape one listing page and show what was found.
    scraper = InfibeamScraper()
    # To try a single product page instead:
    #    print scraper.getDataFromProductPage('http://www.infibeam.com/Mobiles/i-HTC-EVO-3D-Android-Smartphone/P-E-M-HTC-EVO-3D.html?id=Black')
    scraper.setUrl('http://www.infibeam.com/Mobiles/search?page=5')
    scraper.scrape()

    print(scraper.getPhones())
    print(scraper.getNextUrl())
# Subversion Repositories SmartDukaan
#
# (root)/trunk/PriceComparisonFramework/src/Scrapers/InfibeamScraper.py - Rev 6166