# WebSVN - SmartDukaan - /trunk/PriceComparisonFramework/src/Scrapers/LetsBuyScraper.py

'''
Created on 24-Aug-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting

class LetsBuyScraper(BaseScraper):
    '''
    Scraper for phone listings and single product pages of letsbuy.

    Listing-page usage: call setUrl(url), then scrape(), then getPhones()
    and/or getNextUrl(). getDataFromProductPage(url) does not use that
    state; it downloads and parses the given URL on its own.
    '''

    def __init__(self):
        BaseScraper.__init__(self)
        self.url = None
        self.id = None
        # Parsed listing page; filled in by scrape(). Declared here so the
        # attribute exists even before the first scrape() call.
        self.soup = None

    def setUrl(self, url):
        '''Remember the listing-page URL that the next scrape() will fetch.'''
        self.url = url

    def scrape(self):
        '''Download self.url via BaseScraper.read and parse it into self.soup.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def _asciiStr(self, text):
        '''
        Return text converted with str(), blanking characters that cannot
        be ASCII-encoded.

        If str(text) raises UnicodeEncodeError, the problem is reported on
        stdout and every character whose ordinal is >= 128 is replaced by a
        single space before converting again.
        '''
        try:
            return str(text)
        except UnicodeEncodeError as e:
            # Log repr(text) rather than text itself: printing the raw
            # unicode value can raise UnicodeEncodeError again when stdout
            # has no (or an ASCII) encoding, which would abort the scrape
            # from inside the error handler.
            print('Unicode Error %s %s' % (e, repr(text)))
            ascii_text = "".join([char if ord(char) < 128 else " " for char in text])
            print(ascii_text)
            return str(ascii_text)

    def getPhones(self):
        '''
        Extract one record per <div class="detailbox"> of the scraped page.

        Requires scrape() to have been called first. Returns a list of
        dicts with the keys "name", "price", "source", "in_stock" and
        "product_url". Non-ASCII characters in a name are replaced by
        spaces (see _asciiStr).
        '''
        phone_prices = []

        for div in self.soup.findAll('div', {'class': "detailbox"}):
            # The product name and link live in the first <a> of the first <h2>.
            name_tag = div('h2')[0]('a')[0]
            name = name_tag.string.strip()
            price_tag = div.findAll('span', {'class': "text12_stb"})[0]
            price = removePriceFormatting(price_tag.string.strip())
            url = str(name_tag['href'])

            phone_prices.append({
                    "name": self._asciiStr(name),
                    "price": str(price),
                    'source': 'letsbuy',
                    "in_stock": 1,
                    "product_url": url
                })

        return phone_prices

    def getNextUrl(self):
        '''
        Return the href of the anchor titled "Next Page" on the scraped
        page, or None if there is no such anchor.

        All anchors are examined, so if several match, the last one wins.
        '''
        next_url = None

        for anchor in self.soup.findAll('a'):
            try:
                if anchor['title'].strip() == "Next Page":
                    next_url = anchor['href'].strip()
            except KeyError:
                # Anchor without a title (or without an href): not a pager link.
                pass

        return next_url

    def getDataFromProductPage(self, url):
        '''
        Download a single product page and return its data as a dict.

        The name is read from <h1 class="prod_name"> and the price from
        <span class="offer_price">. Unlike getPhones(), "name" and "price"
        are returned as obtained (not passed through str()).
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'class': 'prod_name'}).string.strip()
        price = removePriceFormatting(soup.find('span',{'class': 'offer_price'}).string.strip())

        data = {
            "product_url": str(url),
            "source": "letsbuy",
            "price": price,
            "in_stock": 1,
            "name": name
        }
        return data


if __name__ == '__main__':
    # Manual check: fetch one product page and print the parsed record.
    scraper = LetsBuyScraper()
    product_url = 'http://www.letsbuy.com/samsung-galaxy-pop-s5570-p-14143'
    print(scraper.getDataFromProductPage(product_url))

    # Listing-page variant of the same check, kept disabled:
    #    scraper.setUrl('http://www.letsbuy.com/mobile-phones-mobiles-c-254_88?perpage=192')
    #    scraper.scrape()
    #    phones = scraper.getPhones()
    #    print phones
    #    print scraper.getNextUrl()
# Subversion Repositories SmartDukaan
#
# (root)/trunk/PriceComparisonFramework/src/Scrapers/LetsBuyScraper.py - Rev 4199