'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting
from SoupSelect import select

class HS18Scraper(BaseScraper):
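    '''Scraper for mobile phone listings and product pages on homeshop18.com.'''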

    def __init__(self):
        self.url = None
        self.id = None
        self.soup = None
    
    def setUrl(self, url):
        self.url = url
    
    def scrape(self):
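        '''Fetch the listing page at self.url and parse it into a BeautifulSoup tree.'''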
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
    
    def getPhones(self):
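        '''Return one dict per product on the current listing page, with keys
        name, price, source, in_stock and product_url.'''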
        product_prices = []
        
        for div in select(self.soup, "div.product_div"):
            
            anchor = div.find('p', {'class': 'product_title'})('a')[0]
            name = anchor['title'].strip()
            
            if name.endswith(' Mobile Phone'):
                name = name.replace(' Mobile Phone', '')
            
            url = str(anchor['href'].strip())
            price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))
            
            try:
                # str() raises UnicodeEncodeError for non-ASCII titles, so the
                # conversion happens inside the try block for the fallback below to catch it
                product_prices.append({
                        'name': str(name),
                        'price': price,
                        'source': 'homeshop18',
                        'in_stock': 1,
                        'product_url': url
                    })
                
            except UnicodeEncodeError as e:
                # Replace non-ASCII characters in the name with spaces and retry
                print 'Unicode Error', e, name
                name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                print name_ascii
                product_prices.append({
                        'name': str(name_ascii),
                        'price': price,
                        'source': 'homeshop18',
                        'in_stock': 1,
                        'product_url': url
                    })
        
        return product_prices
    
    def getNextUrl(self):
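        '''Return the URL of the next listing page, or None on the last page.'''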
        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')
        
        try:
            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
                # The "next" control is disabled on the last page of results
                return None
            else:
                return pagination_links[-1]('a')[0]['href'].strip()
        except KeyError:
            # The last span has no class attribute; log it and treat it as the last page
            print pagination_links
            return None

    def getDataFromProductPage(self, url):
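        '''Scrape a single product page and return its name, price and stock info as a dict.'''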
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'id': 'productLayoutForm:pbiName'}).string.replace('Mobile Phone', '').strip()
        price = removePriceFormatting(soup.find('span',{'id': 'productLayoutForm:OurPrice'}).string)
        
        data = {
            "product_url": str(url), 
            "source": "homeshop18", 
            "price": price, 
            "in_stock": 1,
            "name": name
        }
        return data


if __name__ == '__main__':
    scraper = HS18Scraper()
#    print scraper.getDataFromProductPage('http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/mobiles-accessories/gsm-handsets/product:16601211/cid:3027/')
    scraper.setUrl('http://www.homeshop18.com/mobiles/category:14569/')
    scraper.scrape()
    products = scraper.getPhones()
    print products
    print scraper.getNextUrl()