# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 4199 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper

class HS18Scraper(BaseScraper):
    '''
    Scraper for HomeShop18 category listing pages.

    Usage: setUrl() to choose the page, scrape() to download and parse it,
    then getPhones() for the products and getNextUrl() for the next page.
    '''

    # Trailing text stripped from product titles.
    _NAME_SUFFIX = ' Mobile Phone'

    def __init__(self):
        self.url = None
        self.id = None
        # Parsed page; assigned by scrape(). Declared here so the attribute
        # always exists on the instance.
        self.soup = None
    
    def setUrl(self, url):
        '''Set the URL of the listing page that scrape() will fetch.'''
        self.url = url
    
    def scrape(self):
        '''Fetch self.url through BaseScraper.read and parse it into self.soup.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
    
    def _toAscii(self, text):
        '''
        Return text converted with str(). If that conversion raises
        UnicodeEncodeError, every character with ord >= 128 is replaced by
        a space first and the problem is reported on stdout.
        '''
        try:
            return str(text)
        except UnicodeEncodeError as e:
            ascii_text = "".join([char if ord(char) < 128 else " " for char in text])
            # %r keeps the diagnostic line itself free of non-ASCII characters.
            print('Unicode Error %s %r' % (e, text))
            print(ascii_text)
            return str(ascii_text)
    
    def getPhones(self):
        '''
        Collect the products listed on the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'in_stock'
        (always 1) and 'product_url'. Name, price and URL are passed through
        _toAscii, so non-ASCII characters in them become spaces instead of
        aborting the whole page.
        '''
        product_prices = []
        suffix = self._NAME_SUFFIX
        
        for div in self.soup.findAll('div', {'class': 'product_div book_info_box'}):
            anchor = div.findAll('p', {'class': 'product_title'})[0]('a')[0]
            name = self._toAscii(anchor['title'].strip())
            
            # Remove the trailing suffix only; str.replace() would also drop
            # the same phrase if it appeared earlier in the title.
            if name.endswith(suffix):
                name = name[:-len(suffix)]
            
            url = self._toAscii(anchor['href'].strip())
            price = self._toAscii(div.findAll('span', {'class': 'product_new_price'})[0].string.strip())
            
            product_prices.append({'name': name, 'price': price, 'in_stock': 1, 'product_url': url})
        
        return product_prices
    
    def getNextUrl(self):
        '''
        Return the href found in the last <span> of the first 'pagination'
        div, stripped of surrounding whitespace.

        Returns None when that span has the class 'disabled_pagination', and
        also when the pagination div, its spans, or the link inside the last
        span are missing from the page.
        '''
        pagination_divs = self.soup.findAll('div', {'class': 'pagination'})
        if not pagination_divs:
            return None
        
        pagination_links = pagination_divs[0]('span')
        if not pagination_links:
            return None
        
        last_link = pagination_links[-1]
        
        # .get() avoids a KeyError for a span that has no class attribute.
        if last_link.get('class', '').strip() == 'disabled_pagination':
            return None
        
        anchors = last_link('a')
        if not anchors:
            return None
        
        return anchors[0]['href'].strip()

if __name__ == '__main__':
    # Manual run: scrape one HomeShop18 category page, then print the
    # next-page link followed by the extracted product records.
    hs18 = HS18Scraper()
    hs18.setUrl('http://www.homeshop18.com/ipads-2f-tablets/category:8937/')
    hs18.scrape()
    scraped_products = hs18.getPhones()
    next_page_url = hs18.getNextUrl()
    print(next_page_url)
    print(scraped_products)