Rev 4039 | Rev 4199 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting


def _to_ascii(text):
    """Return ``text`` as a plain ``str``, blanking out non-ASCII characters.

    On Python 2, ``str()`` of a unicode string that contains non-ASCII
    characters raises ``UnicodeEncodeError``. In that case every character
    with a code point of 128 or above is replaced by a single space so the
    conversion can succeed.
    """
    try:
        return str(text)
    except UnicodeEncodeError as e:
        # %r keeps the diagnostic itself ASCII-safe; printing the raw unicode
        # text could raise the very same error on an ASCII-only stdout.
        print('Unicode Error %s %r' % (e, text))
        return str("".join([char if ord(char) < 128 else " " for char in text]))


class HS18Scraper(BaseScraper):
    """Scraper for HomeShop18 product listing pages.

    Typical use: ``setUrl()``, then ``scrape()``, then ``getPhones()`` and
    ``getNextUrl()`` to read the parsed page.
    """

    # Trailing words removed from product titles.
    NAME_SUFFIX = ' Mobile Phone'

    def __init__(self):
        # NOTE(review): BaseScraper.__init__ is not called here (it never
        # was); confirm the base class does not need initialising.
        self.url = None
        self.id = None
        # Parsed document; populated by scrape().
        self.soup = None

    def setUrl(self, url):
        """Set the URL of the listing page to fetch on the next scrape()."""
        self.url = url

    def scrape(self):
        """Fetch ``self.url`` via ``BaseScraper.read`` and parse the result.

        The parsed document is stored in ``self.soup``.
        """
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def getPhones(self):
        """Extract the products listed on the scraped page.

        Returns:
            A list of dicts, one per product box, with the keys ``name``,
            ``price``, ``source``, ``in_stock`` and ``product_url``.

        Raises:
            IndexError: if a product box lacks the title paragraph, its
                anchor, or the price span.
        """
        product_prices = []

        for div in self.soup.findAll('div', {'class': 'product_div book_info_box'}):
            title = div.findAll('p', {'class': 'product_title'})[0]
            anchor = title.findAll('a')[0]

            # The conversion to str is what can fail on non-ASCII titles, so
            # the ASCII fallback has to wrap this step (not the append below).
            name = _to_ascii(anchor['title'].strip())
            if name.endswith(self.NAME_SUFFIX):
                # Strip only the trailing suffix, not every occurrence.
                name = name[:-len(self.NAME_SUFFIX)]

            url = str(anchor['href'].strip())

            price_span = div.findAll('span', {'class': 'product_new_price'})[0]
            price = removePriceFormatting(str(price_span.string.strip()))

            product_prices.append({
                'name': name,
                'price': price,
                'source': 'homeshop18',
                'in_stock': 1,
                'product_url': url
            })

        return product_prices

    def getNextUrl(self):
        """Return the href of the next listing page, or None if there is none.

        None is returned when the pagination block or its spans are missing,
        when the last span is marked ``disabled_pagination``, or when the
        last span lacks the expected ``class``/``href`` attributes.
        """
        pagination = self.soup.find('div', {'class': 'pagination'})
        if pagination is None:
            return None

        pagination_links = pagination.findAll('span')
        if not pagination_links:
            return None

        last_link = pagination_links[-1]
        try:
            if last_link['class'].strip() == 'disabled_pagination':
                return None
            return last_link.findAll('a')[0]['href'].strip()
        except KeyError:
            # Unexpected markup: dump the spans for debugging and report
            # that no next page could be determined.
            print(pagination_links)
            return None


if __name__ == '__main__':
    scraper = HS18Scraper()
    scraper.setUrl('http://www.homeshop18.com//ipads-2f-tablets/categoryid:8937/search:*/start:32/')
    scraper.scrape()
    products = scraper.getPhones()
    print(scraper.getNextUrl())
    print(products)