Rev 4198 | Rev 4203 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 07-Sep-2011

@author: Varun Gupta

Scraper for product listing pages of homeshop18.com.
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting

# Trailing text removed from listing titles before they are reported.
_NAME_SUFFIX = ' Mobile Phone'


def _toAscii(text):
    '''
    Return `text` converted with str(), falling back to an ASCII-only copy.

    When str() raises UnicodeEncodeError (non-ASCII characters under
    Python 2), every character with an ordinal >= 128 is replaced by a
    single space and a diagnostic line is printed.
    '''
    try:
        return str(text)
    except UnicodeEncodeError as e:
        text_ascii = "".join([char if ord(char) < 128 else " " for char in text])
        print('Unicode Error %s %s' % (e, text_ascii))
        return str(text_ascii)


class HS18Scraper(BaseScraper):
    '''
    Scrapes a HomeShop18 listing page: call setUrl(), then scrape(), then
    getPhones() / getNextUrl() to read the parsed page.
    '''

    def __init__(self):
        self.url = None
        self.id = None

    def setUrl(self, url):
        '''Set the listing page URL that scrape() will fetch.'''
        self.url = url

    def scrape(self):
        '''Fetch self.url through BaseScraper.read and parse it into self.soup.'''
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def getPhones(self):
        '''
        Return one dict per product box found on the parsed page.

        Each dict has the keys 'name', 'price', 'source', 'in_stock' and
        'product_url'. scrape() must have been called first so that
        self.soup exists.

        Raises IndexError if a product box lacks the title anchor or the
        price span, and KeyError if the anchor lacks 'title' or 'href'.
        '''
        product_prices = []

        for div in self.soup.findAll('div', {'class': 'product_div book_info_box'}):
            anchor = div.findAll('p', {'class': 'product_title'})[0]('a')[0]

            # The title may contain non-ASCII characters; convert it through
            # _toAscii so that one such title does not abort the whole page.
            name = _toAscii(anchor['title'].strip())

            # Drop only the trailing suffix, not occurrences inside the name.
            if name.endswith(_NAME_SUFFIX):
                name = name[:-len(_NAME_SUFFIX)]

            url = str(anchor['href'].strip())
            price_text = div.findAll('span', {'class': 'product_new_price'})[0].string.strip()
            price = removePriceFormatting(str(price_text))

            product_prices.append({
                'name': name,
                'price': price,
                'source': 'homeshop18',
                'in_stock': 1,
                'product_url': url
            })

        return product_prices

    def getNextUrl(self):
        '''
        Return the href of the link inside the last pagination span, or None.

        None is returned when the page has no pagination block, when the
        block has no spans, when the last span is marked
        'disabled_pagination', or when the last span has no 'class'
        attribute or no usable link (the spans are printed in that case).
        '''
        pagination = self.soup.find('div', {'class': 'pagination'})
        if pagination is None:
            return None

        pagination_links = pagination.findAll('span')
        if not pagination_links:
            return None

        last_link = pagination_links[-1]
        try:
            if last_link['class'].strip() == 'disabled_pagination':
                return None
            return last_link('a')[0]['href'].strip()
        except (KeyError, IndexError):
            # Unexpected markup: dump the spans for debugging and report
            # that there is no next page.
            print(pagination_links)
            return None

    def getDataFromProductPage(self, url):
        '''
        Fetch a single product page and return its details as a dict with
        the keys 'product_url', 'source', 'price', 'in_stock' and 'name'.

        NOTE(review): the element ids ('fk-mprod-our-id', 'fk-stock-info-id')
        and the "flipkart" source label look copied from a Flipkart scraper;
        confirm whether this method is meant to work on HomeShop18 pages.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'itemprop': 'name'}).string.strip()
        price = soup.find('span', {'id': 'fk-mprod-our-id'}).contents[2]
        in_stock = soup.find('div', {'id': 'fk-stock-info-id'}).string.strip()

        data = {
            "product_url": str(url),
            "source": "flipkart",
            "price": price,
            "in_stock": 1 if in_stock == 'In Stock.' else 0,
            "name": name
        }
        return data


if __name__ == '__main__':
    scraper = HS18Scraper()
    scraper.setUrl('http://www.homeshop18.com//ipads-2f-tablets/categoryid:8937/search:*/start:32/')
    scraper.scrape()
    products = scraper.getPhones()
    print(scraper.getNextUrl())
    print(products)