'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup

from BaseScraper import BaseScraper
from Utils import removePriceFormatting
from SoupSelect import select


class HS18Scraper(BaseScraper):
    """Scraper for mobile phone listings on homeshop18.com."""

    def __init__(self):
        self.url = None
        self.id = None

    def setUrl(self, url):
        self.url = url

    def scrape(self):
        # Fetch the listing page and parse it into a soup for later extraction.
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def getPhones(self):
        # Extract name, price and product URL from every product div on the listing page.
        product_prices = []
        for div in select(self.soup, "div.product_div"):  # self.soup.findAll('div', {'class': 'product_div'})
            try:
                anchor = div.find('p', {'class': 'product_title'})('a')[0]
                name = str(anchor['title'].strip())
                if name.endswith(' Mobile Phone'):
                    name = name.replace(' Mobile Phone', '')
                url = str(anchor['href'].strip())
                price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))
                try:
                    product_prices.append({'name': name,
                                           'price': price,
                                           'source': 'homeshop18',
                                           'in_stock': 1,
                                           'product_url': url})
                except UnicodeEncodeError as e:
                    # Fall back to an ASCII-only name if the original cannot be encoded.
                    print 'Unicode Error', e, name
                    name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                    print name_ascii
                    product_prices.append({"name": str(name_ascii),
                                           "price": str(price),
                                           'source': 'homeshop18',
                                           "in_stock": 1,
                                           "product_url": str(url)})
            except Exception as e:
                print e
        return product_prices

    def getNextUrl(self):
        # Return the URL of the next results page, or None if the "next" link is disabled.
        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')
        try:
            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
                return None
            else:
                return pagination_links[-1]('a')[0]['href'].strip()
        except KeyError:
            print pagination_links

    def getDataFromProductPage(self, url):
        # Scrape name and price from a single product page.
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'id': 'productLayoutForm:pbiName'}).string.replace('Mobile Phone', '').strip()
        price = removePriceFormatting(soup.find('span', {'id': 'productLayoutForm:OurPrice'}).string)
        data = {"product_url": str(url),
                "source": "homeshop18",
                "price": price,
                "in_stock": 1,
                "name": name}
        return data


if __name__ == '__main__':
    scraper = HS18Scraper()
    # print scraper.getDataFromProductPage('http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/mobiles-accessories/gsm-handsets/product:16601211/cid:3027/')
    scraper.setUrl('http://www.homeshop18.com/mobiles/category:14569/')
    scraper.scrape()
    products = scraper.getPhones()
    print products
    print scraper.getNextUrl()