'''
Created on 07-Sep-2011

@author: Varun Gupta
'''
from BeautifulSoup import BeautifulSoup
from BaseScraper import BaseScraper
from Utils import removePriceFormatting
from SoupSelect import select

class HS18Scraper(BaseScraper):
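    '''Scraper for mobile phone listings and product pages on homeshop18.com.'''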
    def __init__(self):
        self.url = None
        self.id = None

    def setUrl(self, url):
        self.url = url

    def scrape(self):
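        '''Download the current listing page and parse it into self.soup.'''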
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)

    def getPhones(self):
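        '''Extract a dict of name, price, source, stock flag and product URL
        for every product div on the parsed listing page.'''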
        product_prices = []

        for div in select(self.soup, "div.product_div"):  # was: self.soup.findAll('div', {'class': 'product_div'})
            try:
                anchor = div.find('p', {'class': 'product_title'})('a')[0]
                name = str(anchor['title'].strip())

                if name.endswith(' Mobile Phone'):
                    name = name.replace(' Mobile Phone', '')

                url = str(anchor['href'].strip())
                price = removePriceFormatting(str(div.findAll('span', {'class': 'product_new_price'})[0].string.strip()))

                try:
                    product_prices.append({
                            'name': name,
                            'price': price,
                            'source': 'homeshop18',
                            'in_stock': 1,
                            'product_url': url
                        })

                except UnicodeEncodeError as e:
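                    # The name contains non-ASCII characters; replace them with
                    # spaces and store an ASCII-only record instead.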
                    print 'Unicode Error', e, name
                    name_ascii = "".join([char if ord(char) < 128 else " " for char in name])
                    print name_ascii
                    product_prices.append({
                            'name': str(name_ascii),
                            'price': str(price),
                            'source': 'homeshop18',
                            'in_stock': 1,
                            'product_url': str(url)
                        })
            except Exception as e:
                print e
        return product_prices
    def getNextUrl(self):
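        '''Return the URL of the next listing page, or None on the last page.'''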
        pagination_links = self.soup.find('div', {'class': 'pagination'}).findAll('span')

        try:
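            # On the last page the final <span> carries the 'disabled_pagination' class.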
            if pagination_links[-1]['class'].strip() == 'disabled_pagination':
                return None
            else:
                return pagination_links[-1]('a')[0]['href'].strip()
        except KeyError:
            print pagination_links

    def getDataFromProductPage(self, url):
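        '''Scrape a single product page and return its data as a dict.'''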
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('h1', {'id': 'productLayoutForm:pbiName'}).string.replace('Mobile Phone', '').strip()
        price = removePriceFormatting(soup.find('span', {'id': 'productLayoutForm:OurPrice'}).string)

        data = {
            'product_url': str(url),
            'source': 'homeshop18',
            'price': price,
            'in_stock': 1,
            'name': name
        }
        return data


if __name__ == '__main__':
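    # Ad-hoc smoke test: scrape the first mobiles listing page and print results.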
    scraper = HS18Scraper()
#    print scraper.getDataFromProductPage('http://www.homeshop18.com/samsung-galaxy-note-n7000-mobile-phone/mobiles-accessories/gsm-handsets/product:16601211/cid:3027/')
    scraper.setUrl('http://www.homeshop18.com/mobiles/category:14569/')
    scraper.scrape()
    products = scraper.getPhones()
    print products
    print scraper.getNextUrl()