WebSVN – SmartDukaan – Blame – /trunk/PyDTR/src/dtr/utils/AmazonDealScraper.py

Rev	Author	Line No.	Line
14307	kshitij.so	1	`import urllib2`
		2	`from BeautifulSoup import BeautifulSoup, NavigableString`
		3	`import re`
14759	kshitij.so	4	`from dtr.utils.utils import fetchResponseUsingProxy`
14307	kshitij.so	5
# Tag names whose markup strip_tags() removes while keeping their inner
# content; AmazonScraper.createData() passes this list to strip_tags().
invalid_tags = ['b', 'i', 'u']
# Not referenced anywhere else in the visible part of this module.
bestSellers = []
		8
		9	`def strip_tags(html, invalid_tags):`
		10	`soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)`
		11
		12	`for tag in soup.findAll(True):`
		13	`if tag.name in invalid_tags:`
		14	`s = ""`
		15
		16	`for c in tag.contents:`
		17	`if not isinstance(c, NavigableString):`
		18	`c = strip_tags(unicode(c), invalid_tags)`
		19	`s += unicode(c)`
		20
		21	`tag.replaceWith(s)`
		22
		23	`return soup`
		24
		25	`class AmazonScraper:`
		26	`def __init__(self):`
		27	`self.count_trials = 0`
		28
		29	`def read(self, url):`
		30	`response_data = ""`
		31	`try:`
14759	kshitij.so	32	`response_data = fetchResponseUsingProxy(url)`
		33	`except Exception as e:`
14307	kshitij.so	34	`print 'ERROR: ', e`
		35	`print 'Retrying'`
		36	`self.count_trials += 1`
		37
		38	`if self.count_trials < 3:`
		39	`return self.read(url)`
		40
		41	`self.response_data=response_data`
15154	kshitij.so	42	`if "Server Busy" in self.response_data:`
		43	`print "Server busy...Ahhhhh"`
		44	`self.count_trials += 1`
		45	`return self.read(url)`
14307	kshitij.so	46	`return self.createData()`
		47
		48	`def createData(self):`
		49	`self.soup = strip_tags(self.response_data,invalid_tags)`
		50	`self.response_data =None`
		51	`return self.scrape(self.soup)`
		52
		53
		54	`def scrape(self,soup):`
		55	`try:`
		56	`sellerData = soup.find("span" , {"id" : "priceblock_dealprice"})`
		57	`dealPrice = float(sellerData.text.replace("Rs.","").replace(",",""))`
		58	`except:`
		59	`dealPrice = 0.0`
		60	`try:`
		61	`dealAvailablity = soup.find('div',{'id':'deal_availability'})`
		62	`dealStatus = dealAvailablity.find('span',{'id':re.compile('dealStatusAvailability_*')})`
		63	`dealStatus = float(dealStatus.text.replace("%","").replace(",",""))`
		64	`except:`
		65	`dealStatus = 100`
		66
		67	`if dealStatus < 100 and dealPrice > 0:`
		68	`return dealPrice`
		69	`else:`
		70	`return 0.0`
		71
		72	`if __name__ == '__main__':`
		73	`scraper = AmazonScraper()`
		74	`print scraper.read('http://www.amazon.in/gp/product/B00FXLCLTO')`
		75

Subversion Repositories SmartDukaan

(root)/trunk/PyDTR/src/dtr/utils/AmazonDealScraper.py – Rev 15154