WebSVN – SmartDukaan – Blame – /trunk/PyDTR/src/dtr/utils/AmazonDealScraper.py

Rev	Author	Line No.	Line
14307	kshitij.so	1	`import urllib2`
		2	`from BeautifulSoup import BeautifulSoup, NavigableString`
		3	`import re`
14759	kshitij.so	4	`from dtr.utils.utils import fetchResponseUsingProxy`
14307	kshitij.so	5
# Names of the inline formatting tags that strip_tags() unwraps, i.e. replaces
# with their text content. AmazonScraper.createData passes this list along.
invalid_tags = ['b', 'i', 'u']
# NOTE(review): not referenced anywhere in the visible part of this module --
# presumably kept for external importers; confirm before removing.
bestSellers = []
		8
		9	`def strip_tags(html, invalid_tags):`
		10	`soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)`
		11
		12	`for tag in soup.findAll(True):`
		13	`if tag.name in invalid_tags:`
		14	`s = ""`
		15
		16	`for c in tag.contents:`
		17	`if not isinstance(c, NavigableString):`
		18	`c = strip_tags(unicode(c), invalid_tags)`
		19	`s += unicode(c)`
		20
		21	`tag.replaceWith(s)`
		22
		23	`return soup`
		24
		25	`class AmazonScraper:`
15210	kshitij.so	26	`def __init__(self,livePricing=None):`
14307	kshitij.so	27	`self.count_trials = 0`
15211	kshitij.so	28	`self.livePricing = livePricing`
14307	kshitij.so	29
		30	`def read(self, url):`
		31	`response_data = ""`
		32	`try:`
15211	kshitij.so	33	`response_data = fetchResponseUsingProxy(url, livePricing=self.livePricing)`
14759	kshitij.so	34	`except Exception as e:`
14307	kshitij.so	35	`print 'ERROR: ', e`
		36	`print 'Retrying'`
		37	`self.count_trials += 1`
		38
15156	kshitij.so	39	`if self.count_trials < 5:`
14307	kshitij.so	40	`return self.read(url)`
		41
		42	`self.response_data=response_data`
15154	kshitij.so	43	`if "Server Busy" in self.response_data:`
		44	`print "Server busy...Ahhhhh"`
		45	`self.count_trials += 1`
		46	`return self.read(url)`
14307	kshitij.so	47	`return self.createData()`
		48
		49	`def createData(self):`
		50	`self.soup = strip_tags(self.response_data,invalid_tags)`
		51	`self.response_data =None`
		52	`return self.scrape(self.soup)`
		53
		54
		55	`def scrape(self,soup):`
		56	`try:`
		57	`sellerData = soup.find("span" , {"id" : "priceblock_dealprice"})`
		58	`dealPrice = float(sellerData.text.replace("Rs.","").replace(",",""))`
17246	kshitij.so	59	`print dealPrice`
14307	kshitij.so	60	`except:`
		61	`dealPrice = 0.0`
		62	`try:`
17246	kshitij.so	63	`dealStatus = soup.find('span',{'id':re.compile('dealStatusAvailability_*')})`
14307	kshitij.so	64	`dealStatus = float(dealStatus.text.replace("%","").replace(",",""))`
		65	`except:`
		66	`dealStatus = 100`
		67
17246	kshitij.so	68
17247	kshitij.so	69	`if dealStatus < 100 and dealPrice > 0:`
14307	kshitij.so	70	`return dealPrice`
		71	`else:`
		72	`return 0.0`
		73
		74	`if __name__ == '__main__':`
15211	kshitij.so	75	`scraper = AmazonScraper(True)`
17246	kshitij.so	76	`print scraper.read('http://www.amazon.in/dp/B015CSIA38/ref=gbdp_vlo_61287013_B015CSIA38?_encoding=UTF8&smid=A2WBY8FP973J47')`
14307	kshitij.so	77

Subversion Repositories SmartDukaan

(root)/trunk/PyDTR/src/dtr/utils/AmazonDealScraper.py – Rev 17247