Subversion Repositories SmartDukaan

Rev

Rev 15906 | Rev 16096 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
15867 kshitij.so 1
import urllib2
2
from BeautifulSoup import BeautifulSoup, NavigableString
16004 kshitij.so 3
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
4
transformUrl
15867 kshitij.so 5
import re
6
 
7
# Inline formatting tags that strip_tags() flattens out of the fetched HTML.
invalid_tags = ['b', 'i', 'u']
8
# Module-level accumulator; not referenced anywhere in this file —
# TODO(review): confirm whether external code imports it, else remove.
bestSellers = []
9
 
10
# Browser-like request headers (Chrome-on-Android UA) sent with every fetch
# so ShopClues serves the normal product page rather than blocking the bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}
18
 
19
def strip_tags(html, invalid_tags):
    """Parse *html* and flatten away every tag whose name is in *invalid_tags*.

    Each unwanted tag is replaced in-place by the concatenation of its
    children's markup; non-text children are recursively stripped first,
    so nested formatting tags disappear as well.  Returns the resulting
    BeautifulSoup document (entities already converted).
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in soup.findAll(True):
        if node.name not in invalid_tags:
            continue
        # Rebuild this tag's contents as plain markup, recursing into
        # element children to strip any nested invalid tags.
        pieces = []
        for child in node.contents:
            if isinstance(child, NavigableString):
                pieces.append(unicode(child))
            else:
                pieces.append(unicode(strip_tags(unicode(child), invalid_tags)))
        node.replaceWith(u"".join(pieces))

    return soup
34
 
35
class ShopCluesScraper:
36
    def __init__(self, livePricing=None):
37
        self.count_trials = 0
38
        self.livePricing = livePricing
39
 
40
    def read(self, url):
41
        response_data = ""
16004 kshitij.so 42
        url = transformUrl(url,5)
43
        print url
15867 kshitij.so 44
        try:
15896 kshitij.so 45
            response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
15867 kshitij.so 46
        except Exception as e:
47
            print 'ERROR: ', e
48
            print 'Retrying'
49
            self.count_trials += 1
50
 
51
            if self.count_trials < 5:
52
                return self.read(url)
53
 
54
        self.response_data=response_data
55
        return self.createData()
56
 
57
    def createData(self):
58
        self.soup = strip_tags(self.response_data,invalid_tags)
59
        self.response_data =None
60
        return self.scrape(self.soup)
61
 
62
 
63
    def scrape(self,soup):
64
        div = soup.find('div',{'class':'pd_name clearfix'})
65
        scin =  div['data-id']
66
        div2 = soup.find('div',{'class':'pd-price-cont clearfix'})
67
        try:
68
            price =  float(removePriceFormatting(div2.find('span',{'id':'thirdPrice'}).find('span').string))
69
        except:
70
            price =  float(removePriceFormatting(div2.find('span',{'id':'sellingPrice'}).find('span').string))
71
        inStock = 1
72
        if (div2.find('div',{'class':re.compile('stock.*')}).string).strip().upper() == 'OUT OF STOCK':
73
            inStock = 0
74
        isCod = 1
75
        if div2.find('li',{'id':'iscod'}) is None:
76
            isCod = 0
77
        coupon = ""
78
        try:
79
            if div2.find('div',{'class':'info clearfix'}):
80
                coupon = div2.find('div',{'class':'info clearfix'}).findAll('span')[1].text
81
        except:
82
            print "Unable to parse coupon code"
83
        return {'scin':scin,'price':price,'inStock':inStock,'isCod':isCod,'coupon':coupon}
84
 
85
 
86
 
87
if __name__ == '__main__':
    # Ad-hoc smoke test: scrape one known product page and print the
    # wall-clock timestamps around the run to eyeball fetch latency.
    import datetime
    print datetime.datetime.now()
    scraper = ShopCluesScraper()
    print scraper.read('http://shopclues.com/samsung-galaxy-note-4-13.html')
    print datetime.datetime.now()