Subversion Repositories SmartDukaan

Rev

Rev 12410 | Rev 12412 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
12363 kshitij.so 1
from BeautifulSoup import BeautifulSoup, NavigableString
2
import re
3
import sys
4
import  datetime
5
import grequests
6
import re
7
 
8
invalid_tags = ['b', 'i', 'u']
9
bestSellers = []
10
 
11
def strip_tags(html, invalid_tags):
    """Parse *html* and replace every tag whose name is in *invalid_tags*
    with its (recursively stripped) text content.

    html         -- markup string to clean
    invalid_tags -- list of tag names (e.g. ['b', 'i', 'u']) to flatten
    Returns the BeautifulSoup parse tree with the offending tags flattened.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    # Search directly for the tags we intend to remove instead of walking
    # every node with findAll(True) and filtering by name in Python.
    for tag in soup.findAll(invalid_tags):
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # Nested markup inside the invalid tag: strip it recursively
                # before flattening to text.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))
        # join() avoids the quadratic cost of repeated += concatenation.
        tag.replaceWith(u"".join(pieces))

    return soup
26
 
27
class AmazonAsyncScraper:
28
    def __init__(self):
29
        self.count_trials = 0
30
 
31
    def read(self, urls, findStore):
32
        returnMap = {}
33
        print datetime.datetime.now()
12411 kshitij.so 34
        header = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1',
35
                  'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
36
                  'Accept-Encoding':'gzip,deflate,sdch'
37
                  }
38
 
12410 kshitij.so 39
        rs = (grequests.get(u, headers=header) for u in urls)
12363 kshitij.so 40
        for x in grequests.map(rs):
12410 kshitij.so 41
            soup = strip_tags(x.text,invalid_tags)
12363 kshitij.so 42
            for tag in soup.findAll(True):
43
                if tag.name in invalid_tags:
44
                    s = ""
45
 
46
                    for c in tag.contents:
47
                        if not isinstance(c, NavigableString):
48
                            c = strip_tags(unicode(c), invalid_tags)
49
                        s += unicode(c)
50
 
51
                    tag.replaceWith(s)
52
            x.close()
53
            sellerCount=0
54
            info = []
55
            sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
12396 kshitij.so 56
            dataLength = len(sellerData)
12410 kshitij.so 57
            print dataLength
12363 kshitij.so 58
            for data in sellerData:
59
                tempMap={}
60
                price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
61
                unitCost = float(price.replace("Rs.","").replace(",",""))
62
                shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
63
                if "FREE" in shippingCost:
64
                    shippingCost = 0
65
                else:
12402 kshitij.so 66
                    #print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
12363 kshitij.so 67
                    shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
68
 
69
                sellerColumn =  data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
70
                store=""
71
                storeUrl=""
72
                if findStore:
73
                    storeUrl = sellerColumn.find('a')['href']
74
                    temp =  sellerColumn.find('a')
75
                    store = temp.text
76
                    if len(store)==0:
77
                        print storeUrl
78
                        dom_in = storeUrl.find("www.amazon.in")
79
                        print dom_in
80
                        if dom_in ==-1:
81
                            storeUrl="http://amazon.in"+storeUrl
82
                        if storeUrl[storeUrl.rfind('/')+1:]=='AF6E3O0VE0X4D':
83
                            store = 'Saholic'
84
                    if len(store)!=0:
85
                        tempMap['isStoreFront']='True'
86
                    else:
87
                        tempMap['isStoreFront']='False'
88
                    tempMap['storeUrl'] =storeUrl
89
                asinind = x.url.index("offer-listing")
90
                refind = x.url.index("/ref=olp_sort_ps")
91
                asin = x.url[asinind+14:refind].strip()
92
                sellerCount+=1
93
                if sellerCount==1:
94
                    tempMap['sellerName'] = store.strip()
95
                    tempMap['sellerPrice'] = unitCost+shippingCost
96
                if sellerCount==2:
97
                    tempMap['sellerName'] = store.strip()
98
                    tempMap['sellerPrice'] = unitCost+shippingCost
99
                if sellerCount==3:
100
                    tempMap['sellerName'] = store.strip()
101
                    tempMap['sellerPrice'] = unitCost+shippingCost
102
                info.append(tempMap) 
12396 kshitij.so 103
                if sellerCount==3 or sellerCount==dataLength:
12363 kshitij.so 104
                    returnMap[asin] = info 
105
                    break
106
        if findStore:
107
            return self.findStoreFront(returnMap)
108
        else:
109
            return returnMap
110
 
111
    def findStoreFront(self,returnMap):
112
        storeFront={}
113
        for arr in returnMap.itervalues():
12410 kshitij.so 114
            print "arr is ",arr
12363 kshitij.so 115
            for dic in arr:
12410 kshitij.so 116
                print "dic ",dic
12363 kshitij.so 117
                if dic['isStoreFront']!='True':
118
                    storeFront[dic.get('storeUrl')] =''
119
        rs = (grequests.get(u,stream=False) for u in storeFront.keys())
120
        for x in grequests.map(rs):
121
            soup = strip_tags(x.text,invalid_tags)
122
            x.close
123
            #print x.url.rfind('&me=')
124
            #print x.url[x.url.rfind('&me='):].rfind('&')
125
            mId= x.url[x.url.rfind('&me=')+4:x.url[x.url.rfind('&me='):].rfind('&')+x.url.rfind('&me=')]
126
            sellerName = soup.title.string
127
            #print mId
128
            try:
129
                ind = sellerName.index("@ Amazon.in")
130
                sellerName = sellerName[0:ind].strip()
131
            except:
132
                try:
133
                    ind = sellerName.split(":")
134
                    sellerName = ind[1].strip()
135
                except:
136
                    sellerName =""
137
            #storeFront[re.compile('*'+mId+'.*')] = sellerName
138
            #print mId
139
            #print sellerName
140
            myRe = re.compile('.*'+mId+'.*')
141
            for key in storeFront:
142
                if myRe.match(key):
143
                    #print "Match found ",key
144
                    storeFront[key] = sellerName.strip()
145
            #storeFront.get(re.compile('.*'+mId+'.*'))
146
        for arr in returnMap.itervalues():
147
            #print "arr is ",arr
148
            for dic in arr:
149
                #print "dic ",dic
150
                if dic['isStoreFront']!='True':
151
                    dic['sellerName'] =storeFront.get(dic.get('storeUrl'))
152
                    dic['isStoreFront']='True'
153
 
154
        print "********"
155
        return returnMap
156
 
157
 
158
#        rs = (grequests.get(u,stream=False) for u in urls)
159
#        for x in grequests.map(rs):
160
        #return soup.title.string
161
 
162
 
163
if __name__ == '__main__':
164
    urls=[]
12410 kshitij.so 165
    urls.append("http://amazon.in/gp/offer-listing/B007VZFZO8/ref=olp_sort_ps")
12363 kshitij.so 166
#    asin = []
167
#    for a in amazonlisted:
168
#        asin.append(a.asin)
169
#        urls.append('http://www.amazon.in/gp/offer-listing/'+str(a.asin)+'/ref=olp_sort_ps')
170
#        if len(urls)==50:
171
#            break
172
    print urls
173
    scraper = AmazonAsyncScraper()
12396 kshitij.so 174
    'http://www.amazon.in/gp/offer-listing/B003SNIN9Q/ref=olp_sort_ps'
12363 kshitij.so 175
    print len(urls)
176
    x = scraper.read(urls,True)
177
    print x
178
    print "##################"
179
#    fetched = x.items()
180
#    print list(set(asin) - set(fetched))
181
#    for a,i in x.iteritems():
182
#        print a
183
#        for data in i:
184
#            print data
185
#        print "*********"
186
    #print scraper.createData()
187
    print datetime.datetime.now()