| 12363 |
kshitij.so |
1 |
from BeautifulSoup import BeautifulSoup, NavigableString
|
|
|
2 |
import re
|
|
|
3 |
import sys
|
|
|
4 |
import datetime
|
|
|
5 |
import grequests
|
|
|
6 |
import re
|
|
|
7 |
|
|
|
8 |
# Inline formatting tags whose markup is removed (inner text kept) by strip_tags().
invalid_tags = ['b', 'i', 'u']
# Accumulator for best-seller records; declared here but never populated in this module.
bestSellers = []
|
|
|
10 |
|
|
|
11 |
def strip_tags(html, invalid_tags):
    """Return a BeautifulSoup tree for ``html`` with every tag named in
    ``invalid_tags`` replaced by its (recursively flattened) text content.

    Entities are decoded via BeautifulSoup's HTML_ENTITIES conversion.
    Nested non-string children are themselves stripped recursively before
    being folded into the replacement text.
    """
    soup = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for tag in soup.findAll(True):
        if tag.name not in invalid_tags:
            continue
        # Collect the flattened text of every child, recursing into any
        # child that is itself a tag rather than a plain string.
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))
        # Swap the whole tag for its accumulated text.
        tag.replaceWith(u"".join(pieces))

    return soup
|
|
|
26 |
|
|
|
27 |
class AmazonAsyncScraper:
|
|
|
28 |
def __init__(self):
|
|
|
29 |
self.count_trials = 0
|
|
|
30 |
|
|
|
31 |
def read(self, urls, findStore):
|
|
|
32 |
returnMap = {}
|
|
|
33 |
print datetime.datetime.now()
|
| 12411 |
kshitij.so |
34 |
header = {'User-Agent':'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1',
|
|
|
35 |
'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
|
|
|
36 |
'Accept-Encoding':'gzip,deflate,sdch'
|
|
|
37 |
}
|
|
|
38 |
|
| 12410 |
kshitij.so |
39 |
rs = (grequests.get(u, headers=header) for u in urls)
|
| 12363 |
kshitij.so |
40 |
for x in grequests.map(rs):
|
| 12410 |
kshitij.so |
41 |
soup = strip_tags(x.text,invalid_tags)
|
| 12363 |
kshitij.so |
42 |
for tag in soup.findAll(True):
|
|
|
43 |
if tag.name in invalid_tags:
|
|
|
44 |
s = ""
|
|
|
45 |
|
|
|
46 |
for c in tag.contents:
|
|
|
47 |
if not isinstance(c, NavigableString):
|
|
|
48 |
c = strip_tags(unicode(c), invalid_tags)
|
|
|
49 |
s += unicode(c)
|
|
|
50 |
|
|
|
51 |
tag.replaceWith(s)
|
|
|
52 |
x.close()
|
|
|
53 |
sellerCount=0
|
|
|
54 |
info = []
|
|
|
55 |
sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
|
| 12396 |
kshitij.so |
56 |
dataLength = len(sellerData)
|
| 12363 |
kshitij.so |
57 |
for data in sellerData:
|
|
|
58 |
tempMap={}
|
|
|
59 |
price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
|
|
|
60 |
unitCost = float(price.replace("Rs.","").replace(",",""))
|
|
|
61 |
shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
|
|
|
62 |
if "FREE" in shippingCost:
|
|
|
63 |
shippingCost = 0
|
|
|
64 |
else:
|
| 12402 |
kshitij.so |
65 |
#print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
|
| 12363 |
kshitij.so |
66 |
shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
|
|
|
67 |
|
|
|
68 |
sellerColumn = data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
|
|
|
69 |
store=""
|
|
|
70 |
storeUrl=""
|
|
|
71 |
if findStore:
|
|
|
72 |
storeUrl = sellerColumn.find('a')['href']
|
|
|
73 |
temp = sellerColumn.find('a')
|
|
|
74 |
store = temp.text
|
|
|
75 |
if len(store)==0:
|
|
|
76 |
dom_in = storeUrl.find("www.amazon.in")
|
|
|
77 |
if dom_in ==-1:
|
|
|
78 |
storeUrl="http://amazon.in"+storeUrl
|
|
|
79 |
if storeUrl[storeUrl.rfind('/')+1:]=='AF6E3O0VE0X4D':
|
|
|
80 |
store = 'Saholic'
|
|
|
81 |
if len(store)!=0:
|
|
|
82 |
tempMap['isStoreFront']='True'
|
|
|
83 |
else:
|
|
|
84 |
tempMap['isStoreFront']='False'
|
|
|
85 |
tempMap['storeUrl'] =storeUrl
|
|
|
86 |
asinind = x.url.index("offer-listing")
|
|
|
87 |
refind = x.url.index("/ref=olp_sort_ps")
|
|
|
88 |
asin = x.url[asinind+14:refind].strip()
|
|
|
89 |
sellerCount+=1
|
|
|
90 |
if sellerCount==1:
|
|
|
91 |
tempMap['sellerName'] = store.strip()
|
|
|
92 |
tempMap['sellerPrice'] = unitCost+shippingCost
|
|
|
93 |
if sellerCount==2:
|
|
|
94 |
tempMap['sellerName'] = store.strip()
|
|
|
95 |
tempMap['sellerPrice'] = unitCost+shippingCost
|
|
|
96 |
if sellerCount==3:
|
|
|
97 |
tempMap['sellerName'] = store.strip()
|
|
|
98 |
tempMap['sellerPrice'] = unitCost+shippingCost
|
|
|
99 |
info.append(tempMap)
|
| 12396 |
kshitij.so |
100 |
if sellerCount==3 or sellerCount==dataLength:
|
| 12363 |
kshitij.so |
101 |
returnMap[asin] = info
|
|
|
102 |
break
|
|
|
103 |
if findStore:
|
|
|
104 |
return self.findStoreFront(returnMap)
|
|
|
105 |
else:
|
|
|
106 |
return returnMap
|
|
|
107 |
|
|
|
108 |
def findStoreFront(self,returnMap):
|
|
|
109 |
storeFront={}
|
|
|
110 |
for arr in returnMap.itervalues():
|
|
|
111 |
for dic in arr:
|
|
|
112 |
if dic['isStoreFront']!='True':
|
|
|
113 |
storeFront[dic.get('storeUrl')] =''
|
|
|
114 |
rs = (grequests.get(u,stream=False) for u in storeFront.keys())
|
|
|
115 |
for x in grequests.map(rs):
|
|
|
116 |
soup = strip_tags(x.text,invalid_tags)
|
|
|
117 |
x.close
|
|
|
118 |
#print x.url.rfind('&me=')
|
|
|
119 |
#print x.url[x.url.rfind('&me='):].rfind('&')
|
|
|
120 |
mId= x.url[x.url.rfind('&me=')+4:x.url[x.url.rfind('&me='):].rfind('&')+x.url.rfind('&me=')]
|
|
|
121 |
sellerName = soup.title.string
|
|
|
122 |
#print mId
|
|
|
123 |
try:
|
|
|
124 |
ind = sellerName.index("@ Amazon.in")
|
|
|
125 |
sellerName = sellerName[0:ind].strip()
|
|
|
126 |
except:
|
|
|
127 |
try:
|
|
|
128 |
ind = sellerName.split(":")
|
|
|
129 |
sellerName = ind[1].strip()
|
|
|
130 |
except:
|
|
|
131 |
sellerName =""
|
|
|
132 |
#storeFront[re.compile('*'+mId+'.*')] = sellerName
|
|
|
133 |
#print mId
|
|
|
134 |
#print sellerName
|
|
|
135 |
myRe = re.compile('.*'+mId+'.*')
|
|
|
136 |
for key in storeFront:
|
|
|
137 |
if myRe.match(key):
|
|
|
138 |
#print "Match found ",key
|
|
|
139 |
storeFront[key] = sellerName.strip()
|
|
|
140 |
#storeFront.get(re.compile('.*'+mId+'.*'))
|
|
|
141 |
for arr in returnMap.itervalues():
|
|
|
142 |
#print "arr is ",arr
|
|
|
143 |
for dic in arr:
|
|
|
144 |
#print "dic ",dic
|
|
|
145 |
if dic['isStoreFront']!='True':
|
|
|
146 |
dic['sellerName'] =storeFront.get(dic.get('storeUrl'))
|
|
|
147 |
dic['isStoreFront']='True'
|
|
|
148 |
|
|
|
149 |
print "********"
|
|
|
150 |
return returnMap
|
|
|
151 |
|
|
|
152 |
|
|
|
153 |
# rs = (grequests.get(u,stream=False) for u in urls)
|
|
|
154 |
# for x in grequests.map(rs):
|
|
|
155 |
#return soup.title.string
|
|
|
156 |
|
|
|
157 |
|
|
|
158 |
if __name__ == '__main__':
|
|
|
159 |
urls=[]
|
| 12410 |
kshitij.so |
160 |
urls.append("http://amazon.in/gp/offer-listing/B007VZFZO8/ref=olp_sort_ps")
|
| 12363 |
kshitij.so |
161 |
# asin = []
|
|
|
162 |
# for a in amazonlisted:
|
|
|
163 |
# asin.append(a.asin)
|
|
|
164 |
# urls.append('http://www.amazon.in/gp/offer-listing/'+str(a.asin)+'/ref=olp_sort_ps')
|
|
|
165 |
# if len(urls)==50:
|
|
|
166 |
# break
|
|
|
167 |
print urls
|
|
|
168 |
scraper = AmazonAsyncScraper()
|
| 12396 |
kshitij.so |
169 |
'http://www.amazon.in/gp/offer-listing/B003SNIN9Q/ref=olp_sort_ps'
|
| 12363 |
kshitij.so |
170 |
print len(urls)
|
|
|
171 |
x = scraper.read(urls,True)
|
|
|
172 |
print x
|
|
|
173 |
print "##################"
|
|
|
174 |
# fetched = x.items()
|
|
|
175 |
# print list(set(asin) - set(fetched))
|
|
|
176 |
# for a,i in x.iteritems():
|
|
|
177 |
# print a
|
|
|
178 |
# for data in i:
|
|
|
179 |
# print data
|
|
|
180 |
# print "*********"
|
|
|
181 |
#print scraper.createData()
|
|
|
182 |
print datetime.datetime.now()
|