| 11934 |
kshitij.so |
1 |
import urllib2
|
|
|
2 |
from BeautifulSoup import BeautifulSoup, NavigableString
|
|
|
3 |
import re
|
|
|
4 |
import sys
|
|
|
5 |
|
|
|
6 |
# Inline formatting tags that strip_tags() unwraps before scraping,
# so price/seller text is not fragmented by <b>/<i>/<u> markup.
invalid_tags = ['b', 'i', 'u']
# Accumulator for scraped offers (currently unused in this module).
bestSellers = []
|
|
|
8 |
|
|
|
9 |
def strip_tags(html, invalid_tags):
    """Parse *html* and unwrap every tag named in *invalid_tags*.

    Each invalid tag is replaced by the concatenation of its children's
    markup; nested invalid tags are handled by recursing on the child's
    serialized markup.  HTML entities are converted during parsing.

    Returns the resulting BeautifulSoup tree.
    """
    soup = BeautifulSoup(html,convertEntities=BeautifulSoup.HTML_ENTITIES)

    for tag in soup.findAll(True):
        if tag.name in invalid_tags:
            s = ""

            for c in tag.contents:
                if not isinstance(c, NavigableString):
                    # Child is itself a tag: strip invalid tags from its
                    # serialized markup before flattening.
                    c = strip_tags(unicode(c), invalid_tags)
                s += unicode(c)

            # Swap the invalid tag for its accumulated inner markup.
            tag.replaceWith(s)

    return soup
|
|
|
24 |
|
|
|
25 |
class AmazonScraper:
|
|
|
26 |
def __init__(self):
|
|
|
27 |
self.count_trials = 0
|
|
|
28 |
|
| 12256 |
kshitij.so |
29 |
def read(self, url, findStore):
|
| 11934 |
kshitij.so |
30 |
request = urllib2.Request(url)
|
|
|
31 |
request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
|
|
|
32 |
opener = urllib2.build_opener()
|
|
|
33 |
response_data = ""
|
| 12256 |
kshitij.so |
34 |
self.findStore = findStore
|
| 11934 |
kshitij.so |
35 |
try:
|
|
|
36 |
response_data = opener.open(request).read()
|
|
|
37 |
|
|
|
38 |
except urllib2.HTTPError as e:
|
|
|
39 |
print 'ERROR: ', e
|
|
|
40 |
print 'Retrying'
|
|
|
41 |
self.count_trials += 1
|
|
|
42 |
|
|
|
43 |
if self.count_trials < 3:
|
|
|
44 |
return self.read(url)
|
|
|
45 |
|
|
|
46 |
self.response_data=response_data
|
|
|
47 |
|
|
|
48 |
def createData(self):
|
|
|
49 |
self.soup = strip_tags(self.response_data,invalid_tags)
|
| 12197 |
kshitij.so |
50 |
self.response_data =None
|
| 11934 |
kshitij.so |
51 |
return self.scrape(self.soup)
|
|
|
52 |
|
|
|
53 |
|
|
|
54 |
def scrape(self,soup):
|
|
|
55 |
sellerData = soup.findAll("div" , {"class" : "a-row a-spacing-mini olpOffer"})
|
|
|
56 |
for data in sellerData:
|
|
|
57 |
print "sellerData****"
|
|
|
58 |
price = data.find('span', attrs={'class' : re.compile('.*olpOfferPrice*')}).find('span').text
|
|
|
59 |
print "Unit cost= ",float(price.replace("Rs.","").replace(",",""))
|
|
|
60 |
unitCost = float(price.replace("Rs.","").replace(",",""))
|
|
|
61 |
shippingCost = data.find('p', attrs={'class' : re.compile('.*olpShippingInfo*')}).find('span').text
|
|
|
62 |
if "FREE" in shippingCost:
|
|
|
63 |
print "shippingCost=0"
|
|
|
64 |
shippingCost = 0
|
|
|
65 |
else:
|
|
|
66 |
print "shippingCost= ",float(shippingCost.replace("+Rs.","").replace("Delivery",""))
|
|
|
67 |
shippingCost = float(shippingCost.replace("+Rs.","").replace("Delivery",""))
|
|
|
68 |
|
|
|
69 |
sellerColumn = data.find('p', attrs={'class' : re.compile('.*olpSellerName*')})
|
| 12256 |
kshitij.so |
70 |
store=""
|
|
|
71 |
if self.findStore:
|
|
|
72 |
print "Seller info ",sellerColumn
|
|
|
73 |
x = sellerColumn.find('a')['href']
|
|
|
74 |
print "&&&&"
|
|
|
75 |
storeUrl = x
|
|
|
76 |
store = self.findStoreFront(storeUrl)
|
|
|
77 |
try:
|
|
|
78 |
ind = store.index("@ Amazon.in")
|
|
|
79 |
store = store[0:ind].strip()
|
|
|
80 |
except:
|
|
|
81 |
try:
|
|
|
82 |
ind = store.split(":")
|
|
|
83 |
store = ind[1].strip()
|
|
|
84 |
except:
|
|
|
85 |
store =""
|
| 11934 |
kshitij.so |
86 |
ratingColumn = data.find('p', attrs={'class' : 'a-spacing-small'}).find('a').contents[0]
|
|
|
87 |
print "Rating info ",ratingColumn
|
|
|
88 |
print "***********************"
|
| 12256 |
kshitij.so |
89 |
return unitCost+shippingCost,store
|
| 11934 |
kshitij.so |
90 |
|
| 12256 |
kshitij.so |
91 |
def findStoreFront(self,storeUrl):
|
|
|
92 |
request = urllib2.Request(storeUrl)
|
|
|
93 |
request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.218 Safari/535.1')
|
|
|
94 |
opener = urllib2.build_opener()
|
|
|
95 |
response_data = ""
|
|
|
96 |
try:
|
|
|
97 |
response_data = opener.open(request).read()
|
| 11934 |
kshitij.so |
98 |
|
| 12256 |
kshitij.so |
99 |
except urllib2.HTTPError as e:
|
|
|
100 |
print 'ERROR: ', e
|
|
|
101 |
print 'Retrying'
|
|
|
102 |
self.count_trials += 1
|
|
|
103 |
|
|
|
104 |
if self.count_trials < 3:
|
|
|
105 |
return ""
|
|
|
106 |
soup = strip_tags(response_data,invalid_tags)
|
|
|
107 |
response_data =None
|
|
|
108 |
return soup.title.string
|
|
|
109 |
|
| 11934 |
kshitij.so |
110 |
|
|
|
111 |
if __name__ == '__main__':
|
|
|
112 |
scraper = AmazonScraper()
|
| 12256 |
kshitij.so |
113 |
scraper.read('http://www.amazon.in/gp/offer-listing/B001D0ROGO/ref=olp_sort_ps',True)
|
| 11934 |
kshitij.so |
114 |
print scraper.createData()
|
|
|
115 |
|