| 15867 |
kshitij.so |
1 |
import urllib2
|
|
|
2 |
from BeautifulSoup import BeautifulSoup, NavigableString
|
| 16004 |
kshitij.so |
3 |
from dtr.utils.utils import fetchResponseUsingProxy, removePriceFormatting, \
|
|
|
4 |
transformUrl
|
| 15867 |
kshitij.so |
5 |
import re
|
| 16100 |
kshitij.so |
6 |
import cssutils
|
| 15867 |
kshitij.so |
7 |
|
|
|
8 |
# Names of the inline formatting tags that strip_tags() unwraps, keeping
# only their contents.
invalid_tags = ['b', 'i', 'u']

# Module-level list; nothing in the code visible here reads or fills it.
bestSellers = []

# HTTP request headers passed to fetchResponseUsingProxy() for every page
# fetch. The User-Agent string identifies as Chrome on an Android Nexus 7.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.3; Nexus 7 Build/JSS15Q) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.72 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Connection': 'keep-alive',
    'Accept-Encoding': 'gzip,deflate,sdch',
}
|
|
|
19 |
|
|
|
20 |
def strip_tags(html, invalid_tags):
    """Parse ``html`` and unwrap every tag whose name is in ``invalid_tags``.

    Each matching tag is replaced by the text of its children, so the
    markup of the tag disappears while its content stays in place. Child
    tags are run through strip_tags() again before being flattened.

    html         -- markup to parse.
    invalid_tags -- collection of tag names to unwrap.

    Returns the parsed BeautifulSoup tree (not a string).
    """
    tree = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)

    for node in tree.findAll(True):
        if node.name not in invalid_tags:
            continue

        pieces = []
        for child in node.contents:
            if not isinstance(child, NavigableString):
                # Nested tag: clean it recursively before flattening it.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))

        node.replaceWith("".join(pieces))

    return tree
|
|
|
35 |
|
|
|
36 |
class ShopCluesScraper:
    """Download a ShopClues product page and extract its pricing details.

    Typical use is ``ShopCluesScraper().read(url)``, which fetches the page,
    parses it and returns the dictionary built by scrape().
    """

    # Maximum number of download attempts made by a single read() call.
    MAX_TRIALS = 5

    def __init__(self, livePricing=None, findThumbnail=None):
        """Store the scraper options.

        livePricing   -- kept on the instance; not read by the code in this
                         class.
        findThumbnail -- when truthy, scrape() also extracts the thumbnail
                         URL from the page.
        """
        self.count_trials = 0
        self.livePricing = livePricing
        self.findThumbnail = findThumbnail

    def read(self, url):
        """Fetch ``url`` and return the dictionary produced by scrape().

        The URL is rewritten with ``transformUrl(url, 5)`` exactly once and
        then fetched up to MAX_TRIALS times. ``count_trials`` holds the
        number of failed attempts of the most recent call.

        If every attempt fails, parsing continues with an empty document,
        which makes scrape() raise because the expected elements are absent.
        """
        # Transform once, outside the retry loop, so a retry never feeds an
        # already-transformed URL back into transformUrl().
        # NOTE(review): 5 is presumably the ShopClues source id -- confirm
        # against dtr.utils.utils.
        url = transformUrl(url, 5)

        # Every call gets its own retry budget; otherwise failures from
        # earlier calls would permanently disable retries on this instance.
        self.count_trials = 0
        response_data = ""

        while True:
            print(url)
            try:
                response_data = fetchResponseUsingProxy(url, headers=headers, proxy=False)
                break
            except Exception as e:
                # Joined with a single space so the line reads 'ERROR:  <e>'.
                print(' '.join(('ERROR: ', str(e))))
                print('Retrying')
                self.count_trials += 1
                if self.count_trials >= self.MAX_TRIALS:
                    break

        self.response_data = response_data
        return self.createData()

    def createData(self):
        """Parse ``self.response_data`` into ``self.soup`` and scrape it."""
        self.soup = strip_tags(self.response_data, invalid_tags)
        # The raw markup is not needed once the parsed tree exists.
        self.response_data = None
        return self.scrape()

    def scrape(self):
        """Extract the product details from ``self.soup``.

        Returns a dict with the keys ``scin``, ``price``, ``inStock``,
        ``isCod``, ``coupon`` and ``thumbnail``.

        Raises whatever the lookups raise (TypeError / AttributeError) when
        the name, price or stock elements are missing from the page; only
        the thumbnail and coupon lookups are best-effort.
        """
        scin = self.soup.find('div', {'class': 'pd_name clearfix'})['data-id']
        thumbnailUrl = self._find_thumbnail()

        price_box = self.soup.find('div', {'class': 'pd-price-cont clearfix'})
        price = self._find_price(price_box)

        stock_text = price_box.find('div', {'class': re.compile('stock.*')}).string
        inStock = 0 if stock_text.strip().upper() == 'OUT OF STOCK' else 1

        isCod = 0 if price_box.find('li', {'id': 'iscod'}) is None else 1
        coupon = self._find_coupon(price_box)

        return {'scin': scin, 'price': price, 'inStock': inStock, 'isCod': isCod, 'coupon': coupon, "thumbnail": thumbnailUrl}

    def _find_thumbnail(self):
        """Return the thumbnail URL, or "" when disabled or not parseable."""
        if not self.findThumbnail:
            return ""
        try:
            style_attr = self.soup.find('div', {'class': 'pd-image'})['style']
            background = cssutils.parseStyle(style_attr)['background']
            # Keep the text between the first '(' and the last ')'.
            return background[background.index('(') + 1:background.rfind(')')]
        except Exception:
            # Best effort: a missing or malformed thumbnail is not fatal.
            return ""

    def _find_price(self, price_box):
        """Return the price from 'thirdPrice', falling back to 'sellingPrice'."""
        try:
            return self._price_from_span(price_box, 'thirdPrice')
        except Exception:
            return self._price_from_span(price_box, 'sellingPrice')

    def _price_from_span(self, price_box, span_id):
        """Return the float price held in the span nested under ``span_id``."""
        amount = price_box.find('span', {'id': span_id}).find('span').string
        return float(removePriceFormatting(amount))

    def _find_coupon(self, price_box):
        """Return the text of the second span of the info box, else ""."""
        try:
            info = price_box.find('div', {'class': 'info clearfix'})
            if info:
                return info.findAll('span')[1].string
        except Exception:
            print("Unable to parse coupon code")
        return ""
|
| 15867 |
kshitij.so |
94 |
|
|
|
95 |
|
|
|
96 |
|
|
|
97 |
if __name__ == '__main__':
    # Manual smoke test: scrape one product page and print the result,
    # bracketed by timestamps so the elapsed time can be read off.
    import datetime

    started_at = datetime.datetime.now()
    print(started_at)

    scraper = ShopCluesScraper()
    result = scraper.read('http://shopclues.com/samsung-galaxy-note-4-13.html')
    print(result)

    finished_at = datetime.datetime.now()
    print(finished_at)
|