'''
Created on 19-Nov-2011

@author: Varun Gupta
'''
from BaseScraper import BaseScraper
from BeautifulSoup import BeautifulSoup

import json

class MySmartPrice(BaseScraper):
    '''Scraper for mysmartprice.com mobile phone price listings.

    Walks the brand price-list pages, collects per-phone page URLs, and
    extracts per-retailer price/availability data from each phone page.
    Only retailers in `source_whitelist` are kept.
    '''

    def __init__(self):
        # Landing page that links to every brand's price-list page.
        self.url_brand_list = 'http://www.mysmartprice.com/mobile/pricelist/'
        # Prices from retailers outside this whitelist are discarded.
        self.source_whitelist = ['adexmart', 'flipkart', 'homeshop18', 'infibeam', 'letsbuy', 'saholic']

    def getSourceName(self, url):
        '''Return the whitelisted retailer whose name appears in `url`, else None.'''
        for source in self.source_whitelist:
            if url.find(source) > -1:
                return source
        return None

    def getSaholicEntityId(self, map):
        '''Return the Saholic entity id parsed from the saholic URL in `map`.

        `map` is the dict produced by getPhonePrices.  Returns None when
        there is no saholic entry, or the phone is not available there.
        '''
        try:
            entry = map['saholic']
            # BUG FIX: unavailable entries store the placeholder string
            # 'Not Found' (not None) as their URL -- see getPhonePrices.
            # The old `is None` check let that fall through and returned
            # 'Found' (last hyphen-split token of 'Not Found') as an id.
            if not entry.get('is_available') or entry['url'] is None:
                return None
            # Saholic product URLs end in "...-<entity_id>".
            return entry['url'].split('-')[-1]
        except KeyError:
            return None

    def getBrandURLs(self):
        '''Scrape the landing page; return site-relative brand price-list URLs.'''
        urls = []
        html = BaseScraper.read(self, self.url_brand_list)
        soup = BeautifulSoup(html)
        # Brand links live in fixed-width cells of a nested table inside
        # the left-hand column of the page.
        for td in soup.find('div', {'class': 'msp_left'}).find('table').find('table').findAll('td', {'width': "300px"}):
            urls.append(str(td.find('a')['href']))
        return urls

    def getPhoneURLsForBrand(self, url_brand):
        '''Return URLs for every phone listed on one brand's price-list page.

        `url_brand` is site-relative, as returned by getBrandURLs.
        '''
        urls = []
        url_brand = 'http://www.mysmartprice.com' + url_brand
        html = BaseScraper.read(self, url_brand)
        soup = BeautifulSoup(html)
        for div in soup.findAll('div', {'class': 'item'}):
            a = div.find('a')
            # Some 'item' divs are placeholders with no anchor; skip them.
            if a is not None:
                urls.append(str(a['href']))
        return urls

    def getPhonePrices(self, url):
        '''Scrape one phone page; return {retailer: {'is_available', 'price', 'url'}}.

        Unavailable listings carry the placeholder 'Not Found' for both
        price and url.  Retailers outside the whitelist are omitted.
        '''
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        map = {}
        for div in soup.findAll('div', {'class': 'pt_row'}):
            # Retailer links are proxied through a redirect of the form
            # ".../go?url=<target>"; keep only the target.
            url = div.find('td', {'width': '140px'}).find('a')['href'].split('?url=')[-1].strip()
            td_price = div.find('td', {'width': '135px'})
            # The price cell wraps the amount in <b> only when the phone is
            # in stock; an out-of-stock cell holds plain text, so .string
            # is non-None exactly when the phone is unavailable.
            if td_price.string is None:
                is_available = True
                price = td_price.find('b').string.strip()
            else:
                is_available = False
            source = self.getSourceName(url)
            if source is not None:
                map[source] = {
                    'is_available': is_available,
                    'price': price if is_available else 'Not Found',
                    'url': url if is_available else 'Not Found'
                }
        return map
77 |
if __name__ == '__main__':
|
|
|
78 |
scraper = MySmartPrice()
|
|
|
79 |
# brand_urls = scraper.getBrands()
|
|
|
80 |
brand_urls = [
|
|
|
81 |
'/mobile/pricelist/nokia-mobile-price-list-in-india.html',
|
|
|
82 |
'/mobile/pricelist/samsung-mobile-price-list-in-india.html',
|
|
|
83 |
'/mobile/pricelist/blackberry-mobile-price-list-in-india.html',
|
|
|
84 |
'/mobile/pricelist/lg-mobile-price-list-in-india.html',
|
|
|
85 |
'/mobile/pricelist/sony-ericsson-mobile-price-list-in-india.html',
|
|
|
86 |
'/mobile/pricelist/micromax-mobile-price-list-in-india.html',
|
|
|
87 |
'/mobile/pricelist/motorola-mobile-price-list-in-india.html',
|
|
|
88 |
'/mobile/pricelist/htc-mobile-price-list-in-india.html',
|
|
|
89 |
'/mobile/pricelist/apple-mobile-price-list-in-india.html',
|
|
|
90 |
'/mobile/pricelist/spice-mobile-price-list-in-india.html',
|
|
|
91 |
'/mobile/pricelist/karbonn-mobile-price-list-in-india.html',
|
|
|
92 |
'/mobile/pricelist/lava-mobile-price-list-in-india.html']
|
|
|
93 |
phone_urls = []
|
|
|
94 |
|
|
|
95 |
for brand_url in brand_urls:
|
|
|
96 |
try:
|
|
|
97 |
print brand_url
|
|
|
98 |
phone_urls.extend(scraper.getPhoneURLsForBrand(brand_url))
|
|
|
99 |
except Exception as e:
|
|
|
100 |
print e
|
|
|
101 |
continue
|
|
|
102 |
|
|
|
103 |
print phone_urls.__len__()
|
|
|
104 |
|
|
|
105 |
for url in phone_urls:
|
|
|
106 |
print url
|
|
|
107 |
map = scraper.getPhonePrices(url)
|
|
|
108 |
saholic_id = scraper.getSaholicEntityId(map)
|
|
|
109 |
print map
|
|
|
110 |
print saholic_id
|
|
|
111 |
|
|
|
112 |
if saholic_id is not None:
|
| 5761 |
amar.kumar |
113 |
file_path = str("/usr/msp_dir/%s" % saholic_id)
|
| 4106 |
varun.gupt |
114 |
file_to_write = open(file_path, "w")
|
|
|
115 |
|
|
|
116 |
if file_to_write is None:
|
|
|
117 |
print 'File pointer is None'
|
|
|
118 |
else:
|
|
|
119 |
json.dump(map, file_to_write, indent = 4)
|