| 189 |
ashish |
1 |
'''
|
|
|
2 |
Created on 27-May-2010
|
|
|
3 |
|
|
|
4 |
@author: gaurav
|
|
|
5 |
'''
|
|
|
6 |
from scrapy.spider import BaseSpider
|
|
|
7 |
from scrapy.selector import HtmlXPathSelector
|
|
|
8 |
from scrapy.http import Request
|
|
|
9 |
|
|
|
10 |
from demo.items import DemoItem
|
|
|
11 |
from scrapy.contrib.spidermiddleware import referer
|
|
|
12 |
from scrapy.http.headers import Headers
|
|
|
13 |
from scrapy.http.request.form import FormRequest
|
|
|
14 |
from scrapy.log import msg
|
|
|
15 |
from scrapy.http.response import Response
|
|
|
16 |
from time import *
|
| 236 |
ashish |
17 |
from datastore.DataCodeAccessor import *
|
|
|
18 |
from datastore.DataAccessor import *
|
| 189 |
ashish |
19 |
|
|
|
20 |
import urllib
|
| 236 |
ashish |
21 |
from html2text.unescaping import *
|
| 189 |
ashish |
22 |
|
|
|
23 |
class naaptol_price(BaseSpider):
    """
    Documentation for class naaptol_price

    The urls collected by the previous spider for naaptol.com are
    redirected to get the data for individual phones.
    Some are of the form "http://www.naaptol.com/features/10417-Fly-E300.html"
    while others are of the form "http://www.naaptol.com/price/10417-Fly-E300.html".
    To make data extraction symmetric, this spider accomplishes 2 tasks:

    First, for the urls containing 'features' it collects the information for
    the individual phone (price range, online sellers, offline sellers) and
    stores it through DataHelper (table datastore_datadefinition_naaptol_phones
    according to the original notes).

    Second, for the urls containing 'price', a new url with the 'price' path
    segment replaced by 'features' is framed and stored through
    DataHelper.add_morenaaptolurl (table
    datastore_datadefinition_morenaaptol_urls according to the original notes).
    """

    def __init__(self):
        """
        Documentation for constructor

        initialize_table is called to make all the tables known in
        the scope of this class.
        The code word NAAPTOL_DOMAINNAME1 is the name by which this spider is
        known outside, so it is what is used as the argument for calling
        this spider.
        The start urls are read from the database via
        DataHelper.get_allnaaptolurls.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME1")

        # start_urls is never assigned in this class, so appending to
        # self.start_urls in place would mutate the list inherited from the
        # class and leak these urls into every other spider sharing it.
        # Work on an instance-local copy instead.
        urls = list(getattr(self, "start_urls", ()))
        for pitem in DataHelper().get_allnaaptolurls():
            urls.append(pitem.url.strip())
        self.start_urls = urls

    def start_requests(self):
        """
        Documentation for method start_requests

        Sets the Referer header (code word NAAPTOL_REFERER) on the request
        built for every start url.
        @return a list of well formed requests which will be
        crawled by spider; their responses are handed to parse
        """
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # setdefault: only fills in the referer when none is present
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def _split_code_list(self, raw):
        """
        Split a ';'-separated code word into a list of its non-empty tokens.
        @param raw value returned by get_code_word
        @return list of tokens; empty list for an empty code word
        """
        # Empty tokens (e.g. from a trailing ';') are dropped: an empty token
        # matches everywhere and would make token removal loop forever.
        return [token for token in str(raw).split(';') if token]

    def _remove_tokens(self, text, tokens):
        """
        Delete every occurrence of each token from text, e.g. 'Rs.' and ','
        so that only the numeric part of a price is left.
        @param text string to clean
        @param tokens iterable of substrings to delete
        @return the cleaned string
        """
        for token in tokens:
            # Repeated until stable: deleting one occurrence can splice a
            # new occurrence together from the surrounding characters.
            while token and token in text:
                text = text.replace(token, "")
        return text

    def _price_from_cell(self, cell_html, tokens):
        """
        Extract the integer price from the markup of one price cell.

        The text directly after the first tag is tried first; when that is
        not a number, the text after the following tag is used (the price is
        stored in a different format there, nested one tag deeper).
        @param cell_html serialized markup of the cell
        @param tokens substrings to strip before converting to int
        @return the price as int
        Raises ValueError when neither location holds a number.
        """
        tag_end = cell_html.find(">")
        text_end = cell_html.find("<", tag_end)
        text = self._remove_tokens(cell_html[tag_end + 1:text_end], tokens)
        text = urllib.unquote(text)
        try:
            return int(text)
        except ValueError:
            # stored in format different than previous one
            tag_end = cell_html.find(">", text_end)
            text_end = cell_html.find("<", tag_end)
            text = self._remove_tokens(cell_html[tag_end + 1:text_end], tokens)
            return int(urllib.unquote(text))

    def _seller_count(self, hxs, xpath_code):
        """
        Read the number of sellers from a section heading.
        @param hxs selector built from the response
        @param xpath_code name of the code word holding the heading xpath
        @return the leading number of the heading text, or 0 when the
        heading is missing or does not start with a number
        """
        try:
            heading = hxs.select(get_code_word(xpath_code)).extract()[0]
            # First whitespace-separated word; also correct when the heading
            # consists of the number alone.
            return int(heading.strip().split(None, 1)[0])
        except Exception:
            # Best effort: any failure here means "no sellers listed".
            return 0

    def _extract_price_range(self, hxs, tokens):
        """
        Build the price-range string for the phone.

        Xpath2 gives the (up to two) prices delimiting the range; when it
        yields nothing, Xpath3 is used and the value found between the first
        pair of single quotes of the script text is taken, marked "(approx)".
        @param hxs selector built from the response
        @param tokens substrings to strip from the prices
        @return "low", "low to high", or "" when no rate is available
        """
        cells = hxs.select(get_code_word("NAAPTOL_XPATH2"))
        try:
            texts = cells.extract()
        except Exception:
            texts = []
        low = high = ""
        if len(texts) > 0:
            low = texts[0].strip()
        if len(texts) > 1:
            high = texts[1].strip()

        if low == "" and high == "":
            try:
                script = str(hxs.select(get_code_word("NAAPTOL_XPATH3")).extract()[0])
                quote_open = script.find("'")
                quote_close = script.find("'", quote_open + 1, len(script))
                low = script[quote_open + 1:quote_close] + "(approx)"
            except Exception:
                # Best effort: no fallback price available either.
                low = ""

        # removelist is used for converting price to decimal format
        # containing only numbers and '.'
        if low != "":
            low = self._remove_tokens(low, tokens).strip()
        if high != "":
            high = self._remove_tokens(high, tokens).strip()

        if low == "Rates Not Available":
            low = high = ""

        price_range = low
        if high != "":
            price_range = str(price_range) + " to " + str(high)
        return price_range

    def _online_sellers(self, hxs, tokens):
        """
        Collect (name, price) for every online seller of the phone.

        Xpath4 gives the number of online sellers, Xpath5 their price cells;
        the name of seller number i is found at Xpath6 + str(i) + Xpath7.
        @param hxs selector built from the response
        @param tokens substrings to strip from the prices
        @return list of (seller name, int price) tuples
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH4")
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        sellers = []
        if count > 0:
            # Looked up once; only needed when there is at least one seller.
            name_xpath_head = get_code_word("NAAPTOL_XPATH6")
            name_xpath_tail = get_code_word("NAAPTOL_XPATH7")
        for index in range(count):
            price = self._price_from_cell(price_cells[index].extract(), tokens)
            name_xpath = name_xpath_head + str(index) + name_xpath_tail
            seller = hxs.select(name_xpath).extract()[0]
            seller = urllib.unquote(unescape(seller))
            sellers.append((seller, price))
        return sellers

    def _local_sellers(self, hxs, tokens):
        """
        Collect (name, price) for every offline (local) seller of the phone.

        Xpath8 gives the number of offline sellers, Xpath9 their price cells
        and Xpath10 their names.
        @param hxs selector built from the response
        @param tokens substrings to strip from the prices
        @return list of (seller name, int price) tuples
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH8")
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH9"))
        name_cells = hxs.select(get_code_word("NAAPTOL_XPATH10"))
        sellers = []
        for index in range(count):
            cell_html = price_cells[index].extract()
            seller = unescape(urllib.unquote(name_cells[index].extract()))
            sellers.append((seller, self._price_from_cell(cell_html, tokens)))
        return sellers

    def parse(self, response):
        """
        Documentation for method parse

        @param response of individual requests
        Using Xpaths needed information is extracted out of the response
        and added to the database. Nothing is returned.
        Xpath2 = Give us price-range for individual phone
        Xpath3 = Give us price-range for individual phone, if unable to retrieve from xpath2
        Xpath4 = Give us number of onlinesellers for a particular phone
        Xpath5 = Give us price for a particular phone offered by onlinesellers
        Xpath6 and Xpath7 = Give us name of onlinesellers for a particular phone
        Xpath8 = Give us number of offlinesellers for a particular phone
        Xpath9 = Give us price for a particular phone offered by offlinesellers
        Xpath10 = Give us name of offlinesellers for a particular phone
        Removelist = To filter the prices so as to make them integer, e.g. remove ',' or 'Rs'
        chklist2 = contains what needs to be replaced, presently it contains 'price'
        part = contains 'features'
        """
        # there are two different types of urls: one contains 'features' and
        # the other one contains 'price'; both have to be processed differently
        msg(response.url)
        site = unescape(response.url)
        last_slash = site.rfind("/")
        prev_slash = site.rfind("/", 0, last_slash - 1)
        # category = the path segment right before the page name
        catg = site[prev_slash + 1:last_slash]
        da = DataHelper()

        # both lists are stored as ';'-separated code words
        categories_to_replace = self._split_code_list(get_code_word("NAAPTOL_CHKLIST2"))
        features_part = get_code_word("NAAPTOL_PART")
        remove_tokens = self._split_code_list(get_code_word("NAAPTOL_REMOVELIST"))

        # change price to features and add to urls, as both provide the same
        # data but in different formats
        if catg in categories_to_replace:
            # Swap only the category segment; a plain str.replace would also
            # rewrite the same word inside the product name.
            da.add_morenaaptolurl(site[:prev_slash + 1] + features_part + site[last_slash:])

        # otherwise crawl the url containing features
        if catg != features_part:
            return

        # retrieving name from the url: drop the path, the 5-character
        # extension (".html") and the leading "<id>-" prefix
        name = unescape(str(response.url))
        name = name[name.rfind("/") + 1:len(name) - 5]
        name = name[name.find("-") + 1:]

        hxs = HtmlXPathSelector(response)

        price_range = self._extract_price_range(hxs, remove_tokens)
        da.add_new_naaptolphone(name, str(price_range))

        online_sellers = self._online_sellers(hxs, remove_tokens)
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in online_sellers:
            da.add_new_ntonlinesp(phone_id, seller, price)

        for seller, price in self._local_sellers(hxs, remove_tokens):
            da.add_new_ntofflinesp(phone_id, seller, price)
|
|
|
317 |
|
|
|
318 |
# Module-level spider instance; constructing it runs naaptol_price.__init__
# (table initialisation and loading of the start urls) when this module is imported.
# NOTE(review): presumably the legacy Scrapy spider loader discovers the spider
# through this SPIDER name - confirm against the Scrapy version in use.
SPIDER = naaptol_price()
|
|
|
319 |
|