| 189 |
ashish |
1 |
'''
Created on 27-May-2010

@author: gaurav
'''
|
|
|
6 |
from scrapy.spider import BaseSpider
|
|
|
7 |
from scrapy.selector import HtmlXPathSelector
|
|
|
8 |
from scrapy.http import Request
|
|
|
9 |
|
|
|
10 |
from demo.items import DemoItem
|
|
|
11 |
from scrapy.contrib.spidermiddleware import referer
|
|
|
12 |
from scrapy.http.headers import Headers
|
|
|
13 |
from scrapy.http.request.form import FormRequest
|
|
|
14 |
from scrapy.log import msg
|
|
|
15 |
from scrapy.http.response import Response
|
|
|
16 |
from time import *
|
|
|
17 |
|
|
|
18 |
from datastore import DataAccessor
|
|
|
19 |
from datastore.DataAccessor import DataHelper
|
|
|
20 |
import urllib
|
|
|
21 |
|
|
|
22 |
class naaptol_price(BaseSpider):
    """Spider that scrapes phone prices and seller listings from naaptol.com.

    Start URLs are read from the datastore via
    ``DataHelper.get_allnaaptolurls``.  ``parse`` handles two kinds of page:

    * URLs whose last directory segment is ``price`` are not scraped; the
      matching ``features`` URL is registered through
      ``DataHelper.add_morenaaptolurl``.
    * Any other URL is scraped for the product's price range and for its
      online and local seller lists, which are stored through ``DataHelper``.
    """

    # Debug dump of the most recently parsed page body.
    # NOTE(review): machine-specific absolute path -- consider making it
    # configurable.
    RESPONSE_DUMP_PATH = '/home/gaurav/Desktop/response.txt'

    PRICE_XPATH = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
    PRICE_SCRIPT_XPATH = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()'

    def __init__(self):
        """Set the spider's domain name and load start URLs from the datastore."""
        self.domain_name = "naaptolphones"

        # ``self.start_urls`` may resolve to a list shared at class level;
        # copy it so the appends below stay on this instance only.
        self.start_urls = list(getattr(self, "start_urls", ()))

        da = DataHelper()
        for pitem in da.get_allnaaptolurls():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
        """Return one ``Request`` per start URL, handled by ``parse``.

        Each request gets a ``Referer`` header of ``www.naaptol.com`` unless
        one is already set.
        """
        requests = []
        for url in self.start_urls:
            request = Request(url=url, callback=self.parse)
            request.headers.setdefault("Referer", "www.naaptol.com")
            requests.append(request)
        return requests

    def parse(self, response):
        """Process one fetched page and persist what was found.

        For a ``.../price/<page>`` URL only the corresponding
        ``.../features/<page>`` URL is recorded.  For every other URL the
        body is dumped to ``RESPONSE_DUMP_PATH``, the price range is stored
        with ``add_new_naaptolphone`` and each online / local seller is
        stored with ``add_new_ntonlinesp`` / ``add_new_ntofflinesp``.

        Returns ``None``.  Raises ``ValueError`` when a seller price cell
        holds no parseable integer and ``IndexError`` when an online seller's
        name element is missing.
        """
        site = response.url
        last_slash = site.rfind("/")
        prev_slash = site.rfind("/", 0, last_slash - 1)
        category = site[prev_slash + 1:last_slash]
        da = DataHelper()

        if category == "price":
            # Swap only the category segment; a blanket str.replace would
            # also rewrite "price" occurring in the host or product name.
            features_url = site[:prev_slash + 1] + "features" + site[last_slash:]
            da.add_morenaaptolurl(features_url)
            return

        self._dump_body(response.body)
        print(" url " + response.url)

        # Product name = last path component minus its final five characters
        # (presumably the ".html" suffix -- confirm against the crawled URLs).
        name = str(response.url)
        name = name[name.rfind("/") + 1:len(name) - 5]

        hxs = HtmlXPathSelector(response)
        price1, price2 = self._extract_prices(hxs)
        print(name)
        print(price1)
        print(price2)
        print("\n")

        price_range = price1
        if price2 != "":
            price_range = str(price1) + " to " + str(price2)
        da.add_new_naaptolphone(name, price_range)

        online_names, online_prices = self._collect_online_sellers(hxs)
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in zip(online_names, online_prices):
            da.add_new_ntonlinesp(phone_id, seller, price)

        local_names, local_prices = self._collect_local_sellers(hxs)
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in zip(local_names, local_prices):
            da.add_new_ntofflinesp(phone_id, seller, price)

    def _dump_body(self, body):
        """Write *body* to the debug dump file, always closing the handle."""
        dump = open(self.RESPONSE_DUMP_PATH, 'w')
        try:
            dump.write(body)
        finally:
            dump.close()

    def _nth_text(self, nodes, index):
        """Return the stripped text of ``nodes`` at *index*, or "" on failure."""
        try:
            text = nodes.extract()[index]
            # Kept from the original; any error here (missing node, encoding
            # problem) deliberately falls back to an empty string.
            text = text.decode("utf-8")
            return text.strip()
        except Exception:
            return ""

    def _clean_price(self, text):
        """Drop the "Rs." marker and thousands separators from a price string."""
        return text.replace("Rs.", "").replace(",", "").strip()

    def _extract_prices(self, hxs):
        """Return ``(low, high)`` price strings for the product page.

        When neither price is present as plain text, the first
        single-quoted value inside the price cell's ``<script>`` is used as
        the low price with "(approx)" appended.  Both values are "" when
        nothing usable is found or the page says "Rates Not Available".
        """
        nodes = hxs.select(self.PRICE_XPATH)
        price1 = self._nth_text(nodes, 0)
        price2 = self._nth_text(nodes, 1)

        if price1 == "" and price2 == "":
            try:
                script = str(hxs.select(self.PRICE_SCRIPT_XPATH).extract()[0])
                open_quote = script.find("'")
                close_quote = script.find("'", open_quote + 1, len(script))
                price1 = script[open_quote + 1:close_quote] + "(approx)"
            except Exception:
                price1 = ""
            price2 = ""

        price1 = self._clean_price(price1)
        price2 = self._clean_price(price2)
        if price1 == "Rates Not Available":
            price1 = price2 = ""
        return price1, price2

    def _seller_count(self, hxs, section_id):
        """Return the leading integer of a seller section heading, or 0.

        *section_id* is the ``id`` of the enclosing ``<div>``
        ("OnlineSellers" or "LocalStores").
        """
        xpath = ('//div[@id="%s"]//div[@class="ProductResultHead"]'
                 '//div[@class="headingstyle"]/text()' % section_id)
        try:
            heading = str(hxs.select(xpath).extract()[0])
            heading = heading.decode("utf-8").strip()
            # partition keeps the whole text when there is no space, instead
            # of silently dropping its last character.
            return int(heading.partition(" ")[0])
        except Exception:
            return 0

    def _price_fragment(self, html, start, end):
        """Return ``html[start+1:end]`` with price decoration removed."""
        text = html[start + 1:end]
        text = text.replace("Rs.", "")
        text = text.replace(",", "")
        return urllib.unquote(text)

    def _parse_price_cell(self, html):
        """Return the integer price contained in a ``<td class="price">``.

        The text directly after the cell's opening tag is tried first; if it
        is not an integer, the text after the next tag (the first child
        element) is used.  Raises ``ValueError`` if that fails as well.
        """
        start = html.find(">")
        end = html.find("<", start)
        text = self._price_fragment(html, start, end)
        try:
            return int(text)
        except ValueError:
            start = html.find(">", end)
            end = html.find("<", start)
            return int(self._price_fragment(html, start, end))

    def _collect_online_sellers(self, hxs):
        """Return ``(names, prices)`` for the online sellers on the page."""
        count = self._seller_count(hxs, "OnlineSellers")
        cells = hxs.select('//div[@id="onSellerContents"]//td[@class="price"]')
        names = []
        prices = []
        # The heading count is only advisory: never index past the rows
        # that are actually present.
        for i in range(min(count, len(cells))):
            price = self._parse_price_cell(cells[i].extract())
            print(price)

            name_xpath = ('//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'
                          + str(i)
                          + '"]/span/text()')
            seller = urllib.unquote(hxs.select(name_xpath).extract()[0])
            print(seller)

            prices.append(price)
            names.append(seller)
        return names, prices

    def _collect_local_sellers(self, hxs):
        """Return ``(names, prices)`` for the local stores on the page."""
        count = self._seller_count(hxs, "LocalStores")
        cells = hxs.select('//div[@id="offSellerContents"]//td[@class="price"]')
        labels = hxs.select('//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()')
        names = []
        prices = []
        # Bound by what the page really contains, not just the heading count.
        for i in range(min(count, len(cells), len(labels))):
            cell_html = cells[i].extract()
            seller = urllib.unquote(labels[i].extract())
            price = self._parse_price_cell(cell_html)
            print(price)
            print(seller)

            prices.append(price)
            names.append(seller)
        return names, prices
|
|
|
281 |
# Module-level spider instance, created when this module is imported; the
# constructor queries the datastore for the start URLs.
# NOTE(review): presumably this is how the Scrapy version in use discovers
# spiders -- confirm before renaming or removing.
SPIDER = naaptol_price()
|
|
|
282 |
|