| Line 19... |
Line 19... |
| 19 |
|
19 |
|
| 20 |
import urllib
|
20 |
import urllib
|
| 21 |
from html2text.unescaping import *
|
21 |
from html2text.unescaping import *
|
| 22 |
|
22 |
|
| 23 |
class naaptol_price(BaseSpider):
|
23 |
class naaptol_price(BaseSpider):
|
| 24 |
|
24 |
"""
|
| - |
|
25 |
Documentation for class naaptol_price
|
| - |
|
26 |
The urls collected in the previous spider for naaptol.com
|
| - |
|
27 |
are redirected to get the data for individual phones.
|
| - |
|
28 |
Some are of the form "http://www.naaptol.com/features/10417-Fly-E300.html"
|
| - |
|
29 |
while others are of the form "http://www.naaptol.com/price/10417-Fly-E300.html".
|
| - |
|
30 |
So to make data extraction symmetric, this spider will accomplish 2 tasks
|
| - |
|
31 |
First, for the urls containing 'features', it collects the information for the
|
| - |
|
32 |
individual phones and stores them in the table datastore_datadefinition_naaptol_phones
|
| - |
|
33 |
Second, for the ones containing 'price' in the url, a new url having 'price' replaced
|
| - |
|
34 |
with 'features' is framed and stored in the table datastore_datadefinition_morenaaptol_urls.
|
| - |
|
35 |
"""
|
| 25 |
def __init__(self):
|
36 |
def __init__(self):
|
| - |
|
37 |
"""
|
| - |
|
38 |
Documentation for constructor
|
| - |
|
39 |
initialize_table is called to make all the tables known in
|
| - |
|
40 |
the scope of this class.
|
| - |
|
41 |
Also, the start urls need to be fed to the spider through start_urls.append
|
| - |
|
42 |
Domainname1 is the name by which this spider is known outside
|
| - |
|
43 |
So this will be used as an argument for calling this spider
|
| - |
|
44 |
"""
|
| 26 |
initialize_table()
|
45 |
initialize_table()
|
| 27 |
#NAAPTOL_DOMAINNAME1 = "naaptol1"
|
46 |
#NAAPTOL_DOMAINNAME1 = "naaptol1"
|
| 28 |
NAAPTOL_DOMAINNAME1 = get_code_word("NAAPTOL_DOMAINNAME1")
|
47 |
NAAPTOL_DOMAINNAME1 = get_code_word("NAAPTOL_DOMAINNAME1")
|
| 29 |
self.domain_name = NAAPTOL_DOMAINNAME1
|
48 |
self.domain_name = NAAPTOL_DOMAINNAME1
|
| 30 |
|
49 |
|
| Line 34... |
Line 53... |
| 34 |
#self.start_urls.append(url)
|
53 |
#self.start_urls.append(url)
|
| 35 |
for pitem in da.get_allnaaptolurls():
|
54 |
for pitem in da.get_allnaaptolurls():
|
| 36 |
self.start_urls.append(pitem.url.strip())
|
55 |
self.start_urls.append(pitem.url.strip())
|
| 37 |
|
56 |
|
| 38 |
def start_requests(self):
|
57 |
def start_requests(self):
|
| - |
|
58 |
"""
|
| - |
|
59 |
Documentation for method start_requests
|
| - |
|
60 |
To set various properties of the request to be made
|
| - |
|
61 |
like referer, headers and all.
|
| - |
|
62 |
@return a list of well formed requests which will be
|
| - |
|
63 |
crawled by spider and spider will return the response
|
| - |
|
64 |
"""
|
| 39 |
#for each request a referer has to be set
|
65 |
#for each request a referer has to be set
|
| 40 |
listreq = []
|
66 |
listreq = []
|
| 41 |
#NAAPTOL_REFERER = "http://www.google.com"
|
67 |
#NAAPTOL_REFERER = "http://www.google.com"
|
| 42 |
NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
|
68 |
NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
|
| 43 |
for url1 in self.start_urls:
|
69 |
for url1 in self.start_urls:
|
| Line 45... |
Line 71... |
| 45 |
request.headers.setdefault("Referer", NAAPTOL_REFERER)
|
71 |
request.headers.setdefault("Referer", NAAPTOL_REFERER)
|
| 46 |
listreq.append(request)
|
72 |
listreq.append(request)
|
| 47 |
return listreq
|
73 |
return listreq
|
| 48 |
|
74 |
|
| 49 |
def parse(self, response):
|
75 |
def parse(self, response):
|
| - |
|
76 |
"""
|
| - |
|
77 |
Documentation for method parse
|
| - |
|
78 |
@param response of individual requests
|
| - |
|
79 |
Using Xpaths needed information is extracted out of the response
|
| - |
|
80 |
and added to the database
|
| - |
|
81 |
Xpath2 = Give us price-range for individual phone
|
| - |
|
82 |
Xpath3 = Give us price-range for individual phone, if unable to retrieve from xpath2
|
| - |
|
83 |
Xpath4 = Give us number of onlinesellers for a particular phone
|
| - |
|
84 |
Xpath5 = Give us price for a particular phone offered by onlinesellers
|
| - |
|
85 |
Xpath6 and Xpath7 = Give us name of onlinesellers for a particular phone
|
| - |
|
86 |
Xpath8 = Give us number of offlinesellers for a particular phone
|
| - |
|
87 |
Xpath9 = Give us price for a particular phone offered by offlinesellers
|
| - |
|
88 |
Xpath10 = Give us name of offlinesellers for a particular phone
|
| - |
|
89 |
Removelist = To filter the prices so as to make them integers, e.g. remove ',' or 'Rs'
|
| - |
|
90 |
chklist2 = contains what needs to be replaced; presently it contains 'price'
|
| - |
|
91 |
part = contains 'features'
|
| - |
|
92 |
"""
|
| 50 |
# there are two different type of urls one contains feature and other one contains price
|
93 |
# there are two different type of urls one contains feature and other one contains price
|
| 51 |
#both have to be processed differently
|
94 |
#both have to be processed differently
|
| 52 |
msg(response.url)
|
95 |
msg(response.url)
|
| 53 |
site = response.url
|
96 |
site = response.url
|
| 54 |
site = unescape(site)
|
97 |
site = unescape(site)
|
| Line 87... |
Line 130... |
| 87 |
#NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
|
130 |
#NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
|
| 88 |
NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
|
131 |
NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
|
| 89 |
prices = hxs.select(NAAPTOL_XPATH2)
|
132 |
prices = hxs.select(NAAPTOL_XPATH2)
|
| 90 |
try:
|
133 |
try:
|
| 91 |
price1 = prices.extract()[0]
|
134 |
price1 = prices.extract()[0]
|
| 92 |
#price1 = price1.decode("utf-8")
|
- |
|
| 93 |
price1 = price1.strip()
|
135 |
price1 = price1.strip()
|
| 94 |
except:
|
136 |
except:
|
| 95 |
price1 = ""
|
137 |
price1 = ""
|
| 96 |
|
138 |
|
| 97 |
try:
|
139 |
try:
|
| 98 |
price2 = prices.extract()[1]
|
140 |
price2 = prices.extract()[1]
|
| 99 |
#price2 = price2.decode("utf-8")
|
- |
|
| 100 |
price2 = price2.strip()
|
141 |
price2 = price2.strip()
|
| 101 |
except:
|
142 |
except:
|
| 102 |
price2 = ""
|
143 |
price2 = ""
|
| 103 |
try:
|
144 |
try:
|
| 104 |
if price1 == "" and price2 == "":
|
145 |
if price1 == "" and price2 == "":
|