| Line 21... |
Line 21... |
| 21 |
from xml.dom import INDEX_SIZE_ERR
|
21 |
from xml.dom import INDEX_SIZE_ERR
|
| 22 |
from html2text.unescaping import *
|
22 |
from html2text.unescaping import *
|
| 23 |
|
23 |
|
| 24 |
|
24 |
|
| 25 |
class indiaplaza_extra(BaseSpider):
|
25 |
class indiaplaza_extra(BaseSpider):
|
| - |
|
26 |
"""
|
| - |
|
27 |
Documentation for class indiaplaza_extra
|
| - |
|
28 |
This spider collects all the information for the individual phones
|
| - |
|
29 |
and stores them in the table datastore_datadefinition_indiaplaza_items.
|
| 26 |
|
30 |
"""
|
| 27 |
def __init__(self):
|
31 |
def __init__(self):
|
| - |
|
32 |
"""
|
| - |
|
33 |
Documentation for constructor
|
| - |
|
34 |
initialize_table is called to make all the tables known in
|
| - |
|
35 |
the scope of this class.
|
| - |
|
36 |
Also, the start URLs need to be fed to the spider through start_urls.append
|
| - |
|
37 |
Domainname1 is the name by which this spider is known outside
|
| - |
|
38 |
So this will be used as an argument for calling this spider
|
| - |
|
39 |
"""
|
| 28 |
initialize_table()
|
40 |
initialize_table()
|
| 29 |
#INDIAPLAZA_DOMAINNAME1 = "indiaplaza1"
|
41 |
#INDIAPLAZA_DOMAINNAME1 = "indiaplaza1"
|
| 30 |
INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")
|
42 |
INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")
|
| 31 |
self.domain_name = INDIAPLAZA_DOMAINNAME1
|
43 |
self.domain_name = INDIAPLAZA_DOMAINNAME1
|
| 32 |
|
44 |
|
| Line 34... |
Line 46... |
| 34 |
da = DataHelper()
|
46 |
da = DataHelper()
|
| 35 |
for pitem in da.get_all_ipbasic():
|
47 |
for pitem in da.get_all_ipbasic():
|
| 36 |
self.start_urls.append(pitem.v_site.strip())
|
48 |
self.start_urls.append(pitem.v_site.strip())
|
| 37 |
|
49 |
|
| 38 |
def start_requests(self):
|
50 |
def start_requests(self):
|
| - |
|
51 |
"""
|
| - |
|
52 |
Documentation for method start_requests
|
| - |
|
53 |
To set various properties of the request to be made
|
| - |
|
54 |
like referer, headers and all.
|
| - |
|
55 |
@return a list of well formed requests which will be
|
| - |
|
56 |
crawled by spider and spider will return the response
|
| - |
|
57 |
"""
|
| 39 |
listreq = []
|
58 |
listreq = []
|
| 40 |
#for each request a referer has to be set
|
59 |
#for each request a referer has to be set
|
| 41 |
#INDIAPLAZA_REFERER = "www.google.com/search"
|
60 |
#INDIAPLAZA_REFERER = "www.google.com/search"
|
| 42 |
INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
|
61 |
INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
|
| 43 |
for url1 in self.start_urls:
|
62 |
for url1 in self.start_urls:
|
| Line 45... |
Line 64... |
| 45 |
request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
|
64 |
request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
|
| 46 |
listreq.append(request)
|
65 |
listreq.append(request)
|
| 47 |
return listreq
|
66 |
return listreq
|
| 48 |
|
67 |
|
| 49 |
def parse(self, response):
|
68 |
def parse(self, response):
|
| - |
|
69 |
"""
|
| - |
|
70 |
Documentation for method parse
|
| - |
|
71 |
@param response of individual requests
|
| - |
|
72 |
Using Xpaths needed information is extracted out of the response
|
| - |
|
73 |
and added to the database
|
| - |
|
74 |
Xpath4 = Give us name for individual phone
|
| - |
|
75 |
Xpath5 = Give us quoted-price for individual phone
|
| - |
|
76 |
Xpath6 = Give us ship-price for individual phone
|
| - |
|
77 |
Xpath7 = Give us ship-price for individual phone, if not obtainable from xpath6
|
| - |
|
78 |
Xpath8 = Give us guarantee-info for individual phone
|
| - |
|
79 |
Xpath9 = Give us guarantee-info for individual phone, if not obtainable from xpath8
|
| - |
|
80 |
Xpath10 = Give us ship-info for individual phone
|
| - |
|
81 |
Removelist = To filter the prices so as to make them integers, e.g. remove ',' or 'Rs'
|
| - |
|
82 |
"""
|
| 50 |
hxs = HtmlXPathSelector(response)
|
83 |
hxs = HtmlXPathSelector(response)
|
| 51 |
#INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
|
84 |
#INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
|
| 52 |
#List separated by ';'
|
85 |
#List separated by ';'
|
| 53 |
INDIAPLAZA_REMOVELIST = get_code_word("INDIAPLAZA_REMOVELIST")
|
86 |
INDIAPLAZA_REMOVELIST = get_code_word("INDIAPLAZA_REMOVELIST")
|
| 54 |
INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';')
|
87 |
INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';')
|