| Line 21... |
Line 21... |
| 21 |
from datastore.DataCodeAccessor import *
|
21 |
from datastore.DataCodeAccessor import *
|
| 22 |
|
22 |
|
| 23 |
|
23 |
|
| 24 |
|
24 |
|
| 25 |
class indiaplaza_spider(BaseSpider):
|
25 |
class indiaplaza_spider(BaseSpider):
|
| - |
|
26 |
"""
|
| - |
|
27 |
Documentation for class indiaplaza_spider
|
| - |
|
28 |
This spider collects the URLs of the individual phones
|
| - |
|
29 |
and stores them in table datastore_datadefinition_indiaplaza_data.
|
| 26 |
|
30 |
"""
|
| 27 |
def __init__(self):
|
31 |
def __init__(self):
|
| - |
|
32 |
"""
|
| - |
|
33 |
Documentation for constructor
|
| - |
|
34 |
initialize_table is called to make all the tables known in
|
| - |
|
35 |
the scope of this class.
|
| - |
|
36 |
Also, the start URL needs to be fed to the spider through start_urls.append
|
| - |
|
37 |
Domainname is the name by which this spider is known outside
|
| - |
|
38 |
So this will be used as an argument for calling this spider
|
| - |
|
39 |
Since the number of pages is not fixed, Ct and NO are used to make it dynamic
|
| - |
|
40 |
"""
|
| 28 |
initialize_table()
|
41 |
initialize_table()
|
| 29 |
da = DataHelper()
|
42 |
da = DataHelper()
|
| 30 |
#INDIAPLAZA_DOMAINNAME = "indiaplaza"
|
43 |
#INDIAPLAZA_DOMAINNAME = "indiaplaza"
|
| 31 |
INDIAPLAZA_DOMAINNAME = get_code_word("INDIAPLAZA_DOMAINNAME")
|
44 |
INDIAPLAZA_DOMAINNAME = get_code_word("INDIAPLAZA_DOMAINNAME")
|
| 32 |
self.domain_name = INDIAPLAZA_DOMAINNAME
|
45 |
self.domain_name = INDIAPLAZA_DOMAINNAME
|
| Line 43... |
Line 56... |
| 43 |
url1 = INDIAPLAZA_URL + str(NO)
|
56 |
url1 = INDIAPLAZA_URL + str(NO)
|
| 44 |
self.start_urls.append(url1)
|
57 |
self.start_urls.append(url1)
|
| 45 |
NO=NO+1
|
58 |
NO=NO+1
|
| 46 |
|
59 |
|
| 47 |
def start_requests(self):
|
60 |
def start_requests(self):
|
| - |
|
61 |
"""
|
| - |
|
62 |
Documentation for method start_requests
|
| - |
|
63 |
To set various properties of the request to be made
|
| - |
|
64 |
like referer, headers and all.
|
| - |
|
65 |
Also, the supplier's entry needs to be made in the table
|
| - |
|
66 |
datastore_datadefinition_suppliers.
|
| - |
|
67 |
@return a list of well-formed requests which will be
|
| - |
|
68 |
crawled by the spider, and the spider will return the response
|
| - |
|
69 |
"""
|
| 48 |
#adding entry for the supplier i.e its name and site
|
70 |
#adding entry for the supplier i.e its name and site
|
| 49 |
#INDIAPLAZA_HOMEPAGE = "www.indiaplaza.com"
|
71 |
#INDIAPLAZA_HOMEPAGE = "www.indiaplaza.com"
|
| 50 |
INDIAPLAZA_HOMEPAGE = get_code_word("INDIAPLAZA_HOMEPAGE")
|
72 |
INDIAPLAZA_HOMEPAGE = get_code_word("INDIAPLAZA_HOMEPAGE")
|
| 51 |
da = DataHelper()
|
73 |
da = DataHelper()
|
| 52 |
da.add_supplier(self.domain_name, INDIAPLAZA_HOMEPAGE)
|
74 |
da.add_supplier(self.domain_name, INDIAPLAZA_HOMEPAGE)
|
| Line 60... |
Line 82... |
| 60 |
request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
|
82 |
request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
|
| 61 |
listreq.append(request)
|
83 |
listreq.append(request)
|
| 62 |
return listreq
|
84 |
return listreq
|
| 63 |
|
85 |
|
| 64 |
def parse(self, response):
|
86 |
def parse(self, response):
|
| - |
|
87 |
"""
|
| - |
|
88 |
Documentation for method parse
|
| - |
|
89 |
@param response of individual requests
|
| - |
|
90 |
Using XPaths, the needed information is extracted from the response
|
| - |
|
91 |
and added to the database
|
| - |
|
92 |
Xpath1 = Gives us the section for an individual phone
|
| - |
|
93 |
Xpath2 = Gives us the name of an individual phone
|
| - |
|
94 |
Xpath3 = Gives us the URL of an individual phone
|
| - |
|
95 |
Url1 = Used to get the full URL for individual phones
|
| - |
|
96 |
"""
|
| 65 |
da = DataHelper()
|
97 |
da = DataHelper()
|
| 66 |
#INDIAPLAZA_URL1 = "http://www.indiaplaza.in"
|
98 |
#INDIAPLAZA_URL1 = "http://www.indiaplaza.in"
|
| 67 |
INDIAPLAZA_URL1 = get_code_word("INDIAPLAZA_URL1")
|
99 |
INDIAPLAZA_URL1 = get_code_word("INDIAPLAZA_URL1")
|
| 68 |
hxs = HtmlXPathSelector(response)
|
100 |
hxs = HtmlXPathSelector(response)
|
| 69 |
#INDIAPLAZA_XPATH1 = '//tr/td/table[@id="browsesku"]'
|
101 |
#INDIAPLAZA_XPATH1 = '//tr/td/table[@id="browsesku"]'
|