| 149 |
ashish |
1 |
'''
|
|
|
2 |
Created on 13-May-2010
|
|
|
3 |
|
|
|
4 |
@author: gaurav
|
|
|
5 |
'''
|
|
|
6 |
|
|
|
7 |
from scrapy.spider import BaseSpider
|
|
|
8 |
from scrapy.selector import HtmlXPathSelector
|
|
|
9 |
from scrapy.http import Request
|
|
|
10 |
|
|
|
11 |
from demo.items import DemoItem
|
|
|
12 |
from scrapy.contrib.spidermiddleware import referer
|
|
|
13 |
from scrapy.http.headers import Headers
|
|
|
14 |
from scrapy.http.request.form import FormRequest
|
|
|
15 |
from scrapy.log import msg
|
|
|
16 |
from scrapy.http.response import Response
|
|
|
17 |
|
| 228 |
ashish |
18 |
from datastore.DataAccessor import *
|
|
|
19 |
from datastore.DataCodeAccessor import *
|
|
|
20 |
from html2text.unescaping import *
|
|
|
21 |
from elixir import *
|
| 149 |
ashish |
22 |
|
|
|
23 |
class infi_spider(BaseSpider):
    """
    Spider for the Infibeam mobile-phone listing pages.

    Each listing page is parsed for phone names and quoted prices, which are
    handed to DataHelper.add_infiphone (the original notes name the target
    table as datastore_datadefinition_infibeam_data).
    All site-specific settings (domain name, URLs, XPaths, tokens stripped
    from prices) are looked up through get_code_word.
    """

    def __init__(self):
        """
        Set the spider's domain name and queue the listing pages to crawl.

        initialize_table is called first to make the datastore tables known
        before DataHelper is used. domain_name is the name by which this
        spider is known outside, i.e. the argument used to call it.

        The number of pages is not fixed; it is read from the stored
        'infibeam_count' value:
          * count <= 14: pages 1..count are queued;
          * count  > 14: only page number count itself is queued.
        """
        initialize_table()
        helper = DataHelper()

        domain = get_code_word("INFIBEAM_DOMAINNAME")
        # Parenthesised so the line prints the same on Python 2 and also
        # parses on Python 3.
        print(domain)
        self.domain_name = domain

        page_count = int(helper.get_extra_vars('infibeam_count'))
        first_page = 1
        if page_count > 14:
            first_page = page_count

        base_url = get_code_word("INFIBEAM_URL")
        # NOTE(review): start_urls is never assigned in this class, so the
        # list appended to is the one inherited from BaseSpider - if that is
        # a class-level list it is shared with other spiders; confirm.
        for page in range(first_page, page_count + 1):
            self.start_urls.append(base_url + str(page))

    def start_requests(self):
        """
        Register the supplier and build the initial requests.

        The supplier (domain name and home page) is recorded through
        DataHelper.add_supplier; the original notes name the table as
        datastore_datadefinition_suppliers. One Request is then created per
        entry of start_urls, with parse as callback, dont_filter=True and a
        default Referer header.

        @return a list of requests to be crawled by the spider
        """
        homepage = get_code_word("INFIBEAM_HOMEPAGE")
        DataHelper().add_supplier(self.domain_name, homepage)

        requests = []
        # Every request gets a Referer unless one is already present.
        referer_url = get_code_word("INFIBEAM_REFERER")
        for page_url in self.start_urls:
            request = Request(url=str(page_url), callback=self.parse,
                              dont_filter=True)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    @staticmethod
    def _strip_tokens(text, tokens):
        """Return text with every token removed and outer whitespace stripped."""
        for token in tokens:
            # Repeat until gone: removing one occurrence can join the
            # characters around it into a new occurrence.
            while token in text:
                text = text.replace(token, "")
        return text.strip()

    def parse(self, response):
        """
        Extract the name and price of every phone on a page and store them.

        @param response response of an individual listing-page request

        Settings used (all read through get_code_word):
          INFIBEAM_XPATH1     - section of each individual phone
          INFIBEAM_XPATH2     - name inside a phone section
          INFIBEAM_XPATH3     - quoted price inside a phone section
          INFIBEAM_VATPLUSTAX - added to the quoted price to get the final price
          INFIBEAM_REMOVELIST - ';'-separated tokens (e.g. ',' or 'Rs') to
                                filter out of the price text so that it can
                                be converted to an integer

        If the page yields no phone sections and the stored 'infibeam_count'
        is above 14, 'infibeam_fails' is incremented and, when the new value
        is positive, 'infibeam_flag' is set to 'FALSE'.

        Sections without a name or price node, and sections whose price is
        empty after filtering, are skipped. A price that is still
        non-numeric after filtering raises ValueError from int().
        """
        helper = DataHelper()
        vat_plus_tax = int(get_code_word("INFIBEAM_VATPLUSTAX"))

        # Empty tokens (e.g. produced by a trailing ';') are dropped: an
        # empty string is found in every string, so it would make the
        # removal loop in _strip_tokens spin forever.
        raw_tokens = str(get_code_word("INFIBEAM_REMOVELIST"))
        tokens = [token for token in raw_tokens.split(';') if token]
        for token in tokens:
            print(token)

        selector = HtmlXPathSelector(response)
        phone_sections = selector.select(get_code_word("INFIBEAM_XPATH1"))
        name_xpath = get_code_word("INFIBEAM_XPATH2")
        price_xpath = get_code_word("INFIBEAM_XPATH3")

        if not phone_sections:
            page_count = int(helper.get_extra_vars('infibeam_count'))
            if page_count > 14:
                fails = int(helper.get_extra_vars('infibeam_fails')) + 1
                if fails > 0:
                    helper.set_extra_vars('infibeam_flag', 'FALSE', '')
                helper.set_extra_vars('infibeam_fails', str(fails), '')
            return

        # Collect everything from the page first, then write to the store.
        phones = []
        for section in phone_sections:
            name_nodes = section.select(name_xpath)
            price_nodes = section.select(price_xpath)
            if not name_nodes or not price_nodes:
                # One incomplete listing must not abort the whole page
                # with an IndexError.
                continue
            phones.append((name_nodes[0].extract(), price_nodes[0].extract()))

        for name, quoted_price in phones:
            amount = self._strip_tokens(quoted_price, tokens)
            if not amount:
                # No usable price text; int('') would raise.
                continue
            final_price = int(amount) + vat_plus_tax
            helper.add_infiphone(name, amount, final_price)
|
|
|
144 |
|
| 149 |
ashish |
145 |
# Module-level spider instance, created at import time; constructing it runs
# infi_spider.__init__, including its datastore and code-word lookups.
# NOTE(review): presumably this SPIDER name is how the Scrapy version in use
# discovers the spider - confirm before renaming or removing.
SPIDER = infi_spider()
|