Subversion Repositories SmartDukaan

Rev

Rev 262 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
189 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
from scrapy.spider import BaseSpider
7
from scrapy.selector import HtmlXPathSelector
8
from scrapy.http import Request
9
 
10
from demo.items import DemoItem
11
from scrapy.contrib.spidermiddleware import referer
12
from scrapy.http.headers import Headers
13
from scrapy.http.request.form import FormRequest
14
from scrapy.log import msg
15
from scrapy.http.response import Response
16
from time import *
236 ashish 17
from datastore.DataCodeAccessor import *
18
from datastore.DataAccessor import *
189 ashish 19
 
20
import urllib
236 ashish 21
from html2text.unescaping import *
189 ashish 22
 
23
class naaptol_price(BaseSpider):
    """Second-stage naaptol.com spider that stores per-phone price data.

    The urls collected by the previous naaptol.com spider are redirected to
    the pages of the individual phones.  Some are of the form
    "http://www.naaptol.com/features/10417-Fly-E300.html" while others are of
    the form "http://www.naaptol.com/price/10417-Fly-E300.html".

    To make data extraction symmetric this spider accomplishes two tasks:

    * for a url whose category segment is the 'features' word
      (code word NAAPTOL_PART) it extracts the price range plus the online
      and local seller prices of the phone and stores them through
      ``DataHelper`` (table datastore_datadefinition_naaptol_phones according
      to the original notes of this file);
    * for a url whose category segment is one of the check words
      (code word NAAPTOL_CHKLIST2, presently 'price') it frames a new url
      with that segment replaced by the 'features' word and stores it through
      ``DataHelper.add_morenaaptolurl`` (table
      datastore_datadefinition_morenaaptol_urls according to the original
      notes of this file).

    Every xpath and keyword is read at run time with ``get_code_word``.
    """

    def __init__(self):
        """Prepare the datastore and load the urls to crawl.

        ``initialize_table`` is called to make all the tables known in the
        scope of this class.  NAAPTOL_DOMAINNAME1 is the name by which this
        spider is known outside, so it is the argument used to call this
        spider.  The start urls are read from the datastore.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME1")

        # Build a list owned by this instance instead of appending to the
        # inherited ``start_urls`` object: the base __init__ is not called
        # here, so the inherited list may be shared with other spiders.
        urls = list(self.start_urls)
        for pitem in DataHelper().get_allnaaptolurls():
            urls.append(pitem.url.strip())
        self.start_urls = urls

    def start_requests(self):
        """Build the initial requests, each carrying a Referer header.

        @return a list with one Request per start url; the referer value is
        the code word NAAPTOL_REFERER and the callback is ``parse``.
        """
        referer_value = get_code_word("NAAPTOL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # setdefault keeps a Referer that is already present on the request
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _code_word_list(key):
        """Return the ';'-separated code word *key* as a list of non-empty tokens."""
        raw = str(get_code_word(key))
        # Empty tokens (empty value, doubled or trailing ';') are dropped:
        # an empty token can never be "removed" from a string and would make
        # the removal loop spin forever.
        return [token for token in raw.split(';') if token]

    @staticmethod
    def _remove_tokens(text, tokens):
        """Return *text* with every occurrence of each token removed."""
        for token in tokens:
            # Repeat until none is left: deleting one occurrence can join its
            # neighbours into a new occurrence of the same token.
            while token and text.find(token) != -1:
                text = text.replace(token, "")
        return text

    @staticmethod
    def _category_span(url):
        """Return (start, end) indexes of the path segment before the last '/'."""
        end = url.rfind("/")
        start = url.rfind("/", 0, end - 1) + 1
        return start, end

    @staticmethod
    def _phone_name(url):
        """Return the phone name encoded in the last path segment of *url*.

        For ".../features/10417-Fly-E300.html" this is "Fly-E300": the last
        five characters (".html") and everything up to the first '-' of the
        page name are dropped.
        """
        page = url[url.rfind("/") + 1:len(url) - 5]
        return page[page.find("-") + 1:]

    def _price_range(self, hxs, remove_tokens, page_url):
        """Return the price range text of the phone ("" when unavailable).

        NAAPTOL_XPATH2 gives the price-range for an individual phone (first
        match = lower price, second match = upper price).  NAAPTOL_XPATH3 is
        used when NAAPTOL_XPATH2 yields nothing: the price is then the text
        between the first pair of single quotes, marked with "(approx)".
        """
        found = hxs.select(get_code_word("NAAPTOL_XPATH2")).extract()
        low = high = ""
        if len(found) > 0:
            low = found[0].strip()
        if len(found) > 1:
            high = found[1].strip()

        if low == "" and high == "":
            try:
                fallback = hxs.select(get_code_word("NAAPTOL_XPATH3"))
                script = str(fallback.extract()[0])
                first_quote = script.find("'")
                second_quote = script.find("'", first_quote + 1, len(script))
                low = script[first_quote + 1:second_quote] + "(approx)"
            except IndexError:
                # the fallback xpath matched nothing: no price on this page
                low = high = ""
            except Exception:
                # best effort, as before, but no longer silent
                msg("naaptol_price: fallback price lookup failed for %s" % page_url)
                low = high = ""

        # the remove list turns e.g. "Rs. 1,200" into "1200"
        low = self._remove_tokens(low, remove_tokens).strip()
        high = self._remove_tokens(high, remove_tokens).strip()

        if low == "Rates Not Available":
            low = high = ""
        if high != "":
            return str(low) + " to " + str(high)
        return low

    @staticmethod
    def _seller_count(hxs, xpath_key):
        """Return the leading integer of the heading selected by *xpath_key*.

        The count is the text before the first space of the stripped heading.
        Returns 0 when the heading is missing or does not start with a number.
        """
        try:
            heading = hxs.select(get_code_word(xpath_key)).extract()[0]
            # partition keeps the whole text when there is no space, so a
            # bare "12" is read as 12 instead of losing its last digit
            return int(heading.strip().partition(" ")[0])
        except (IndexError, ValueError):
            return 0

    def _cell_price(self, cell_html, remove_tokens):
        """Return the integer price contained in the markup of a price cell.

        The text between the first '>' and the following '<' is tried first;
        when it is not an integer the price is stored in a different format
        and the text between the next '>' and '<' is used instead.

        @raise ValueError when neither text is an integer
        """
        start = cell_html.find(">")
        end = cell_html.find("<", start)
        try:
            return self._text_to_int(cell_html[start + 1:end], remove_tokens)
        except ValueError:
            start = cell_html.find(">", end)
            end = cell_html.find("<", start)
            return self._text_to_int(cell_html[start + 1:end], remove_tokens)

    def _text_to_int(self, text, remove_tokens):
        """Strip the remove-list tokens from *text*, url-unquote it and convert to int."""
        return int(urllib.unquote(self._remove_tokens(text, remove_tokens)))

    def _online_sellers(self, hxs, remove_tokens, page_url):
        """Return a list of (seller name, price) pairs for the online sellers.

        NAAPTOL_XPATH4 gives the number of online sellers, NAAPTOL_XPATH5 the
        price cells, and NAAPTOL_XPATH6 + <index> + NAAPTOL_XPATH7 the name of
        the seller at <index>.  A seller whose price or name cannot be read is
        logged and skipped.
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH4")
        cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        # never trust the heading more than the cells actually present
        limit = min(count, len(cells))
        if limit <= 0:
            return []

        name_prefix = get_code_word("NAAPTOL_XPATH6")
        name_suffix = get_code_word("NAAPTOL_XPATH7")
        sellers = []
        for index in range(limit):
            try:
                price = self._cell_price(cells[index].extract(), remove_tokens)
                name_path = name_prefix + str(index) + name_suffix
                seller = hxs.select(name_path).extract()[0]
            except (IndexError, ValueError):
                msg("naaptol_price: skipping unreadable online seller %d on %s"
                    % (index, page_url))
                continue
            sellers.append((urllib.unquote(unescape(seller)), price))
        return sellers

    def _local_sellers(self, hxs, remove_tokens, page_url):
        """Return a list of (seller name, price) pairs for the offline sellers.

        NAAPTOL_XPATH8 gives the number of offline sellers, NAAPTOL_XPATH9 the
        price cells and NAAPTOL_XPATH10 the seller names.  A seller whose
        price cannot be read is logged and skipped.
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH8")
        cells = hxs.select(get_code_word("NAAPTOL_XPATH9"))
        names = hxs.select(get_code_word("NAAPTOL_XPATH10"))

        sellers = []
        # bounded by what the page really contains, not only by the heading
        for index in range(min(count, len(cells), len(names))):
            try:
                price = self._cell_price(cells[index].extract(), remove_tokens)
            except ValueError:
                msg("naaptol_price: skipping unreadable local seller %d on %s"
                    % (index, page_url))
                continue
            seller = unescape(urllib.unquote(names[index].extract()))
            sellers.append((seller, price))
        return sellers

    def parse(self, response):
        """Extract the needed information from a response and store it.

        @param response of an individual request

        There are two different types of urls, one contains the 'features'
        word and the other one contains a check word ('price'); both provide
        the same data but in different formats.  A check-word url is only
        rewritten to its 'features' form and stored for a later crawl.  A
        'features' url is scraped: the phone with its price range is added to
        the database, followed by its online and its offline seller prices.

        Code words used: NAAPTOL_CHKLIST2 (what needs to be replaced),
        NAAPTOL_PART (the 'features' word), NAAPTOL_REMOVELIST (tokens such as
        ',' or 'Rs.' filtered out so prices become integers) and
        NAAPTOL_XPATH2 .. NAAPTOL_XPATH10 (see the helper methods).
        """
        msg(response.url)
        site = unescape(response.url)
        seg_start, seg_end = self._category_span(site)
        category = site[seg_start:seg_end]
        da = DataHelper()

        check_words = self._code_word_list("NAAPTOL_CHKLIST2")
        feature_word = get_code_word("NAAPTOL_PART")
        remove_tokens = self._code_word_list("NAAPTOL_REMOVELIST")

        if category in check_words:
            # swap only the category segment; a plain str.replace would also
            # rewrite the same word inside the host or the phone name
            da.add_morenaaptolurl(site[:seg_start] + feature_word + site[seg_end:])

        if category != feature_word:
            return

        page_url = str(response.url)
        name = self._phone_name(unescape(page_url))
        hxs = HtmlXPathSelector(response)

        price_range = self._price_range(hxs, remove_tokens, page_url)
        da.add_new_naaptolphone(name, str(price_range))

        online = self._online_sellers(hxs, remove_tokens, page_url)
        # id of the phone row stored above; seller rows are attached to it
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in online:
            da.add_new_ntonlinesp(phone_id, seller, price)

        for seller, price in self._local_sellers(hxs, remove_tokens, page_url):
            da.add_new_ntofflinesp(phone_id, seller, price)
317
 
318
# Module-level spider instance. Creating it runs naaptol_price.__init__,
# which initializes the tables and loads the start urls from the datastore.
# NOTE(review): presumably the Scrapy version in use discovers the spider
# through this SPIDER name - confirm before renaming or removing it.
SPIDER = naaptol_price()
319