Subversion Repositories SmartDukaan

Rev

Rev 236 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
189 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
from scrapy.spider import BaseSpider
7
from scrapy.selector import HtmlXPathSelector
8
from scrapy.http import Request
9
 
10
from demo.items import DemoItem
11
from scrapy.contrib.spidermiddleware import referer
12
from scrapy.http.headers import Headers
13
from scrapy.http.request.form import FormRequest
14
from scrapy.log import msg
15
from scrapy.http.response import Response
16
from time import *
236 ashish 17
from datastore.DataCodeAccessor import *
18
from datastore.DataAccessor import *
189 ashish 19
 
20
import urllib
236 ashish 21
from html2text.unescaping import *
189 ashish 22
 
23
class naaptol_price(BaseSpider):
    """
    Second-stage naaptol.com spider: stores price data for individual phones.

    The urls collected by the previous naaptol.com spider come in two forms,
    "http://www.naaptol.com/features/10417-Fly-E300.html" and
    "http://www.naaptol.com/price/10417-Fly-E300.html".
    To make data extraction symmetric this spider accomplishes two tasks:

    * for urls whose category segment equals the NAAPTOL_PART code word
      ('features') it extracts the phone's price range and seller offers and
      stores them (the original notes name the table
      datastore_datadefinition_naaptol_phones);
    * for urls whose category segment is listed in NAAPTOL_CHKLIST2 ('price')
      a new url with that segment replaced by 'features' is built and stored
      (table datastore_datadefinition_morenaaptol_urls per the original
      notes).
    """

    def __init__(self):
        """
        Prepare the datastore and load the urls to crawl.

        initialize_table is called to make all the tables known in the scope
        of this class.  The NAAPTOL_DOMAINNAME1 code word is the name by which
        this spider is known outside, i.e. the argument used to run it.
        Every url returned by DataHelper.get_allnaaptolurls() is added to
        start_urls.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME1")

        da = DataHelper()
        # start_urls is inherited; copy it first so that appending never
        # mutates a list object shared with the base class or other spiders.
        self.start_urls = list(self.start_urls)
        for pitem in da.get_allnaaptolurls():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
        """
        Build the initial requests, one per entry of start_urls.

        Each request is routed to parse() and gets the NAAPTOL_REFERER code
        word as its "Referer" header unless that header is already set.

        @return a list of Request objects to be crawled by the spider
        """
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Process one downloaded page.

        @param response response of an individual request

        Code words used (directly or through the helpers below):
        NAAPTOL_CHKLIST2   = ';'-separated category segments that must be
                             replaced, presently 'price'
        NAAPTOL_PART       = the replacement segment, 'features'
        NAAPTOL_REMOVELIST = ';'-separated tokens filtered out of prices so
                             that they become integers, e.g. ',' or 'Rs.'
        NAAPTOL_XPATH2     = price range of the phone
        NAAPTOL_XPATH3     = price range, used if XPATH2 yields nothing
        NAAPTOL_XPATH4     = number of online sellers
        NAAPTOL_XPATH5     = prices offered by online sellers
        NAAPTOL_XPATH6/7   = names of online sellers
        NAAPTOL_XPATH8     = number of offline (local) sellers
        NAAPTOL_XPATH9     = prices offered by offline sellers
        NAAPTOL_XPATH10    = names of offline sellers

        @return None; all results are written through DataHelper
        """
        msg(response.url)
        site = unescape(response.url)
        # The category is the path segment just before the last one.
        sp1 = site.rfind("/")
        sp2 = site.rfind("/", 0, sp1 - 1)
        catg = site[sp2 + 1:sp1]
        da = DataHelper()

        check_list = get_code_word("NAAPTOL_CHKLIST2").split(';')
        part = get_code_word("NAAPTOL_PART")
        remove_list = get_code_word("NAAPTOL_REMOVELIST").split(';')

        # A 'price' url is not parsed; its 'features' twin is queued instead.
        # Only the category segment is swapped so that the same word inside
        # the product slug is left alone.
        if catg and catg in check_list:
            da.add_morenaaptolurl(site[:sp2 + 1] + part + site[sp1:])

        if catg != part:
            return

        # Retrieve the phone name from the url: drop the directory part, the
        # 5-character extension (".html") and the leading "<id>-" prefix.
        page = unescape(str(response.url))
        slug = page[page.rfind("/") + 1:len(page) - 5]
        name = slug[slug.find("-") + 1:]

        hxs = HtmlXPathSelector(response)

        low, high = self._read_price_range(hxs, remove_list)
        price_range = low
        if high != "":
            price_range = str(low) + " to " + str(high)
        da.add_new_naaptolphone(name, str(price_range))

        online_offers = self._collect_online_sellers(hxs, remove_list)
        # The phone row was stored just above; one lookup serves both lists.
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in online_offers:
            da.add_new_ntonlinesp(phone_id, seller, price)

        for seller, price in self._collect_local_sellers(hxs, remove_list):
            da.add_new_ntofflinesp(phone_id, seller, price)

    @staticmethod
    def _strip_tokens(text, tokens):
        """Return text with every occurrence of each token removed."""
        for token in tokens:
            if not token:
                # find("") is always 0, so an empty token (e.g. from a
                # trailing ';' in the code word) would loop forever.
                continue
            # Repeat until stable: a removal can create a new occurrence.
            while text.find(token) != -1:
                text = text.replace(token, "")
        return text

    def _read_price_range(self, hxs, remove_list):
        """Return the (low, high) price strings of the phone; "" if unknown."""
        found = hxs.select(get_code_word("NAAPTOL_XPATH2")).extract()
        try:
            low = found[0].strip()
        except IndexError:
            low = ""
        try:
            high = found[1].strip()
        except IndexError:
            high = ""

        if low == "" and high == "":
            # Fallback: the price is the first single-quoted value inside a
            # script node.
            fallback = hxs.select(get_code_word("NAAPTOL_XPATH3")).extract()
            try:
                script = str(fallback[0])
                quote1 = script.find("'")
                quote2 = script.find("'", quote1 + 1, len(script))
                low = script[quote1 + 1:quote2] + "(approx)"
            except (IndexError, ValueError):
                # No node, or text that str() cannot encode.
                low = ""
            high = ""

        # Reduce the prices to a format containing only numbers and '.'.
        low = self._strip_tokens(low, remove_list).strip()
        high = self._strip_tokens(high, remove_list).strip()

        if low == "Rates Not Available":
            low = high = ""
        return low, high

    @staticmethod
    def _seller_count(hxs, xpath):
        """Return the leading integer of the heading at xpath, else 0."""
        try:
            heading = hxs.select(xpath).extract()[0].strip()
            return int(heading.split()[0])
        except (IndexError, ValueError):
            return 0

    def _parse_seller_price(self, cell_html, remove_list):
        """Return the integer price in a price cell's markup, else None."""
        start = cell_html.find(">")
        end = cell_html.find("<", start)
        # The price is the text after the first tag or, when the cell is
        # stored in the other format, the text after the second tag.
        for _attempt in (1, 2):
            text = self._strip_tokens(cell_html[start + 1:end], remove_list)
            try:
                return int(urllib.unquote(text))
            except ValueError:
                start = cell_html.find(">", end)
                end = cell_html.find("<", start)
        return None

    def _collect_online_sellers(self, hxs, remove_list):
        """Return the (seller name, price) pairs of the online sellers."""
        count = self._seller_count(hxs, get_code_word("NAAPTOL_XPATH4"))
        cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        # Never trust the advertised count beyond the cells actually present.
        rows = min(count, len(cells))
        if rows <= 0:
            return []

        name_prefix = get_code_word("NAAPTOL_XPATH6")
        name_suffix = get_code_word("NAAPTOL_XPATH7")
        offers = []
        for index in range(rows):
            price = self._parse_seller_price(cells[index].extract(),
                                             remove_list)
            # The name xpath embeds the row index between the two halves.
            names = hxs.select(name_prefix + str(index) + name_suffix)
            names = names.extract()
            if price is None or not names:
                msg("skipping online seller row " + str(index))
                continue
            offers.append((urllib.unquote(unescape(names[0])), price))
        return offers

    def _collect_local_sellers(self, hxs, remove_list):
        """Return the (seller name, price) pairs of the offline sellers."""
        count = self._seller_count(hxs, get_code_word("NAAPTOL_XPATH8"))
        cells = hxs.select(get_code_word("NAAPTOL_XPATH9"))
        names = hxs.select(get_code_word("NAAPTOL_XPATH10"))

        offers = []
        # Price cells and name nodes are matched by position.
        for index in range(min(count, len(cells), len(names))):
            price = self._parse_seller_price(cells[index].extract(),
                                             remove_list)
            if price is None:
                msg("skipping local seller row " + str(index))
                continue
            seller = unescape(urllib.unquote(names[index].extract()))
            offers.append((seller, price))
        return offers
315
 
316
# Module-level spider instance. Constructing it runs naaptol_price.__init__,
# so the datastore is initialised and the start urls are loaded at import time.
# NOTE(review): presumably this name is what the Scrapy spider loader looks up - confirm.
SPIDER = naaptol_price()
317