Subversion Repositories SmartDukaan

Rev

Rev 237 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
191 ashish 1
'''
2
Created on 28-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
from time import *
237 ashish 18
from datastore.DataCodeAccessor import *
19
from datastore.DataAccessor import *
191 ashish 20
 
21
import urllib
237 ashish 22
from html2text.unescaping import *
191 ashish 23
 
24
class naaptol_price2(BaseSpider):
    """
    Documentation for class naaptol_price2
    Spider collects the information for the individual phones and stores
    it in table datastore_datadefinition_naaptol_phones, together with the
    price quoted by every online and every local (offline) seller.
    All xpaths and other settings are read through get_code_word.
    """

    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are read from the database and stored in
        start_urls.
        The code-word NAAPTOL_DOMAINNAME2 is the name by which this spider
        is known outside, so it is used as an argument for calling it.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME2")

        # get urls from the database for crawling.
        # A fresh list is bound to the instance on purpose: appending to the
        # inherited start_urls would mutate a list shared through the class.
        da = DataHelper()
        self.start_urls = [pitem.url.strip()
                           for pitem in da.get_allmorenaaptolurls()]

    def start_requests(self):
        """
        Documentation for method start_requests
        To set various properties of the request to be made,
        like the callback and the Referer header.
        @return a list of well formed requests, one per start url, which
        will be crawled by the spider and answered through parse
        """
        # for each request a referer has to be set
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            # setdefault keeps a Referer that is already present
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse
        @param response of individual requests
        Using Xpaths needed information is extracted out of the response
        and added to the database
        Xpath2 = Give us price-range for individual phone
        Xpath3 = Give us price-range for individual phone, if unable to retrieve from xpath2
        Xpath4 = Give us number of onlinesellers for a particular phone
        Xpath5 = Give us price for a particular phone offered by onlinesellers
        Xpath6 and Xpath7 = Give us name of onlinesellers for a particular phone
        Xpath8 = Give us number of offlinesellers for a particular phone
        Xpath9 = Give us price for a particular phone offered by offlinesellers
        Xpath10 = Give us name of offlinesellers for a particular phone
        Removelist = To filter the prices so as to make them integer, for eg remove ',' or 'Rs'
        @raise IndexError when a seller block announces more sellers than
        there are price/name nodes on the page
        @raise ValueError when a seller price cannot be read as an integer
        """
        da = DataHelper()
        # the remove-list is stored as one string separated by ';'
        removelist = get_code_word("NAAPTOL_REMOVELIST").split(';')
        name = self._phone_name(response.url)
        hxs = HtmlXPathSelector(response)

        price_range = self._price_range(hxs, removelist)
        da.add_new_naaptolphone(name, price_range)

        # Sellers are written only after the whole block was parsed, so a
        # page that fails half way stores no sellers of that block.
        online_sellers = self._online_sellers(hxs, removelist)
        nid = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in online_sellers:
            da.add_new_ntonlinesp(nid, seller_name, seller_price)

        local_sellers = self._local_sellers(hxs, removelist)
        for seller_name, seller_price in local_sellers:
            da.add_new_ntofflinesp(nid, seller_name, seller_price)

    @staticmethod
    def _strip_tokens(text, tokens):
        """
        Remove every occurrence of each token from text.
        Removal is repeated until the token no longer occurs, because
        deleting one occurrence can join the surrounding characters into a
        new one. Empty tokens are skipped: an empty string is found in
        every string, so it would never stop being found.
        @param text string to clean
        @param tokens iterable of strings to delete
        @return the cleaned string
        """
        for token in tokens:
            if not token:
                continue
            while token in text:
                text = text.replace(token, "")
        return text

    @staticmethod
    def _phone_name(url):
        """
        Retrieve the phone name from the url of its page.
        The last path segment is taken, its last 5 characters are dropped
        and everything up to and including the first '-' is removed.
        @param url url of the response
        @return the phone name
        """
        name = unescape(str(url))
        # len(name) still refers to the complete url here
        name = name[name.rfind("/") + 1:len(name) - 5]
        return name[name.find("-") + 1:]

    def _price_range(self, hxs, removelist):
        """
        Build the price range shown for the phone.
        @param hxs selector built on the response
        @param removelist strings to delete from the prices
        @return "<low> to <high>", only the low price when there is no
        high price, or an empty string when no price is available
        """
        prices = hxs.select(get_code_word("NAAPTOL_XPATH2"))
        # low and high determine the range; a missing entry is not an error
        try:
            low = prices.extract()[0].strip()
        except Exception:
            low = ""
        try:
            high = prices.extract()[1].strip()
        except Exception:
            high = ""

        if low == "" and high == "":
            # fall back to the price written between single quotes in the
            # script text selected by Xpath3
            try:
                script = hxs.select(get_code_word("NAAPTOL_XPATH3"))
                script = str(script.extract()[0])
                start = script.find("'")
                end = script.find("'", start + 1, len(script))
                low = script[start + 1:end] + "(approx)"
            except Exception:
                low = high = ""

        # removelist is used for converting price to decimal format
        # containing only numbers and '.'
        low = self._strip_tokens(low, removelist).strip()
        high = self._strip_tokens(high, removelist).strip()

        if low == "Rates Not Available":
            low = high = ""

        if high != "":
            return str(low) + " to " + str(high)
        return low

    @staticmethod
    def _seller_count(hxs, code_word):
        """
        Read the number of sellers announced in a block heading.
        The heading text is cut at its first space and the part before it
        is converted to an integer.
        @param hxs selector built on the response
        @param code_word code-word of the xpath selecting the heading text
        @return the announced count, 0 when it cannot be determined
        """
        try:
            heading = hxs.select(get_code_word(code_word))
            heading = str(heading.extract()[0]).decode("utf-8").strip()
            return int(heading[0:heading.find(" ")])
        except Exception:
            # best effort: without a readable heading no seller is read
            return 0

    def _seller_price(self, markup, removelist):
        """
        Extract the integer price from the markup of a price cell.
        The text directly after the first tag is tried first; when that is
        not a number the text after the following tag is used, because the
        price is stored in one of two formats.
        @param markup extracted markup of the price cell
        @param removelist strings to delete from the price
        @return the price as integer
        @raise ValueError when neither text is a number
        """
        tag_end = markup.find(">")
        next_tag = markup.find("<", tag_end)
        text = self._strip_tokens(markup[tag_end + 1:next_tag], removelist)
        try:
            return int(urllib.unquote(text))
        except ValueError:
            # stored in format different than previous one
            tag_end = markup.find(">", next_tag)
            next_tag = markup.find("<", tag_end)
            text = self._strip_tokens(markup[tag_end + 1:next_tag],
                                      removelist)
            return int(urllib.unquote(text))

    def _online_sellers(self, hxs, removelist):
        """
        Collect name and price of every online seller of the phone.
        @param hxs selector built on the response
        @param removelist strings to delete from the prices
        @return list of (name, price) tuples in page order
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH4")
        price_nodes = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        sellers = []
        if count <= 0:
            return sellers

        # the name xpath is Xpath6 + index of the seller + Xpath7
        name_head = get_code_word("NAAPTOL_XPATH6")
        name_tail = get_code_word("NAAPTOL_XPATH7")
        for index in range(count):
            price = self._seller_price(price_nodes[index].extract(),
                                       removelist)
            seller = hxs.select(name_head + str(index) + name_tail)
            seller = urllib.unquote(unescape(seller.extract()[0]))
            sellers.append((seller, price))
        return sellers

    def _local_sellers(self, hxs, removelist):
        """
        Collect name and price of every local (offline) seller of the phone.
        @param hxs selector built on the response
        @param removelist strings to delete from the prices
        @return list of (name, price) tuples in page order
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH8")
        price_nodes = hxs.select(get_code_word("NAAPTOL_XPATH9"))
        name_nodes = hxs.select(get_code_word("NAAPTOL_XPATH10"))
        sellers = []
        for index in range(count):
            markup = price_nodes[index].extract()
            seller = unescape(urllib.unquote(name_nodes[index].extract()))
            sellers.append((seller, self._seller_price(markup, removelist)))
        return sellers
285
 
191 ashish 286
# Module-level spider instance. Creating it runs the constructor, which calls
# initialize_table() and reads the start urls through DataHelper, so that work
# happens as soon as this module is imported.
# NOTE(review): presumably SPIDER is the name the Scrapy spider loader looks
# for in this Scrapy version -- confirm.
SPIDER = naaptol_price2()
287