Subversion Repositories SmartDukaan

Rev

Rev 263 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
191 ashish 1
'''
2
Created on 28-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
from time import *
237 ashish 18
from datastore.DataCodeAccessor import *
19
from datastore.DataAccessor import *
191 ashish 20
 
21
import urllib
237 ashish 22
from html2text.unescaping import *
191 ashish 23
 
24
class naaptol_price2(BaseSpider):
    """
    Spider that collects the information for individual Naaptol phones.

    Every crawled phone page yields the phone's name and price range plus
    the prices quoted by its online and local (offline) sellers. Everything
    is stored through DataHelper (the original notes name the target table
    as datastore_datadefinition_naaptol_phones).
    """

    def __init__(self):
        """
        Register the spider's domain name and load its start URLs.

        initialize_table is called to make all the tables known in the scope
        of this class. The domain name (the name by which this spider is
        known outside, used as the argument for calling it) is read from the
        NAAPTOL_DOMAINNAME2 code word, and every URL returned by
        DataHelper.get_allmorenaaptolurls() is queued in start_urls.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME2")

        # Get urls from the database and queue them for crawling.
        # The list is copied first so that the URLs end up on this instance
        # only: appending in place would also change a list inherited from
        # the base class, if that is where start_urls is defined.
        da = DataHelper()
        start_urls = list(self.start_urls)
        for pitem in da.get_allmorenaaptolurls():
            start_urls.append(pitem.url.strip())
        self.start_urls = start_urls

    def start_requests(self):
        """
        Build the initial requests, setting the Referer header on each.

        @return a list with one Request per entry of start_urls; each request
        uses parse as its callback and carries the NAAPTOL_REFERER code word
        as its Referer header.
        """
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            # setdefault leaves a Referer alone if one is already present.
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    @staticmethod
    def _strip_tokens(text, tokens):
        """Return text with every occurrence of each token removed."""
        for token in tokens:
            if not token:
                # "" is found in every string, so the removal loop below
                # would never terminate (e.g. remove list ending in ';').
                continue
            # Repeat until stable: a removal can join the surrounding
            # characters into a new occurrence of the token.
            while token in text:
                text = text.replace(token, "")
        return text

    @staticmethod
    def _phone_name(url):
        """Derive the phone name from the last path segment of the page URL."""
        name = unescape(str(url))
        # Keep what follows the last '/', minus the final five characters
        # (presumably a ".html" extension -- TODO confirm).
        name = name[name.rfind("/") + 1:len(name) - 5]
        # Drop everything up to and including the first '-'.
        return name[name.find("-") + 1:]

    @staticmethod
    def _seller_count(hxs, xpath_code):
        """Read the leading integer of a sellers heading; 0 if unavailable."""
        try:
            heading = str(hxs.select(get_code_word(xpath_code)).extract()[0])
            heading = heading.decode("utf-8").strip()
            # NOTE(review): when the heading holds no space, find() returns
            # -1 and this slice drops the last character. Kept as is because
            # the count drives how many seller rows are indexed -- confirm
            # the heading format before changing it.
            return int(heading[0:heading.find(" ")])
        except Exception:
            # Best effort: a missing or unreadable heading means no sellers.
            return 0

    def _seller_price(self, markup, remove_tokens):
        """
        Convert the markup of one seller price cell into an integer price.

        The text between the first '>' and the following '<' is tried first;
        when it is not a number the next '>...<' span is used instead (the
        price is stored in a different format on some rows). A ValueError
        from that second attempt propagates to the caller.
        """
        start = markup.find(">")
        end = markup.find("<", start)
        text = self._strip_tokens(markup[start + 1:end], remove_tokens)
        try:
            return int(urllib.unquote(text))
        except ValueError:
            start = markup.find(">", end)
            end = markup.find("<", start)
            text = self._strip_tokens(markup[start + 1:end], remove_tokens)
            return int(urllib.unquote(text))

    def _price_range(self, hxs, remove_tokens):
        """Return the phone's price range as 'low', 'low to high' or ''."""
        prices = hxs.select(get_code_word("NAAPTOL_XPATH2"))
        try:
            price1 = prices.extract()[0].strip()
        except Exception:
            price1 = ""
        try:
            price2 = prices.extract()[1].strip()
        except Exception:
            price2 = ""

        try:
            if price1 == "" and price2 == "":
                # Fall back to the script text and take the first
                # single-quoted value as an approximate price.
                prices = hxs.select(get_code_word("NAAPTOL_XPATH3"))
                price = str(prices.extract()[0])
                pos1 = price.find("'")
                pos2 = price.find("'", pos1 + 1, len(price))
                price1 = price[pos1 + 1:pos2] + "(approx)"
                price2 = ""
        except Exception:
            price1 = price2 = ""

        # Remove the configured substrings (e.g. 'Rs.' and ',') so that only
        # the number is left.
        price1 = self._strip_tokens(price1, remove_tokens).strip()
        price2 = self._strip_tokens(price2, remove_tokens).strip()

        if price1 == "Rates Not Available":
            price1 = price2 = ""

        if price2 != "":
            return str(price1) + " to " + str(price2)
        return price1

    def parse(self, response):
        """
        Extract one phone's prices from its page and store them.

        @param response of an individual request
        Using XPaths (each read through get_code_word) the needed information
        is extracted out of the response and added to the database:
        NAAPTOL_XPATH2 = price range for the individual phone
        NAAPTOL_XPATH3 = price range, if unable to retrieve it from XPATH2
        NAAPTOL_XPATH4 = number of online sellers for the phone
        NAAPTOL_XPATH5 = prices for the phone offered by online sellers
        NAAPTOL_XPATH6 and NAAPTOL_XPATH7 = names of the online sellers
        NAAPTOL_XPATH8 = number of offline sellers for the phone
        NAAPTOL_XPATH9 = prices for the phone offered by offline sellers
        NAAPTOL_XPATH10 = names of the offline sellers
        NAAPTOL_REMOVELIST = ';'-separated substrings removed from prices so
        that they can be converted to integers, e.g. ',' or 'Rs.'

        Errors caused by a seller row that is missing or whose price cannot
        be converted are not caught and propagate to the caller.
        """
        da = DataHelper()
        remove_tokens = str(get_code_word("NAAPTOL_REMOVELIST")).split(';')

        # The name is retrieved from the url.
        name = self._phone_name(response.url)
        hxs = HtmlXPathSelector(response)

        price_range = self._price_range(hxs, remove_tokens)
        da.add_new_naaptolphone(name, price_range)

        # Online sellers: the heading gives the count, each price cell gives
        # a price and the seller name sits under an id suffixed by the index.
        online_count = self._seller_count(hxs, "NAAPTOL_XPATH4")
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        online_names = []
        online_prices = []
        if online_count > 0:
            # Loop-invariant lookups, done once instead of once per seller.
            name_xpath_head = get_code_word("NAAPTOL_XPATH6")
            name_xpath_tail = get_code_word("NAAPTOL_XPATH7")
            for index in range(online_count):
                markup = price_cells[index].extract()
                online_prices.append(self._seller_price(markup, remove_tokens))

                path = name_xpath_head + str(index) + name_xpath_tail
                seller = hxs.select(path).extract()[0]
                online_names.append(urllib.unquote(unescape(seller)))

        # The id of the phone stored above; used for both seller tables.
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller, price in zip(online_names, online_prices):
            da.add_new_ntonlinesp(phone_id, seller, price)

        # Local (offline) sellers: prices and names come from two parallel
        # node lists indexed by position.
        local_count = self._seller_count(hxs, "NAAPTOL_XPATH8")
        local_price_xpath = get_code_word("NAAPTOL_XPATH9")
        local_name_xpath = get_code_word("NAAPTOL_XPATH10")
        price_cells = hxs.select(local_price_xpath)
        name_nodes = hxs.select(local_name_xpath)
        local_names = []
        local_prices = []
        for index in range(local_count):
            markup = price_cells[index].extract()
            seller = name_nodes[index].extract()
            seller = unescape(urllib.unquote(seller))
            local_prices.append(self._seller_price(markup, remove_tokens))
            local_names.append(seller)

        for seller, price in zip(local_names, local_prices):
            da.add_new_ntofflinesp(phone_id, seller, price)
286
 
191 ashish 287
# Module-level spider instance. Creating it runs naaptol_price2.__init__ when
# this module is imported, which initializes the tables and loads the start
# URLs through DataHelper.
# NOTE(review): presumably Scrapy locates the spider through this SPIDER
# name -- confirm against the Scrapy version in use.
SPIDER = naaptol_price2()
288