Subversion Repositories SmartDukaan

Rev

Rev 189 | Rev 262 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
189 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
from scrapy.spider import BaseSpider
7
from scrapy.selector import HtmlXPathSelector
8
from scrapy.http import Request
9
 
10
from demo.items import DemoItem
11
from scrapy.contrib.spidermiddleware import referer
12
from scrapy.http.headers import Headers
13
from scrapy.http.request.form import FormRequest
14
from scrapy.log import msg
15
from scrapy.http.response import Response
16
from time import *
236 ashish 17
from datastore.DataCodeAccessor import *
18
from datastore.DataAccessor import *
189 ashish 19
 
20
import urllib
236 ashish 21
from html2text.unescaping import *
189 ashish 22
 
23
class naaptol_price(BaseSpider):
24
 
236 ashish 25
    def __init__(self):
        """Configure the spider's domain name and seed URLs from the datastore.

        Calls initialize_table(), reads the NAAPTOL_DOMAINNAME1 code word into
        self.domain_name, and loads every URL returned by
        DataHelper.get_allnaaptolurls() (whitespace-stripped) into
        self.start_urls.
        """
        initialize_table()
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME1")

        # Get the urls from the database and collect them for crawling.
        # A fresh per-instance list is built instead of appending in place:
        # start_urls is never assigned in this class, so an in-place append
        # would mutate whatever list object is inherited.
        # NOTE(review): presumably that is a class-level list on BaseSpider
        # shared by every spider - confirm. Inherited entries are preserved.
        da = DataHelper()
        self.start_urls = list(self.start_urls) + [
            pitem.url.strip() for pitem in da.get_allnaaptolurls()
        ]
37
 
38
    def start_requests(self):
        """Return one Request per start URL, each with a Referer header.

        The Referer value comes from the NAAPTOL_REFERER code word and is only
        applied as a default (an already-present Referer is left untouched).
        Every request is routed to self.parse.
        """
        referer_value = get_code_word("NAAPTOL_REFERER")

        def build(start_url):
            # For each request a referer has to be set.
            req = Request(url=str(start_url), callback=self.parse)
            req.headers.setdefault("Referer", referer_value)
            return req

        return [build(start_url) for start_url in self.start_urls]
236 ashish 48
 
189 ashish 49
    def parse(self, response):
        """Scrape one naaptol page and store its price data.

        The second-to-last path segment of the unescaped URL is treated as
        the page category:

        * If the category appears in the ';'-separated NAAPTOL_CHKLIST2 code
          word, the same URL with that segment swapped for NAAPTOL_PART is
          queued through DataHelper.add_morenaaptolurl (both pages carry the
          same data in different formats).
        * If the category equals NAAPTOL_PART, the phone name, its price
          range and the online / local seller prices are extracted and
          written through DataHelper.

        Returns None. Errors while reading a seller row (missing node,
        non-numeric price) propagate to the caller.
        """
        msg(response.url)
        site = unescape(response.url)
        last_slash = site.rfind("/")
        prev_slash = site.rfind("/", 0, last_slash - 1)
        category = site[prev_slash + 1:last_slash]
        da = DataHelper()

        # Code words replace the former hard-coded settings, e.g.
        # NAAPTOL_CHKLIST2 = "price", NAAPTOL_PART = "features",
        # NAAPTOL_REMOVELIST = "Rs.;," (lists are separated by ';').
        checklist = get_code_word("NAAPTOL_CHKLIST2").split(';')
        features_part = get_code_word("NAAPTOL_PART")
        remove_tokens = get_code_word("NAAPTOL_REMOVELIST").split(';')

        # Swap only the category segment. A plain str.replace on the whole
        # URL would also rewrite the same text inside the host or the
        # product name.
        features_url = site[:prev_slash + 1] + features_part + site[last_slash:]
        for entry in checklist:
            if entry == category:
                da.add_morenaaptolurl(features_url)

        if category != features_part:
            return

        # Retrieve the name from the url: drop the last 5 characters of the
        # final path segment (".html" in the sample URLs) and everything up
        # to and including the first '-'.
        name = unescape(str(response.url))
        name = name[name.rfind("/") + 1:len(name) - 5]
        name = name[name.find("-") + 1:]

        hxs = HtmlXPathSelector(response)
        price_low, price_high = self._extract_price_pair(hxs)

        # The remove list reduces a price to digits and '.' only.
        price_low = self._strip_tokens(price_low, remove_tokens).strip()
        price_high = self._strip_tokens(price_high, remove_tokens).strip()

        if price_low == "Rates Not Available":
            price_low = price_high = ""

        # Stored range is "<low>" or "<low> to <high>".
        price_range = price_low
        if price_high != "":
            price_range = str(price_low) + " to " + str(price_high)

        da.add_new_naaptolphone(name, str(price_range))

        online = self._online_sellers(hxs, remove_tokens)
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in online:
            da.add_new_ntonlinesp(phone_id, seller_name, seller_price)

        local = self._local_sellers(hxs, remove_tokens)
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in local:
            da.add_new_ntofflinesp(phone_id, seller_name, seller_price)

    @staticmethod
    def _strip_tokens(text, tokens):
        """Return text with every occurrence of each token removed.

        Removal is repeated until the token no longer occurs, so tokens that
        re-form after a removal are stripped as well. Empty tokens (e.g. from
        a trailing ';' in the code word) are skipped: an empty string is
        always "found", which would otherwise loop forever.
        """
        for token in tokens:
            if not token:
                continue
            while token in text:
                text = text.replace(token, "")
        return text

    def _extract_price_pair(self, hxs):
        """Return the (low, high) price strings shown on the page.

        The first two NAAPTOL_XPATH2 matches are used, stripped; a missing
        match yields "". When both are empty, the price is read from between
        the first pair of single quotes in the first NAAPTOL_XPATH3 match and
        suffixed with "(approx)"; if that fails too, both stay "".
        """
        found = hxs.select(get_code_word("NAAPTOL_XPATH2")).extract()
        low = found[0].strip() if len(found) > 0 else ""
        high = found[1].strip() if len(found) > 1 else ""

        if low == "" and high == "":
            try:
                script = str(
                    hxs.select(get_code_word("NAAPTOL_XPATH3")).extract()[0])
                first_quote = script.find("'")
                second_quote = script.find("'", first_quote + 1, len(script))
                low = script[first_quote + 1:second_quote] + "(approx)"
            except Exception:
                # Best effort: no usable fallback price on this page.
                low = ""
        return low, high

    def _seller_count(self, hxs, xpath_code):
        """Return the seller count announced in a section heading, or 0.

        The heading is the first match of the XPath stored under xpath_code;
        the text before its first space is parsed as an integer. Any failure
        (no heading, non-numeric text, ...) is treated as "no sellers".
        """
        try:
            heading = str(hxs.select(get_code_word(xpath_code)).extract()[0])
            heading = heading.decode("utf-8").strip()
            return int(heading[0:heading.find(" ")])
        except Exception:
            return 0

    def _parse_price_cell(self, cell_html, remove_tokens):
        """Return the integer price contained in a price cell's HTML.

        The text between the first '>' and the following '<' is tried first.
        If it is not an integer, the price is stored one tag deeper, so the
        text after the next '>' is used instead. Raises ValueError when
        neither candidate is numeric.
        """
        tag_end = cell_html.find(">")
        next_tag = cell_html.find("<", tag_end)
        raw = self._strip_tokens(cell_html[tag_end + 1:next_tag], remove_tokens)
        raw = urllib.unquote(raw)
        try:
            return int(raw)
        except ValueError:
            # Stored in a format different from the previous one.
            tag_end = cell_html.find(">", next_tag)
            next_tag = cell_html.find("<", tag_end)
            raw = self._strip_tokens(cell_html[tag_end + 1:next_tag],
                                     remove_tokens)
            return int(urllib.unquote(raw))

    def _online_sellers(self, hxs, remove_tokens):
        """Return a list of (seller name, price) for the online sellers.

        The number of rows read is the count from the NAAPTOL_XPATH4 heading.
        Prices come from the NAAPTOL_XPATH5 cells; the name of row i is the
        first match of NAAPTOL_XPATH6 + str(i) + NAAPTOL_XPATH7.
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH4")
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH5"))
        if count <= 0:
            return []

        # Loop-invariant lookups, fetched once instead of once per row.
        name_prefix = get_code_word("NAAPTOL_XPATH6")
        name_suffix = get_code_word("NAAPTOL_XPATH7")

        sellers = []
        for index in range(count):
            price = self._parse_price_cell(price_cells[index].extract(),
                                           remove_tokens)
            name_path = name_prefix + str(index) + name_suffix
            seller_name = hxs.select(name_path).extract()[0]
            seller_name = urllib.unquote(unescape(seller_name))
            sellers.append((seller_name, price))
        return sellers

    def _local_sellers(self, hxs, remove_tokens):
        """Return a list of (seller name, price) for the local stores.

        The number of rows read is the count from the NAAPTOL_XPATH8 heading.
        Prices come from the NAAPTOL_XPATH9 cells and names from the
        NAAPTOL_XPATH10 matches, paired by position.
        """
        count = self._seller_count(hxs, "NAAPTOL_XPATH8")
        price_cells = hxs.select(get_code_word("NAAPTOL_XPATH9"))
        name_nodes = hxs.select(get_code_word("NAAPTOL_XPATH10"))

        sellers = []
        for index in range(count):
            cell_html = price_cells[index].extract()
            seller_name = name_nodes[index].extract()
            seller_name = unescape(urllib.unquote(seller_name))
            price = self._parse_price_cell(cell_html, remove_tokens)
            sellers.append((seller_name, price))
        return sellers
274
 
275
# Module-level spider instance; constructing it runs __init__, which reads the
# seed URLs from the datastore at import time.
# NOTE(review): presumably this name is what the Scrapy version in use looks
# up to discover the spider - confirm.
SPIDER = naaptol_price()
276