Subversion Repositories SmartDukaan

Rev

Rev 236 | Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
189 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
from scrapy.spider import BaseSpider
7
from scrapy.selector import HtmlXPathSelector
8
from scrapy.http import Request
9
 
10
from demo.items import DemoItem
11
from scrapy.contrib.spidermiddleware import referer
12
from scrapy.http.headers import Headers
13
from scrapy.http.request.form import FormRequest
14
from scrapy.log import msg
15
from scrapy.http.response import Response
16
from time import *
17
 
18
from datastore import DataAccessor
19
from datastore.DataAccessor import DataHelper
20
import urllib
21
 
22
class naaptol_price(BaseSpider):
    '''
    Spider that scrapes product prices from naaptol.com pages.

    Start URLs are read through DataHelper when the spider is constructed.
    A page whose URL category segment is "price" is not scraped; only the
    URL of the matching "features" page is handed to DataHelper. Every other
    page is scraped for the product's price range and for the prices quoted
    by online and local sellers, all of which are stored through DataHelper.
    '''

    # File that receives the raw body of each scraped product page (debug
    # aid, overwritten on every page). Set to None to disable the dump.
    debug_dump_path = '/home/gaurav/Desktop/response.txt'

    PRICE_XPATH = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
    SCRIPT_PRICE_XPATH = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()'
    HEADING_XPATH = '//div[@id="%s"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
    ONLINE_PRICE_XPATH = '//div[@id="onSellerContents"]//td[@class="price"]'
    ONLINE_NAME_XPATH = '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop%d"]/span/text()'
    LOCAL_PRICE_XPATH = '//div[@id="offSellerContents"]//td[@class="price"]'
    LOCAL_NAME_XPATH = '//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()'

    def __init__(self):
        '''
        Set the spider's domain name and load its start URLs from DataHelper.
        '''
        self.domain_name = "naaptolphones"
        stored_urls = [item.url.strip() for item in DataHelper().get_allnaaptolurls()]
        # Build a per-instance list: appending to self.start_urls in place
        # would mutate the list shared at class level, so every further
        # instance would see (and duplicate) these URLs.
        self.start_urls = list(self.start_urls) + stored_urls

    def start_requests(self):
        '''
        Return one Request per start URL, each routed to parse() and given a
        default Referer header of "www.naaptol.com".
        '''
        requests = []
        for start_url in self.start_urls:
            request = Request(url=start_url, callback=self.parse)
            request.headers.setdefault("Referer", "www.naaptol.com")
            requests.append(request)
        return requests

    def parse(self, response):
        '''
        Handle one downloaded page.

        For a ".../price/<page>" URL only the corresponding
        ".../features/<page>" URL is passed to DataHelper.add_morenaaptolurl.
        For any other URL the page is scraped: the phone and its price range
        are stored with add_new_naaptolphone, then each online seller with
        add_new_ntonlinesp and each local seller with add_new_ntofflinesp.

        Returns None in both cases.
        '''
        prefix, category, page = self._split_url(response.url)
        da = DataHelper()
        if category == "price":
            # Swap only the category segment; a plain str.replace would also
            # rewrite "price" inside the host or the product name.
            da.add_morenaaptolurl("/".join((prefix, "features", page)))
            return

        self._dump_response(response.body)
        print("  url  " + response.url)

        # Product name is the last URL segment minus its 5-character
        # extension (assumes the page name ends in ".html").
        name = str(page)[:-5]
        hxs = HtmlXPathSelector(response)

        low, high = self._extract_price_bounds(hxs)
        print(name)
        print(low)
        print(high)
        print("\n")

        price_range = low
        if high != "":
            price_range = low + " to " + high
        da.add_new_naaptolphone(name, price_range)

        online_sellers = self._collect_sellers(
            self._seller_count(hxs, "OnlineSellers"),
            hxs.select(self.ONLINE_PRICE_XPATH),
            lambda index: self._online_seller_name(hxs, index))

        # The stored phone's id links every seller row to it; one lookup is
        # enough because name and price range do not change below.
        phone_id = da.get_naaptolphone(name, price_range).id
        for seller_name, seller_price in online_sellers:
            da.add_new_ntonlinesp(phone_id, seller_name, seller_price)

        local_names = hxs.select(self.LOCAL_NAME_XPATH)
        local_sellers = self._collect_sellers(
            self._seller_count(hxs, "LocalStores"),
            hxs.select(self.LOCAL_PRICE_XPATH),
            lambda index: self._local_seller_name(local_names, index))
        for seller_name, seller_price in local_sellers:
            da.add_new_ntofflinesp(phone_id, seller_name, seller_price)

    @staticmethod
    def _split_url(url):
        '''
        Split url into (prefix, category, page) on its last two slashes.

        category is "" when the URL has fewer than two slashes.
        '''
        parts = url.rsplit("/", 2)
        if len(parts) < 3:
            return "", "", parts[-1]
        return parts[0], parts[1], parts[2]

    def _dump_response(self, body):
        '''
        Write body to debug_dump_path; a failed write is reported, not raised.
        '''
        if not self.debug_dump_path:
            return
        try:
            # "wb": the response body is raw bytes. The with-block closes the
            # file even if the write fails.
            with open(self.debug_dump_path, 'wb') as dump:
                dump.write(body)
        except (IOError, OSError) as err:
            print("  could not write debug dump: %s" % err)

    @staticmethod
    def _clean_text(value):
        '''
        Return value as stripped text, decoding it from UTF-8 only when it is
        a byte string (decoding an already-unicode value fails on non-ASCII).
        '''
        if isinstance(value, bytes):
            value = value.decode("utf-8")
        return value.strip()

    @staticmethod
    def _strip_currency(text):
        '''
        Remove the "Rs." marker and thousands separators from a price text.
        '''
        return text.replace("Rs.", "").replace(",", "").strip()

    def _extract_price_bounds(self, hxs):
        '''
        Return (low, high) price texts for the product; either may be "".

        The first two price spans are used. When both are empty, the first
        quoted value of the price script is used as low, suffixed with
        "(approx)". "Rates Not Available" is reported as two empty strings.
        '''
        texts = hxs.select(self.PRICE_XPATH).extract()
        low = self._clean_text(texts[0]) if len(texts) > 0 else ""
        high = self._clean_text(texts[1]) if len(texts) > 1 else ""

        if low == "" and high == "":
            scripts = hxs.select(self.SCRIPT_PRICE_XPATH).extract()
            if scripts:
                script = scripts[0]
                quote_open = script.find("'")
                quote_close = script.find("'", quote_open + 1, len(script))
                low = script[quote_open + 1:quote_close] + "(approx)"

        low = self._strip_currency(low)
        high = self._strip_currency(high)
        if low == "Rates Not Available":
            low = high = ""
        return low, high

    def _seller_count(self, hxs, section_id):
        '''
        Return the leading integer of the heading inside the div whose id is
        section_id, or 0 when the heading is missing or not numeric.
        '''
        headings = hxs.select(self.HEADING_XPATH % section_id).extract()
        if not headings:
            return 0
        # split() keeps the whole text when there is no space; slicing with
        # find()'s -1 would silently drop the last digit.
        first_word = self._clean_text(headings[0]).split(" ", 1)[0]
        try:
            return int(first_word)
        except ValueError:
            return 0

    @staticmethod
    def _parse_price_cell(cell_html):
        '''
        Return the integer price found in a price cell's HTML, or None.

        The text after the first tag is tried first; if that is not an
        integer, the text after the following tag is tried.
        '''
        search_from = 0
        for _attempt in (0, 1):
            tag_end = cell_html.find(">", search_from)
            next_tag = cell_html.find("<", tag_end)
            text = cell_html[tag_end + 1:next_tag]
            text = text.replace("Rs.", "").replace(",", "")
            text = urllib.unquote(text)
            try:
                return int(text)
            except ValueError:
                search_from = next_tag
        return None

    def _online_seller_name(self, hxs, index):
        '''
        Return the text of the online seller link numbered index, or None.
        '''
        found = hxs.select(self.ONLINE_NAME_XPATH % index).extract()
        if not found:
            return None
        return found[0]

    @staticmethod
    def _local_seller_name(name_nodes, index):
        '''
        Return the text of the index-th local store heading, or None.
        '''
        if index >= len(name_nodes):
            return None
        return name_nodes[index].extract()

    def _collect_sellers(self, count, price_cells, name_at):
        '''
        Return a list of (name, price) pairs for up to count sellers.

        count        -- number of sellers announced by the section heading
        price_cells  -- selector list of the sellers' price cells
        name_at      -- callable mapping a seller index to its name or None

        Iteration is bounded by the cells actually present, and a seller
        whose name or price cannot be read is skipped with a printed notice.
        '''
        sellers = []
        for index in range(min(count, len(price_cells))):
            price = self._parse_price_cell(price_cells[index].extract())
            seller_name = name_at(index)
            if price is None or seller_name is None:
                print("  skipping seller %d: missing name or unparseable price" % index)
                continue
            seller_name = urllib.unquote(seller_name)
            print(price)
            print(seller_name)
            sellers.append((seller_name, price))
        return sellers
281
# Module-level spider instance. Constructing it runs naaptol_price.__init__,
# so the start URLs are read through DataHelper as soon as this module is
# imported.
# NOTE(review): presumably this SPIDER name is what the Scrapy version in use
# looks for when discovering spiders - confirm.
SPIDER = naaptol_price()
282