Subversion Repositories SmartDukaan

Rev

Rev 189 | Rev 262 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 189 Rev 236
Line 12... Line 12...
12
from scrapy.http.headers import Headers
12
from scrapy.http.headers import Headers
13
from scrapy.http.request.form import FormRequest
13
from scrapy.http.request.form import FormRequest
14
from scrapy.log import msg
14
from scrapy.log import msg
15
from scrapy.http.response import Response
15
from scrapy.http.response import Response
16
from time import *
16
from time import *
-
 
17
from datastore.DataCodeAccessor import *
-
 
18
from datastore.DataAccessor import *
17
 
19
 
18
from datastore import DataAccessor
-
 
19
from datastore.DataAccessor import DataHelper
-
 
20
import urllib
20
import urllib
-
 
21
from html2text.unescaping import *
21
 
22
 
22
class naaptol_price(BaseSpider):
23
class naaptol_price(BaseSpider):
23
    
24
    
24
    def __init__(self):
25
    def __init__(self): 
-
 
26
       initialize_table()
25
       self.domain_name = "naaptolphones"
27
       #NAAPTOL_DOMAINNAME1 = "naaptol1"   
26
       #self.start_urls.append("http://www.naaptol.com/price/10415-Fly-Hummer-HT1.html")
28
       NAAPTOL_DOMAINNAME1 = get_code_word("NAAPTOL_DOMAINNAME1")
-
 
29
       self.domain_name = NAAPTOL_DOMAINNAME1 
27
       
30
       
-
 
31
       # get urls from the database and append them in the list for crawling
28
       da = DataHelper()
32
       da = DataHelper()
-
 
33
       #url = "http://www.naaptol.com/features/10417-Fly-E300.html"
-
 
34
       #self.start_urls.append(url)
29
       for pitem in da.get_allnaaptolurls():
35
       for pitem in da.get_allnaaptolurls():
30
            self.start_urls.append(pitem.url.strip())
36
            self.start_urls.append(pitem.url.strip())
31
    
37
    
32
    def start_requests(self):
38
    def start_requests(self):
-
 
39
        #for each request a referer has to be set
33
        listreq = []
40
        listreq = []
-
 
41
        #NAAPTOL_REFERER = "http://www.google.com"
-
 
42
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
34
        for url1 in self.start_urls:
43
        for url1 in self.start_urls:
35
            request = Request(url = url1, callback=self.parse)
44
            request = Request(url = str(url1), callback=self.parse)
36
            request.headers.setdefault("Referer", "www.naaptol.com")
45
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
37
            listreq.append(request)
46
            listreq.append(request)
38
        return listreq
47
        return listreq
39
       
48
    
40
    def parse(self, response):
49
    def parse(self, response):
-
 
50
        # there are two different types of urls: one contains features and the other contains the price
-
 
51
        #both have to be processed differently
41
       #msg(response.body)
52
        msg(response.url)
42
        site = response.url
53
        site = response.url
-
 
54
        site = unescape(site)
43
        sp1 = site.rfind("/")
55
        sp1 = site.rfind("/")
44
        sp2 = site.rfind("/",0,sp1-1)
56
        sp2 = site.rfind("/",0,sp1-1)
45
        catg = site[sp2+1:sp1]
57
        catg = site[sp2+1:sp1]
46
        da = DataHelper()
58
        da = DataHelper()
-
 
59
        #change price to features and add to urls as both provide the same data but in different formats
-
 
60
        #otherwise crawl the url containing features
-
 
61
        #NAAPTOL_CHKLIST2 = ['price']
-
 
62
        #list separated by ';'
-
 
63
        NAAPTOL_CHKLIST2 = get_code_word("NAAPTOL_CHKLIST2")
-
 
64
        NAAPTOL_CHKLIST2 = NAAPTOL_CHKLIST2.split(';')
-
 
65
        #NAAPTOL_PART = "features"
-
 
66
        NAAPTOL_PART = get_code_word("NAAPTOL_PART")
-
 
67
        #NAAPTOL_REMOVELIST = ["Rs.",","]
-
 
68
        NAAPTOL_REMOVELIST = get_code_word("NAAPTOL_REMOVELIST")
-
 
69
        NAAPTOL_REMOVELIST = NAAPTOL_REMOVELIST.split(';')
-
 
70
        for c in NAAPTOL_CHKLIST2:
47
        if catg == "price":
71
            if c == catg:
48
            site = site.replace("price","features")
72
                site = site.replace(c,NAAPTOL_PART)
49
            da.add_morenaaptolurl(site)
73
                da.add_morenaaptolurl(site)
50
        else:    
74
                
51
            f = open('/home/gaurav/Desktop/response.txt', 'w')
75
                
52
            f.write(response.body)
76
        if catg == NAAPTOL_PART:    
53
            print "  url  " + response.url
77
            #retrieving name from the url
54
            name = str(response.url)
78
            name = str(response.url)
-
 
79
            name = unescape(name)
55
            name_pos = name.rfind("/")
80
            name_pos = name.rfind("/")
56
            name = name[name_pos+1:len(name)-5]
81
            name = name[name_pos+1:len(name)-5]
-
 
82
            name_pos = name.find("-")
-
 
83
            name = name[name_pos+1:len(name)]
-
 
84
            
57
            hxs = HtmlXPathSelector(response)
85
            hxs = HtmlXPathSelector(response)  
58
           #prices = hxs.select('//table[@class ="ProductDetails"]/tbody/tr[6]/td/span/text()')
86
            #price1 and price2 determine the range
59
            prices = hxs.select('//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()')
87
            #NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
-
 
88
            NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
-
 
89
            prices = hxs.select(NAAPTOL_XPATH2)
60
            try:
90
            try:
61
                price1 = prices.extract()[0]
91
                price1 = prices.extract()[0]
62
                price1 = price1.decode("utf-8")
92
                #price1 = price1.decode("utf-8")
63
                price1 = price1.strip()
93
                price1 = price1.strip()
64
            except:
94
            except:
65
                price1 = ""
95
                price1 = ""
66
            
96
            
67
            try:
97
            try:
68
                price2 = prices.extract()[1]
98
                price2 = prices.extract()[1]
69
                price2 = price2.decode("utf-8")
99
                #price2 = price2.decode("utf-8")
70
                price2 = price2.strip()
100
                price2 = price2.strip()
71
            except:
101
            except:
72
                price2 = ""
102
                price2 = ""  
73
            
-
 
74
            try:
103
            try:
75
                if price1 == "" and price2 == "":
104
                if price1 == "" and price2 == "":
76
                    prices = hxs.select('//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()')
105
                    #NAAPTOL_XPATH3 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/script/text()'
-
 
106
                    NAAPTOL_XPATH3 = get_code_word("NAAPTOL_XPATH3")
-
 
107
                    prices = hxs.select(NAAPTOL_XPATH3)
77
                    price = str(prices.extract()[0])
108
                    price = str(prices.extract()[0])
78
                    pos1 = price.find("'")
109
                    pos1 = price.find("'")
79
                    pos2 = price.find("'",pos1+1,len(price))
110
                    pos2 = price.find("'",pos1+1,len(price))
80
                    price1 = price[pos1+1:pos2] + "(approx)" 
111
                    price1 = price[pos1+1:pos2] + "(approx)" 
81
                    price2 = ""
112
                    price2 = ""
82
            except:
113
            except:
83
                price1 = price2 = ""
114
                price1 = price2 = ""
-
 
115
            #removelist is used for converting price to decimal format containing only numbers and '.'    
84
                
116
                
85
            if price1 != '':
117
            if price1 != '':
-
 
118
                for r in NAAPTOL_REMOVELIST: 
86
                price1 = price1.replace("Rs.", "")
119
                    while price1.find(r) != -1:
87
                price1 = price1.replace(",", "")
120
                        price1 = price1.replace(r, "")
88
                price1 = price1.strip()
121
                price1 = price1.strip()
89
            if price2 != '':        
122
            if price2 != '':        
-
 
123
                for r in NAAPTOL_REMOVELIST: 
90
                price2 = price2.replace("Rs.", "")
124
                    while price2.find(r) != -1:
91
                price2 = price2.replace(",", "")
125
                        price2 = price2.replace(r, "")
92
                price2 = price2.strip()
126
                price2 = price2.strip()
93
            
127
            
94
            if price1 == "Rates Not Available":
128
            if price1 == "Rates Not Available":
95
                price1 = price2 = ""
129
                price1 = price2 = ""
96
            print name
130
            
97
            print price1
-
 
98
            print price2
131
            #range = price1 to price2
99
            print "\n"
-
 
100
            range = price1
132
            range = price1
101
            if price2 != "":
133
            if price2 != "":
102
                range = str(range) + " to " 
134
                range = str(range) + " to " 
103
                range = range + str(price2) 
135
                range = range + str(price2) 
-
 
136
                
104
            da.add_new_naaptolphone(name, range)
137
            da.add_new_naaptolphone(name, str(range))
105
            
138
            
106
            
139
            
107
            OnlineSellers_pricelist = []
140
            OnlineSellers_pricelist = []
108
            OnlineSellers_namelist = []
141
            OnlineSellers_namelist = []
109
            try:
142
            try:
-
 
143
                #ct1 holds the count of online sellers
110
                ct1 = hxs.select('//div[@id="OnlineSellers"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()')
144
                #NAAPTOL_XPATH4 = '//div[@id="OnlineSellers"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
-
 
145
                NAAPTOL_XPATH4 = get_code_word("NAAPTOL_XPATH4")
-
 
146
                ct1 = hxs.select(NAAPTOL_XPATH4)
111
                ct1 = str(ct1.extract()[0])
147
                ct1 = str(ct1.extract()[0])
112
                ct1 = ct1.decode("utf-8")
148
                ct1 = ct1.decode("utf-8")
113
                ct1 = ct1.strip()
149
                ct1 = ct1.strip()
114
                ps1 = ct1.find(" ")
150
                ps1 = ct1.find(" ")
115
                ct1 = ct1[0:ps1]
151
                ct1 = ct1[0:ps1]
116
                ct1 = int(ct1)
152
                ct1 = int(ct1)
117
            except:
153
            except:
118
                ct1 = 0
154
                ct1 = 0
119
            ct = ct1
155
            ct = ct1
120
            i = 0
156
            i = 0
121
            os_info = hxs.select('//div[@id="onSellerContents"]//td[@class="price"]')
157
            #NAAPTOL_XPATH5 = '//div[@id="onSellerContents"]//td[@class="price"]'
-
 
158
            NAAPTOL_XPATH5 = get_code_word("NAAPTOL_XPATH5")
122
            #print len(os_info)
159
            os_info = hxs.select(NAAPTOL_XPATH5)
123
            while ct > 0:
160
            while ct > 0:
124
                os = os_info[i].extract()
161
                os = os_info[i].extract()
125
                ps1 = os.find(">")
162
                ps1 = os.find(">")
126
                ps2 = os.find("<",ps1)
163
                ps2 = os.find("<",ps1)
127
                os = os[ps1+1:ps2]
164
                os = os[ps1+1:ps2]
-
 
165
                
128
                os = os.replace("Rs.", "")
166
                if os != '':        
-
 
167
                    for r in NAAPTOL_REMOVELIST: 
-
 
168
                        while os.find(r) != -1:
129
                os = os.replace(",", "")
169
                            os = os.replace(r, "")
130
                os = urllib.unquote(os)
170
                os = urllib.unquote(os)
131
                try:
171
                try:
132
                    os = int(os)
172
                    os = int(os)
133
                except:
173
                except:
-
 
174
                    #price is stored in a different format from the one tried above
134
                    os = os_info[i].extract()
175
                    os = os_info[i].extract()
135
                    ps1 = os.find(">",ps2)
176
                    ps1 = os.find(">",ps2)
136
                    ps2 = os.find("<",ps1)
177
                    ps2 = os.find("<",ps1)
137
                    os = os[ps1+1:ps2]
178
                    os = os[ps1+1:ps2]
138
                    os = os.replace("Rs.", "")
179
                    if os != '':        
-
 
180
                        for r in NAAPTOL_REMOVELIST: 
-
 
181
                            while os.find(r) != -1:
139
                    os = os.replace(",", "")
182
                                os = os.replace(r, "")
140
                    os = urllib.unquote(os)
183
                    os = urllib.unquote(os)
141
                    os = int(os) 
184
                    os = int(os) 
142
                print os
185
                
143
                OnlineSellers_pricelist.append(os)
186
                OnlineSellers_pricelist.append(os)
144
                
187
                
145
                path = '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'
188
                #NAAPTOL_XPATH6 = '//div[@id="onSellerContents"]//tr[@class="DottedBorder"]/td/a[@id="storeInfoPop'  
-
 
189
                NAAPTOL_XPATH6 = get_code_word("NAAPTOL_XPATH6")
-
 
190
                #NAAPTOL_XPATH7 = '"]/span/text()'
-
 
191
                NAAPTOL_XPATH7 = get_code_word("NAAPTOL_XPATH7")
146
                path = path + str(i) 
192
                NAAPTOL_XPATH6 = NAAPTOL_XPATH6 + str(i) 
-
 
193
                NAAPTOL_XPATH6 = NAAPTOL_XPATH6 + NAAPTOL_XPATH7 
147
                path = path + '"]/span/text()'
194
                path = NAAPTOL_XPATH6
148
                osname = hxs.select(path)
195
                osname = hxs.select(path)
149
                #print len(osname)
-
 
150
                osname = osname.extract()[0]
196
                osname = osname.extract()[0]
-
 
197
                osname = unescape(osname)
151
                osname = urllib.unquote(osname)
198
                osname = urllib.unquote(osname)
152
                OnlineSellers_namelist.append(osname)
199
                OnlineSellers_namelist.append(osname)
153
                print osname
-
 
154
                i = i+1
200
                i = i+1
155
                ct = ct-1
201
                ct = ct-1
156
                
202
                
157
            l = len(OnlineSellers_pricelist)
203
            l = len(OnlineSellers_pricelist)
158
            i = 0
204
            i = 0 
159
            
-
 
160
            nid = da.get_naaptolphone(name,range).id
205
            nid = da.get_naaptolphone(name,range).id
161
            while l > 0:
206
            while l > 0:
162
                da.add_new_ntonlinesp(nid, OnlineSellers_namelist[i], OnlineSellers_pricelist[i])
207
                da.add_new_ntonlinesp(nid, OnlineSellers_namelist[i], OnlineSellers_pricelist[i])
163
                #print OnlineSellers_list[i]
-
 
164
                i = i+1
208
                i = i+1
165
                l = l-1
209
                l = l-1
166
            
210
            
167
            LocalSellers_pricelist = []   
211
            LocalSellers_pricelist = []   
168
            LocalSellers_namelist = []
212
            LocalSellers_namelist = []
169
            try:
213
            try:
-
 
214
                #ct1 holds the count of local (offline) sellers
170
                ct1 = hxs.select('//div[@id="LocalStores"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()')
215
                #NAAPTOL_XPATH8 = '//div[@id="LocalStores"]//div[@class="ProductResultHead"]//div[@class="headingstyle"]/text()'
-
 
216
                NAAPTOL_XPATH8 = get_code_word("NAAPTOL_XPATH8")
-
 
217
                ct1 = hxs.select(NAAPTOL_XPATH8)
171
                ct1 = str(ct1.extract()[0])
218
                ct1 = str(ct1.extract()[0])
172
                ct1 = ct1.decode("utf-8")
219
                ct1 = ct1.decode("utf-8")
173
                ct1 = ct1.strip()
220
                ct1 = ct1.strip()
174
                ps1 = ct1.find(" ")
221
                ps1 = ct1.find(" ")
175
                ct1 = ct1[0:ps1]
222
                ct1 = ct1[0:ps1]
176
                ct1 = int(ct1)
223
                ct1 = int(ct1)
177
            except:
224
            except:
178
                ct1 = 0
225
                ct1 = 0
179
            ct = ct1
226
            ct = ct1
180
            i = 0
227
            i = 0
181
            os_info = hxs.select('//div[@id="offSellerContents"]//td[@class="price"]')
228
            #NAAPTOL_XPATH9 = '//div[@id="offSellerContents"]//td[@class="price"]'
-
 
229
            NAAPTOL_XPATH9 = get_code_word("NAAPTOL_XPATH9")
182
            os_names = hxs.select('//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()')
230
            #NAAPTOL_XPATH10 = '//div[@id="offSellerContents"]//span[@class="LocalStoreHeading"]/text()'
-
 
231
            NAAPTOL_XPATH10 = get_code_word("NAAPTOL_XPATH10")
183
            #print len(os_info)
232
            os_info = hxs.select(NAAPTOL_XPATH9)
-
 
233
            os_names = hxs.select(NAAPTOL_XPATH10)
-
 
234
            
184
            while ct > 0:
235
            while ct > 0:
185
                os = os_info[i].extract()
236
                os = os_info[i].extract()
186
                osname = os_names[i].extract() 
237
                osname = os_names[i].extract() 
187
                #os = os.encode("utf-8")
-
 
188
                ps1 = os.find(">")
238
                ps1 = os.find(">")
189
                ps2 = os.find("<",ps1)
239
                ps2 = os.find("<",ps1)
190
                os = os[ps1+1:ps2]
240
                os = os[ps1+1:ps2]
191
                os = os.replace("Rs.", "")
241
                if os != '':        
-
 
242
                    for r in NAAPTOL_REMOVELIST: 
-
 
243
                        while os.find(r) != -1:
192
                os = os.replace(",", "")
244
                            os = os.replace(r, "")
193
                os = urllib.unquote(os)
245
                os = urllib.unquote(os)
194
                osname = urllib.unquote(osname)
246
                osname = urllib.unquote(osname)
-
 
247
                osname = unescape(osname)
195
                try:
248
                try:
196
                    os = int(os)
249
                    os = int(os)
197
                except:
250
                except:
-
 
251
                    #price is stored in a different format from the one tried above
198
                    os = os_info[i].extract()
252
                    os = os_info[i].extract()
199
                    ps1 = os.find(">",ps2)
253
                    ps1 = os.find(">",ps2)
200
                    ps2 = os.find("<",ps1)
254
                    ps2 = os.find("<",ps1)
201
                    os = os[ps1+1:ps2]
255
                    os = os[ps1+1:ps2]
202
                    os = os.replace("Rs.", "")
256
                    if os != '':        
-
 
257
                        for r in NAAPTOL_REMOVELIST: 
-
 
258
                            while os.find(r) != -1:
203
                    os = os.replace(",", "")
259
                                os = os.replace(r, "")
204
                    os = urllib.unquote(os)
260
                    os = urllib.unquote(os)
205
                    os = int(os)        
261
                    os = int(os)        
206
                print os
-
 
207
                print osname
-
 
208
                LocalSellers_pricelist.append(os)
262
                LocalSellers_pricelist.append(os)
209
                LocalSellers_namelist.append(osname)
263
                LocalSellers_namelist.append(osname)
210
                i = i+1
264
                i = i+1
211
                ct = ct-1
265
                ct = ct-1
212
                
266
                
213
            l = len(LocalSellers_pricelist)
267
            l = len(LocalSellers_pricelist)
214
            i = 0
268
            i = 0
215
            
-
 
216
            nid = da.get_naaptolphone(name,range).id
269
            nid = da.get_naaptolphone(name,range).id
217
            while l > 0:
270
            while l > 0:
218
                da.add_new_ntofflinesp(nid, LocalSellers_namelist[i], LocalSellers_pricelist[i])
271
                da.add_new_ntofflinesp(nid, LocalSellers_namelist[i], LocalSellers_pricelist[i])
219
                i = i+1
272
                i = i+1
220
                l = l-1
273
                l = l-1
221
            
274
            
222
                    
-
 
223
            '''
-
 
224
            l = len(OnlineSellers_list)
-
 
225
            i = 0
-
 
226
            while l > 0:
-
 
227
                #print OnlineSellers_list[i]
-
 
228
                i = i+1
-
 
229
                l = l-1 
-
 
230
            '''    
-
 
231
            
-
 
232
            f.close()
-
 
233
            #del DataHelper
-
 
234
                   
-
 
235
        '''    
-
 
236
            site = response.url
-
 
237
            vatplustax = 0
-
 
238
            pos1 = pos2 = 0
-
 
239
            temp = ""
-
 
240
            pos1 = site.rfind('/')
-
 
241
            if pos1 != -1:
-
 
242
                temp = site[pos1+1:len(site)]
-
 
243
                #pos2 = site.rfind('/',0,pos1-1)
-
 
244
            #if pos2 > 0:
-
 
245
                #temp = site[pos2+1:len(site)]
-
 
246
            pos3 = temp.find('.')
-
 
247
            temp1 = temp[pos3:len(temp)]
-
 
248
            name = temp.replace(temp1,"")         
-
 
249
            hxs = HtmlXPathSelector(response)
-
 
250
            prices = hxs.select('//div[@id ="priceComp"]//tr[2]/td[3]/span/text()')
-
 
251
            
-
 
252
            da = DataHelper()
-
 
253
            for price in prices:
-
 
254
                 name = str(name).strip()
-
 
255
                 price = price.extract()
-
 
256
                 price = str(price).strip()
-
 
257
                 price = price.replace("Rs", "")
-
 
258
                 price = price.replace("/", "")
-
 
259
                 price = price.replace("-", "")
-
 
260
                 price = price.replace(".", "")
-
 
261
                 shown_pr = int(price)
-
 
262
                 final_pr = shown_pr + vatplustax
-
 
263
                 da.add_new_mobstorephone(name,shown_pr,final_pr)
-
 
264
                 print name
-
 
265
                 print final_pr
-
 
266
                 print "\n"
-
 
267
              
-
 
268
            for i in items:
-
 
269
                str1 = str(i['title']).strip() 
-
 
270
                print str1
-
 
271
                amnt = i['price'].replace(",","")
-
 
272
                amnt = amnt.replace("Rs", "")
-
 
273
                amnt = amnt.replace("/", "")
-
 
274
                amnt = amnt.replace("-", "")
-
 
275
                amnt = amnt.strip()
-
 
276
                vatplustax = 4*int(amnt)/100
-
 
277
                pr = int(amnt) + vatplustax 
-
 
278
                #print pr
-
 
279
                da.add_new_univerphone(str1,amnt,pr)
-
 
280
            '''        
-
 
281
SPIDER = naaptol_price()
275
SPIDER = naaptol_price()
282
 
276