Subversion Repositories SmartDukaan

Rev

Rev 180 | Rev 258 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 180 Rev 227
Line 13... Line 13...
13
from scrapy.http.headers import Headers
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
15
from scrapy.log import msg
16
from scrapy.http.response import Response
16
from scrapy.http.response import Response
17
 
17
 
18
from datastore import DataAccessor
18
from datastore.DataAccessor import *
19
from datastore.DataAccessor import DataHelper
19
from datastore.DataCodeAccessor import *
20
import urllib
20
import urllib
21
from xml.dom import INDEX_SIZE_ERR
21
from xml.dom import INDEX_SIZE_ERR
-
 
22
from html2text.unescaping import *
22
 
23
 
23
 
24
 
24
class indiaplaza_extra(BaseSpider):
25
class indiaplaza_extra(BaseSpider):
25
    
26
    
26
    def __init__(self):
27
    def __init__(self):
-
 
28
       initialize_table()
-
 
29
       #INDIAPLAZA_DOMAINNAME1 = "indiaplaza1" 
-
 
30
       INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")  
27
       self.domain_name = "indiaplazaextrainfo"
31
       self.domain_name = INDIAPLAZA_DOMAINNAME1
-
 
32
       
-
 
33
        # get urls from the database and append them in the list for crawling
28
       da = DataHelper()
34
       da = DataHelper()
29
       for pitem in da.get_all_ipbasic():
35
       for pitem in da.get_all_ipbasic():
30
            self.start_urls.append(pitem.v_site.strip())
36
            self.start_urls.append(pitem.v_site.strip())
31
    
37
    
32
    def start_requests(self):
38
    def start_requests(self):
33
        listreq = []
39
        listreq = []
-
 
40
        #for each request a referer has to be set
-
 
41
        #INDIAPLAZA_REFERER = "www.google.com/search"
-
 
42
        INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
34
        for url1 in self.start_urls:
43
        for url1 in self.start_urls:
35
            request = Request(url = url1, callback=self.parse)
44
            request = Request(url = str(url1), callback=self.parse)
36
            request.headers.setdefault("Referer", "www.google.com/search")
45
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
37
            listreq.append(request)
46
            listreq.append(request)
38
        return listreq
47
        return listreq
39
       
48
        
40
    def parse(self, response):
49
    def parse(self, response):
41
        hxs = HtmlXPathSelector(response)
50
        hxs = HtmlXPathSelector(response)
42
        #sites = hxs.select('//td[@class="gray-border"]')
51
        #INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
43
        #msg(response.url)
52
        #List separated by ';'
-
 
53
        INDIAPLAZA_REMOVELIST = get_code_word("INDIAPLAZA_REMOVELIST")
44
        #print(len(sites))
54
        INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';') 
45
        name = hxs.select('.//div[@class="finDetHdr"]/h1/text()')[0].extract()
55
        #INDIAPLAZA_XPATH4 = './/div[@class="finDetHdr"]/h1/text()' 
-
 
56
        INDIAPLAZA_XPATH4 = get_code_word("INDIAPLAZA_XPATH4")
46
        price = hxs.select('.//div[@class="priceArea"]/span[1]/text()')[0].extract()
57
        #INDIAPLAZA_XPATH5 = './/div[@class="priceArea"]/span[1]/text()'
-
 
58
        INDIAPLAZA_XPATH5 = get_code_word("INDIAPLAZA_XPATH5")
-
 
59
        #INDIAPLAZA_XPATH6 = './/div[@class="priceArea"]/div[@class="row"][2]/text()'
-
 
60
        INDIAPLAZA_XPATH6 = get_code_word("INDIAPLAZA_XPATH6")
-
 
61
        #INDIAPLAZA_XPATH7 = './/div[@class="priceArea"]/div[@class="row"][2]/span/text()'
-
 
62
        INDIAPLAZA_XPATH7 = get_code_word("INDIAPLAZA_XPATH7")
-
 
63
        #INDIAPLAZA_XPATH8 = './/div[@class="priceArea"]/div[@class="row"][3]/text()'
-
 
64
        INDIAPLAZA_XPATH8 = get_code_word("INDIAPLAZA_XPATH8")
-
 
65
        #INDIAPLAZA_XPATH9 = './/div[@class="priceArea"]/div[@class="row"][4]/text()'
-
 
66
        INDIAPLAZA_XPATH9 = get_code_word("INDIAPLAZA_XPATH9")
-
 
67
        #INDIAPLAZA_XPATH10 = './/div[@class="priceArea"]/div[@class="row"][1]/text()'
-
 
68
        INDIAPLAZA_XPATH10 = get_code_word("INDIAPLAZA_XPATH10")
-
 
69
        name = hxs.select(INDIAPLAZA_XPATH4)[0].extract()
-
 
70
        name = unescape(name)
-
 
71
        price = hxs.select(INDIAPLAZA_XPATH5)[0].extract()
47
        try:
72
        try:
48
            ship_price = hxs.select('.//div[@class="priceArea"]/div[@class="row"][2]/text()')[0].extract()
73
            ship_price = hxs.select(INDIAPLAZA_XPATH6)[0].extract()
49
        except IndexError:
74
        except IndexError:
50
            ship_price = hxs.select('.//div[@class="priceArea"]/div[@class="row"][2]/span/text()')[0].extract()
75
            ship_price = hxs.select(INDIAPLAZA_XPATH7)[0].extract()
51
        try:
76
        try:
52
            guarantee_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][3]/text()')[0].extract()
77
            guarantee_info = hxs.select(INDIAPLAZA_XPATH8)[0].extract()
53
        except IndexError:
78
        except IndexError:
54
            guarantee_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][4]/text()')[0].extract()
79
            guarantee_info = hxs.select(INDIAPLAZA_XPATH9)[0].extract()
55
        ship_info = hxs.select('.//div[@class="priceArea"]/div[@class="row"][1]/text()')[0].extract() 
80
        ship_info = hxs.select(INDIAPLAZA_XPATH10)[0].extract() 
56
        
81
        
57
        urllib.unquote(name)
82
        urllib.unquote(name)
58
        urllib.unquote(price)
83
        urllib.unquote(price)
59
        urllib.unquote(ship_price)
84
        urllib.unquote(ship_price)
60
        urllib.unquote(guarantee_info)
85
        urllib.unquote(guarantee_info)
61
        urllib.unquote(ship_info)
86
        urllib.unquote(ship_info)
-
 
87
        
-
 
88
        #INDIAPLAZA_VAR1 = "Free shipping" 
-
 
89
        INDIAPLAZA_VAR1 = get_code_word("INDIAPLAZA_VAR1")
62
        if ship_price == "Free shipping" :
90
        if ship_price == INDIAPLAZA_VAR1:
63
            ship_price = "0"
91
            ship_price = "0"
64
        else :
92
        else :
-
 
93
            if ship_price != '':        
-
 
94
                for r in INDIAPLAZA_REMOVELIST: 
-
 
95
                    while ship_price.find(r) != -1:
65
            ship_price = ship_price.replace("Rs.","")
96
                        ship_price = ship_price.replace(r, "")
-
 
97
        if price != '':        
-
 
98
                for r in INDIAPLAZA_REMOVELIST: 
-
 
99
                    while price.find(r) != -1:
-
 
100
                        price = price.replace(r, "")
66
                
101
                
67
        price = price.replace("Rs.","")
-
 
68
        
102
        
69
        name = name.strip()
103
        name = name.strip()
70
        price = price.strip()
104
        price = price.strip()
71
        ship_price = ship_price.strip()
105
        ship_price = ship_price.strip()
72
        guarantee_info = guarantee_info.strip()
106
        guarantee_info = guarantee_info.strip()
73
        ship_info = ship_info.strip()
107
        ship_info = ship_info.strip()
74
        
108
        
75
        shown_pr = int(price)
109
        shown_pr = int(price)
76
        final_pr = shown_pr + int(ship_price)
110
        final_pr = shown_pr + int(ship_price)
77
        print name
111
         
78
        print shown_pr
-
 
79
        print final_pr
-
 
80
        print guarantee_info
-
 
81
        print ship_info
-
 
82
        da = DataHelper()
112
        da = DataHelper()
83
        da.add_ipextra(name,shown_pr,final_pr,guarantee_info,ship_info)
113
        da.add_ipextra(name,shown_pr,final_pr,guarantee_info,ship_info)        
84
    '''
-
 
85
        for site in sites:
-
 
86
            item = {}
-
 
87
            #tmp = site.select('.//tr[2]/td/a/text()')
-
 
88
            item['name'] = response.select('.//div[@class="finDetHdr"]/h1/text()')[0].extract()
-
 
89
            #psite = site.select(".//a[3][@href]/@href")[0].extract()
-
 
90
            item['price'] =site.select('.//tr[3]/th/label/text()')[0].extract()
-
 
91
            items.append(item)
-
 
92
        
-
 
93
              
-
 
94
        for i in items:
-
 
95
            str1 = str(i['title']).strip() 
-
 
96
            print str1
-
 
97
            amnt = i['price'].replace(",","")
-
 
98
            amnt = amnt.replace("Rs", "")
-
 
99
            amnt = amnt.replace("/", "")
-
 
100
            amnt = amnt.replace("-", "")
-
 
101
            amnt = amnt.strip()
-
 
102
            pr = int(amnt) + vatplustax 
-
 
103
            #print pr
-
 
104
            da.add_new_univerphone(str1,amnt,pr) 
-
 
105
        '''    
-
 
106
        #lt = len(da.get_all_phones())  
-
 
107
        #print "length" + str(lt)
-
 
108
        #for ph in da.get_all_phones():
-
 
109
         #   print ph
-
 
110
            
-
 
111
        #f = open('/home/gaurav/twopassscrapy/pricelinks.txt', 'w')
-
 
112
        #for i in items:
-
 
113
            #f.write(i['title'])
-
 
114
            #f.write("\n")
-
 
115
            #f.write(i['link'])
-
 
116
            #f.write("\n")
-
 
117
        #f.close()    
-
 
118
        
-
 
119
SPIDER = indiaplaza_extra()
114
SPIDER = indiaplaza_extra()
120
 
115