Subversion Repositories SmartDukaan

Rev

Rev 223 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 223 Rev 253
Line 16... Line 16...
16
from scrapy.http.response import Response
16
from scrapy.http.response import Response
17
 
17
 
18
from datastore.DataAccessor import *
18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text import *
20
from html2text import *
21
from babel.messages.pofile import unescape
-
 
22
import urllib
21
import urllib
23
 
22
 
24
class babuchak3(BaseSpider):
23
class babuchak3(BaseSpider):
-
 
24
    """
-
 
25
    Documentation for class babuchak3
-
 
26
    This spider collects the information for the individual phones
-
 
27
    and stores it in the table datastore_datadefinition_babuchak_phones.
25
    
28
    """
26
    def __init__(self):
29
    def __init__(self):
-
 
30
        """
-
 
31
        Documentation for constructor
-
 
32
        initialize_table is called to make all the tables known in
-
 
33
        the scope of this class.
-
 
34
        Also, the start URLs need to be fed to the spider through start_urls.append
-
 
35
        Domainname2 is the name by which this spider is known outside
-
 
36
        So this will be used as an argument for calling this spider 
-
 
37
        """
27
        initialize_table()
38
        initialize_table()
28
        #BABUCHAK_DOMAINNAME2 = "babuchak2"   
39
        #BABUCHAK_DOMAINNAME2 = "babuchak2"   
29
        BABUCHAK_DOMAINNAME2 = get_code_word("BABUCHAK_DOMAINNAME2")
40
        BABUCHAK_DOMAINNAME2 = get_code_word("BABUCHAK_DOMAINNAME2")
30
        self.domain_name = BABUCHAK_DOMAINNAME2  
41
        self.domain_name = BABUCHAK_DOMAINNAME2  
31
        da = DataHelper()
42
        da = DataHelper()
32
        for item in da.get_allbabuchakphoneurls():
43
        for item in da.get_allbabuchakphoneurls():
33
            self.start_urls.append(item.url)
44
            self.start_urls.append(item.url)
34
            
45
            
35
    def start_requests(self):
46
    def start_requests(self):
-
 
47
        """
-
 
48
        Documentation for method start_requests
-
 
49
        To set various properties of the request to be made
-
 
50
        like referer, headers and all.
-
 
51
        @return a list of well formed requests which will be 
-
 
52
        crawled by the spider, which will return the responses
-
 
53
        """
36
        listreq = []
54
        listreq = []
37
        #for each request a referer has to be set
55
        #for each request a referer has to be set
38
        #BABUCHAK_REFERER = "www.google.com/search"
56
        #BABUCHAK_REFERER = "www.google.com/search"
39
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
57
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
40
        for url1 in self.start_urls:
58
        for url1 in self.start_urls:
Line 42... Line 60...
42
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
60
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
43
            listreq.append(request)
61
            listreq.append(request)
44
        return listreq
62
        return listreq
45
        
63
        
46
    def parse(self, response):
64
    def parse(self, response):
-
 
65
        """
-
 
66
        Documentation for method parse
-
 
67
        @param response of individual requests
-
 
68
        Using Xpaths needed information is extracted out of the response
47
        #url1 needed to get complete urls
69
        and added to the database
-
 
70
        Xpath5 = Gives us the name of the individual phone
-
 
71
        Xpath6 = Gives us the quoted price of the individual phone
-
 
72
        Xpath7 = Gives us the final price of the individual phone
-
 
73
        Removelist = Used to filter the prices so they can be converted to integers, e.g. by removing ',' or 'Rs'
-
 
74
        """
48
        da = DataHelper()
75
        da = DataHelper()
49
        hxs = HtmlXPathSelector(response)
76
        hxs = HtmlXPathSelector(response)
50
        #BABUCHAK_XPATH5 = '//td[@class="text-header"]/text()'
77
        #BABUCHAK_XPATH5 = '//td[@class="text-header"]/text()'
51
        BABUCHAK_XPATH5 = get_code_word("BABUCHAK_XPATH5")
78
        BABUCHAK_XPATH5 = get_code_word("BABUCHAK_XPATH5")
52
        #BABUCHAK_XPATH6 = '//td[@class="xl63"]//strong/span/text()'
79
        #BABUCHAK_XPATH6 = '//td[@class="xl63"]//strong/span/text()'
Line 84... Line 111...
84
        if ps1 != -1:
111
        if ps1 != -1:
85
            shown_price = shown_price[0:ps1]
112
            shown_price = shown_price[0:ps1]
86
            final_price = shown_price
113
            final_price = shown_price
87
        shown_price = int(shown_price)
114
        shown_price = int(shown_price)
88
        final_price = int(final_price)
115
        final_price = int(final_price)
-
 
116
        
-
 
117
        #Some phones were discounted, so their pages listed a
-
 
118
        #marked price, quoted price and final price
89
        if shown_price>final_price:
119
        if shown_price>final_price:
90
            try:
120
            try:
91
                shown_price = hxs.select(BABUCHAK_XPATH6)[1].extract()
121
                shown_price = hxs.select(BABUCHAK_XPATH6)[1].extract()
92
                final_price = hxs.select(BABUCHAK_XPATH6)[2].extract()
122
                final_price = hxs.select(BABUCHAK_XPATH6)[2].extract()
93
            except:
123
            except:
Line 113... Line 143...
113
                shown_price = shown_price[0:ps1]
143
                shown_price = shown_price[0:ps1]
114
                final_price = shown_price
144
                final_price = shown_price
115
        
145
        
116
            shown_price = int(shown_price)
146
            shown_price = int(shown_price)
117
            final_price = int(final_price)
147
            final_price = int(final_price)
118
            
-
 
119
        print name
-
 
120
        print shown_price
-
 
121
        print final_price
-
 
122
        da.add_babuchakphone(name,shown_price,final_price)        
148
        da.add_babuchakphone(name,shown_price,final_price)        
123
SPIDER = babuchak3()
149
SPIDER = babuchak3()
124
150