Subversion Repositories SmartDukaan

Rev

Rev 237 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 237 Rev 263
Line 20... Line 20...
20
 
20
 
21
import urllib
21
import urllib
22
from html2text.unescaping import *
22
from html2text.unescaping import *
23
 
23
 
24
class naaptol_price2(BaseSpider):
24
class naaptol_price2(BaseSpider):
-
 
25
    """
-
 
26
    Documentation for class naaptol_price2
-
 
27
    Spider collects the information for the individual phones and stores it in the table
-
 
28
    datastore_datadefinition_naaptol_phones   
25
    
29
    """
26
    def __init__(self):
30
    def __init__(self):
27
       
31
       """
-
 
32
        Documentation for constructor
-
 
33
        initialize_table is called to make all the tables known in
-
 
34
        the scope of this class.
-
 
35
        Also, the start URLs need to be fed to the spider through start_urls.append
-
 
36
        Domainname2 is the name by which this spider is known outside
-
 
37
        So this will be used as an argument for calling this spider 
-
 
38
       """ 
28
       initialize_table()
39
       initialize_table()
29
       #NAAPTOL_DOMAINNAME2 = "naaptol2"   
40
       #NAAPTOL_DOMAINNAME2 = "naaptol2"   
30
       NAAPTOL_DOMAINNAME2 = get_code_word("NAAPTOL_DOMAINNAME2")
41
       NAAPTOL_DOMAINNAME2 = get_code_word("NAAPTOL_DOMAINNAME2")
31
       self.domain_name = NAAPTOL_DOMAINNAME2 
42
       self.domain_name = NAAPTOL_DOMAINNAME2 
32
       
43
       
Line 34... Line 45...
34
       da = DataHelper()
45
       da = DataHelper()
35
       for pitem in da.get_allmorenaaptolurls():
46
       for pitem in da.get_allmorenaaptolurls():
36
            self.start_urls.append(pitem.url.strip())
47
            self.start_urls.append(pitem.url.strip())
37
    
48
    
38
    def start_requests(self):
49
    def start_requests(self):
-
 
50
        """
-
 
51
        Documentation for method start_requests
-
 
52
        To set various properties of the request to be made
-
 
53
        like referer, headers and all.
-
 
54
        @return a list of well formed requests which will be 
-
 
55
        crawled by spider and spider will return the response
-
 
56
        """
39
        #for each request a referer has to be set
57
        #for each request a referer has to be set
40
        listreq = []
58
        listreq = []
41
        #NAAPTOL_REFERER = "http://www.google.com"
59
        #NAAPTOL_REFERER = "http://www.google.com"
42
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
60
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
43
        for url1 in self.start_urls:
61
        for url1 in self.start_urls:
Line 46... Line 64...
46
            listreq.append(request)
64
            listreq.append(request)
47
        return listreq
65
        return listreq
48
    
66
    
49
       
67
       
50
    def parse(self, response): 
68
    def parse(self, response): 
-
 
69
        """
-
 
70
        Documentation for method parse
-
 
71
        @param response of individual requests
-
 
72
        Using Xpaths needed information is extracted out of the response
-
 
73
        and added to the database
-
 
74
        Xpath2 = Give us price-range for individual phone
-
 
75
        Xpath3 = Give us price-range for individual phone, if unable to retrieve from xpath2
-
 
76
        Xpath4 = Give us number of onlinesellers for a particular phone
-
 
77
        Xpath5 = Give us price for a particular phone offered by onlinesellers
-
 
78
        Xpath6 and Xpath7 = Give us name of onlinesellers for a particular phone 
-
 
79
        Xpath8 = Give us number of offlinesellers for a particular phone
-
 
80
        Xpath9 = Give us price for a particular phone offered by offlinesellers
-
 
81
        Xpath10 = Give us name of offlinesellers for a particular phone
-
 
82
        Removelist = To filter the prices so as to make them integers, e.g. remove ',' or 'Rs'
-
 
83
        """
51
        da = DataHelper()
84
        da = DataHelper()
52
        #NAAPTOL_REMOVELIST = ["Rs.",","]
85
        #NAAPTOL_REMOVELIST = ["Rs.",","]
53
        #list separated by ';'
86
        #list separated by ';'
54
        NAAPTOL_REMOVELIST = get_code_word("NAAPTOL_REMOVELIST")
87
        NAAPTOL_REMOVELIST = get_code_word("NAAPTOL_REMOVELIST")
55
        NAAPTOL_REMOVELIST = NAAPTOL_REMOVELIST.split(';')
88
        NAAPTOL_REMOVELIST = NAAPTOL_REMOVELIST.split(';')
Line 67... Line 100...
67
        #NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
100
        #NAAPTOL_XPATH2 = '//table[@class ="ProductDetails"]//td[@class="Price"]/span/text()'
68
        NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
101
        NAAPTOL_XPATH2 = get_code_word("NAAPTOL_XPATH2")
69
        prices = hxs.select(NAAPTOL_XPATH2)
102
        prices = hxs.select(NAAPTOL_XPATH2)
70
        try:
103
        try:
71
            price1 = prices.extract()[0]
104
            price1 = prices.extract()[0]
72
            #price1 = price1.decode("utf-8")
-
 
73
            price1 = price1.strip()
105
            price1 = price1.strip()
74
        except:
106
        except:
75
            price1 = ""
107
            price1 = ""
76
        
108
        
77
        try:
109
        try:
78
            price2 = prices.extract()[1]
110
            price2 = prices.extract()[1]
79
            #price2 = price2.decode("utf-8")
-
 
80
            price2 = price2.strip()
111
            price2 = price2.strip()
81
        except:
112
        except:
82
            price2 = ""
113
            price2 = ""
83
        
114
        
84
        try:
115
        try: