Subversion Repositories SmartDukaan

Rev

Rev 227 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 227 Rev 258
Line 21... Line 21...
21
from xml.dom import INDEX_SIZE_ERR
21
from xml.dom import INDEX_SIZE_ERR
22
from html2text.unescaping import *
22
from html2text.unescaping import *
23
 
23
 
24
 
24
 
25
class indiaplaza_extra(BaseSpider):
25
class indiaplaza_extra(BaseSpider):
-
 
26
    """
-
 
27
    Documentation for class indiaplaza_extra
-
 
28
    This spider collects all the information for the individual phones
-
 
29
    and stores it in the table datastore_datadefinition_indiaplaza_items.
26
    
30
    """
27
    def __init__(self):
31
    def __init__(self):
-
 
32
       """
-
 
33
        Documentation for constructor
-
 
34
        initialize_table is called to make all the tables known in
-
 
35
        the scope of this class.
-
 
36
        The start URLs also need to be fed to the spider through start_urls.append.
-
 
37
        Domainname1 is the name by which this spider is known externally.
-
 
38
        So this will be used as an argument for calling this spider 
-
 
39
       """ 
28
       initialize_table()
40
       initialize_table()
29
       #INDIAPLAZA_DOMAINNAME1 = "indiaplaza1" 
41
       #INDIAPLAZA_DOMAINNAME1 = "indiaplaza1" 
30
       INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")  
42
       INDIAPLAZA_DOMAINNAME1 = get_code_word("INDIAPLAZA_DOMAINNAME1")  
31
       self.domain_name = INDIAPLAZA_DOMAINNAME1
43
       self.domain_name = INDIAPLAZA_DOMAINNAME1
32
       
44
       
Line 34... Line 46...
34
       da = DataHelper()
46
       da = DataHelper()
35
       for pitem in da.get_all_ipbasic():
47
       for pitem in da.get_all_ipbasic():
36
            self.start_urls.append(pitem.v_site.strip())
48
            self.start_urls.append(pitem.v_site.strip())
37
    
49
    
38
    def start_requests(self):
50
    def start_requests(self):
-
 
51
        """
-
 
52
        Documentation for method start_requests
-
 
53
        To set various properties of the request to be made
-
 
54
        such as the referer and other headers.
-
 
55
        @return a list of well formed requests which will be 
-
 
56
        crawled by the spider, which will then return the responses
-
 
57
        """
39
        listreq = []
58
        listreq = []
40
        #for each request a referer has to be set
59
        #for each request a referer has to be set
41
        #INDIAPLAZA_REFERER = "www.google.com/search"
60
        #INDIAPLAZA_REFERER = "www.google.com/search"
42
        INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
61
        INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
43
        for url1 in self.start_urls:
62
        for url1 in self.start_urls:
Line 45... Line 64...
45
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
64
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
46
            listreq.append(request)
65
            listreq.append(request)
47
        return listreq
66
        return listreq
48
        
67
        
49
    def parse(self, response):
68
    def parse(self, response):
-
 
69
        """
-
 
70
        Documentation for method parse
-
 
71
        @param response of individual requests
-
 
72
        Using XPaths, the needed information is extracted from the response
-
 
73
        and added to the database
-
 
74
        Xpath4 = Give us name for individual phone
-
 
75
        Xpath5 = Give us quoted-price for individual phone
-
 
76
        Xpath6 = Give us ship-price for individual phone
-
 
77
        Xpath7 = Gives the ship-price for an individual phone, if not obtainable from Xpath6
-
 
78
        Xpath8 = Give us guarantee-info for individual phone
-
 
79
        Xpath9 = Gives the guarantee-info for an individual phone, if not obtainable from Xpath8
-
 
80
        Xpath10 = Give us ship-info for individual phone
-
 
81
        Removelist = Used to filter the prices so as to make them integers, e.g. by removing ',' or 'Rs'
-
 
82
        """
50
        hxs = HtmlXPathSelector(response)
83
        hxs = HtmlXPathSelector(response)
51
        #INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
84
        #INDIAPLAZA_REMOVELIST = ["Rs.","Rs",",","-","/"]
52
        #List separated by ';'
85
        #List separated by ';'
53
        INDIAPLAZA_REMOVELIST = get_code_word("INDIAPLAZA_REMOVELIST")
86
        INDIAPLAZA_REMOVELIST = get_code_word("INDIAPLAZA_REMOVELIST")
54
        INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';') 
87
        INDIAPLAZA_REMOVELIST = INDIAPLAZA_REMOVELIST.split(';')