Subversion Repositories SmartDukaan

Rev

Rev 235 | Rev 271 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 235 Rev 261
Line 19... Line 19...
19
from datastore.DataAccessor import *
19
from datastore.DataAccessor import *
20
 
20
 
21
from html2text.unescaping import *
21
from html2text.unescaping import *
22
 
22
 
23
class naaptol_spider(BaseSpider):
23
class naaptol_spider(BaseSpider):
-
 
24
    """
-
 
25
    Documentation for class naaptol_spider
-
 
26
    This spider collects the URLs of the individual phones
-
 
27
    and stores them in the table datastore_datadefinition_naaptol_urls.
24
   
28
    """
25
    def __init__(self):
29
    def __init__(self):
-
 
30
       """
-
 
31
        Documentation for constructor
-
 
32
        initialize_table is called to make all the tables known in
-
 
33
        the scope of this class.
-
 
34
        Also, the start URL needs to be fed to the spider through start_urls.append
-
 
35
        domain_name is the name by which this spider is known outside.
-
 
36
        So this will be used as an argument for calling this spider 
-
 
37
       """
26
       initialize_table() 
38
       initialize_table() 
27
       #NAAPTOL_DOMAINNAME = "naaptol"   
39
       #NAAPTOL_DOMAINNAME = "naaptol"   
28
       NAAPTOL_DOMAINNAME = get_code_word("NAAPTOL_DOMAINNAME")
40
       NAAPTOL_DOMAINNAME = get_code_word("NAAPTOL_DOMAINNAME")
29
       self.domain_name = NAAPTOL_DOMAINNAME 
41
       self.domain_name = NAAPTOL_DOMAINNAME 
30
       #NAAPTOL_URL = "http://www.naaptol.com/sitemap.xml"
42
       #NAAPTOL_URL = "http://www.naaptol.com/sitemap.xml"
31
       NAAPTOL_URL = get_code_word("NAAPTOL_URL")
43
       NAAPTOL_URL = get_code_word("NAAPTOL_URL")
32
       self.start_urls.append(NAAPTOL_URL)
44
       self.start_urls.append(NAAPTOL_URL)
33
    
45
    
34
    
46
    
35
    def start_requests(self):
47
    def start_requests(self):
-
 
48
        """
-
 
49
        Documentation for method start_requests
-
 
50
        To set various properties of the request to be made
-
 
51
        like referer, headers and all.
-
 
52
        Also, the supplier's entry needs to be added to the table
-
 
53
        datastore_datadefinition_suppliers.
-
 
54
        @return a list of well formed requests which will be 
-
 
55
        crawled by the spider, and the spider will return the response
-
 
56
       """
36
        #adding entry for the supplier i.e its name and site
57
        #adding entry for the supplier i.e its name and site
37
        #NAAPTOL_HOMEPAGE = "http://www.naaptol.com"
58
        #NAAPTOL_HOMEPAGE = "http://www.naaptol.com"
38
        NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")
59
        NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")
39
        da = DataHelper()
60
        da = DataHelper()
40
        da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)
61
        da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)
Line 48... Line 69...
48
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
69
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
49
            listreq.append(request)
70
            listreq.append(request)
50
        return listreq
71
        return listreq
51
    
72
    
52
    def parse(self, response):
73
    def parse(self, response):
-
 
74
        """
-
 
75
        Documentation for method parse
-
 
76
        @param response of individual requests
-
 
77
        Using XPaths, the needed information is extracted from the response
-
 
78
        and added to the database
-
 
79
        Xpath1 = gives us the URLs of the individual phones
-
 
80
        chklist1 = elements in chk_list are specific to this site for determining valid sites
-
 
81
        """
53
        da = DataHelper()
82
        da = DataHelper()
54
        hxs = HtmlXPathSelector(response)
83
        hxs = HtmlXPathSelector(response)
55
        #NAAPTOL_XPATH1 = '//url/loc/text()'
84
        #NAAPTOL_XPATH1 = '//url/loc/text()'
56
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
85
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
57
        phone_urls = hxs.select(NAAPTOL_XPATH1)
86
        phone_urls = hxs.select(NAAPTOL_XPATH1)