Subversion Repositories SmartDukaan

Rev

Rev 226 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 226 Rev 257
Line 21... Line 21...
21
from datastore.DataCodeAccessor import *
21
from datastore.DataCodeAccessor import *
22
 
22
 
23
 
23
 
24
 
24
 
25
class indiaplaza_spider(BaseSpider):
25
class indiaplaza_spider(BaseSpider):
-
 
26
    """
-
 
27
    Documentation for class indiaplaza_spider
-
 
28
    This spider collects the URLs of the individual phones
-
 
29
    and stores them in the table datastore_datadefinition_indiaplaza_data.
26
   
30
    """
27
    def __init__(self):
31
    def __init__(self):
-
 
32
       """
-
 
33
        Documentation for constructor
-
 
34
        initialize_table is called to make all the tables known in
-
 
35
        the scope of this class.
-
 
36
        Also, the start URLs need to be fed to the spider through start_urls.append
-
 
37
        Domainname is the name by which this spider is known outside
-
 
38
        So this will be used as an argument for calling this spider
-
 
39
        Since the number of pages is not fixed, Ct and no are used to make it dynamic
-
 
40
       """
28
       initialize_table()
41
       initialize_table()
29
       da = DataHelper() 
42
       da = DataHelper() 
30
       #INDIAPLAZA_DOMAINNAME = "indiaplaza"
43
       #INDIAPLAZA_DOMAINNAME = "indiaplaza"
31
       INDIAPLAZA_DOMAINNAME = get_code_word("INDIAPLAZA_DOMAINNAME")   
44
       INDIAPLAZA_DOMAINNAME = get_code_word("INDIAPLAZA_DOMAINNAME")   
32
       self.domain_name = INDIAPLAZA_DOMAINNAME
45
       self.domain_name = INDIAPLAZA_DOMAINNAME
Line 43... Line 56...
43
            url1 = INDIAPLAZA_URL + str(NO)
56
            url1 = INDIAPLAZA_URL + str(NO)
44
            self.start_urls.append(url1)
57
            self.start_urls.append(url1)
45
            NO=NO+1
58
            NO=NO+1
46
        
59
        
47
    def start_requests(self):
60
    def start_requests(self):
-
 
61
        """
-
 
62
        Documentation for method start_requests
-
 
63
        To set various properties of the request to be made
-
 
64
        like referer, headers and all.
-
 
65
        Also, the supplier's entry needs to be added to the table
-
 
66
        datastore_datadefinition_suppliers.
-
 
67
        @return a list of well formed requests which will be 
-
 
68
        crawled by the spider, and the spider will return the response
-
 
69
        """
48
        #adding entry for the supplier i.e its name and site
70
        #adding entry for the supplier i.e its name and site
49
        #INDIAPLAZA_HOMEPAGE = "www.indiaplaza.com"
71
        #INDIAPLAZA_HOMEPAGE = "www.indiaplaza.com"
50
        INDIAPLAZA_HOMEPAGE = get_code_word("INDIAPLAZA_HOMEPAGE")
72
        INDIAPLAZA_HOMEPAGE = get_code_word("INDIAPLAZA_HOMEPAGE")
51
        da = DataHelper()
73
        da = DataHelper()
52
        da.add_supplier(self.domain_name, INDIAPLAZA_HOMEPAGE)
74
        da.add_supplier(self.domain_name, INDIAPLAZA_HOMEPAGE)
Line 60... Line 82...
60
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
82
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
61
            listreq.append(request)
83
            listreq.append(request)
62
        return listreq
84
        return listreq
63
        
85
        
64
    def parse(self, response):
86
    def parse(self, response):
-
 
87
        """
-
 
88
        Documentation for method parse
-
 
89
        @param response of individual requests
-
 
90
        Using XPaths, the needed information is extracted out of the response
-
 
91
        and added to the database
-
 
92
        Xpath1 = Gives us the section for an individual phone
-
 
93
        Xpath2 = Gives us the name of an individual phone
-
 
94
        Xpath3 = Gives us the URL of an individual phone
-
 
95
        Url1 = Used to get the full URL for individual phones
-
 
96
        """
65
        da = DataHelper()
97
        da = DataHelper()
66
        #INDIAPLAZA_URL1 = "http://www.indiaplaza.in"
98
        #INDIAPLAZA_URL1 = "http://www.indiaplaza.in"
67
        INDIAPLAZA_URL1 = get_code_word("INDIAPLAZA_URL1")
99
        INDIAPLAZA_URL1 = get_code_word("INDIAPLAZA_URL1")
68
        hxs = HtmlXPathSelector(response)
100
        hxs = HtmlXPathSelector(response)
69
        #INDIAPLAZA_XPATH1 = '//tr/td/table[@id="browsesku"]'
101
        #INDIAPLAZA_XPATH1 = '//tr/td/table[@id="browsesku"]'