Subversion Repositories SmartDukaan

Rev

Rev 221 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 221 Rev 252
Line 16... Line 16...
16
from scrapy.http.response import Response
16
from scrapy.http.response import Response
17
 
17
 
18
from datastore.DataAccessor import *
18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text import *
20
from html2text import *
21
from babel.messages.pofile import unescape
-
 
22
import urllib
21
import urllib
23
 
22
 
24
class babuchak2(BaseSpider):
23
class babuchak2(BaseSpider):
-
 
24
    """
-
 
25
    Documentation for class babuchak2
-
 
26
    This spider collects the urls of the individual phones
-
 
27
    and stores them in the table datastore_datadefinition_babuchak_phoneurls.
25
    
28
    """
26
    def __init__(self):
29
    def __init__(self):
-
 
30
        """
-
 
31
        Documentation for constructor
-
 
32
        initialize_table is called to make all the tables known in
-
 
33
        the scope of this class.
-
 
34
        Also, the start url needs to be fed to the spider through start_urls.append
-
 
35
        Domainname1 is the name by which this spider is known outside
-
 
36
        So this will be used as an argument for calling this spider 
-
 
37
        """
27
        initialize_table()
38
        initialize_table()
28
        #BABUCHAK_DOMAINNAME1 = "babuchak1"   
39
        #BABUCHAK_DOMAINNAME1 = "babuchak1"   
29
        BABUCHAK_DOMAINNAME1 = get_code_word("BABUCHAK_DOMAINNAME1")
40
        BABUCHAK_DOMAINNAME1 = get_code_word("BABUCHAK_DOMAINNAME1")
30
        self.domain_name = BABUCHAK_DOMAINNAME1 
41
        self.domain_name = BABUCHAK_DOMAINNAME1 
31
        #BABUCHAK_VAR1 = "&postPage=" 
42
        #BABUCHAK_VAR1 = "&postPage=" 
Line 35... Line 46...
35
            ct = item.no_pages
46
            ct = item.no_pages
36
            while ct>0:
47
            while ct>0:
37
                url = item.url + BABUCHAK_VAR1  
48
                url = item.url + BABUCHAK_VAR1  
38
                url = url + str(ct)
49
                url = url + str(ct)
39
                self.start_urls.append(url)
50
                self.start_urls.append(url)
40
                ct = ct -1
51
                ct = ct -1        
41
        session.close()        
-
 
42
 
52
 
43
    def start_requests(self):
53
    def start_requests(self):
-
 
54
        """
-
 
55
        Documentation for method start_requests
-
 
56
        To set various properties of the request to be made
-
 
57
        like referer, headers and all.
-
 
58
        @return a list of well formed requests which will be 
-
 
59
        crawled by the spider, which will then return the response
-
 
60
        """
44
        listreq = []        
61
        listreq = []        
45
        #for each request a referer has to be set
62
        #for each request a referer has to be set
46
        #BABUCHAK_REFERER = "www.google.com/search"
63
        #BABUCHAK_REFERER = "www.google.com/search"
47
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
64
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
48
        for url1 in self.start_urls:
65
        for url1 in self.start_urls:
Line 50... Line 67...
50
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
67
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
51
            listreq.append(request)
68
            listreq.append(request)
52
        return listreq
69
        return listreq
53
        
70
        
54
    def parse(self, response):
71
    def parse(self, response):
-
 
72
        """
-
 
73
        Documentation for method parse
-
 
74
        @param response of individual requests
-
 
75
        Using Xpaths needed information is extracted out of the response
55
        #url1 needed to get complete urls
76
        and added to the database
-
 
77
        Xpath4 = Gives us the url for an individual phone
-
 
78
        Url2 = Used to get the full url for individual vendors
-
 
79
        """
56
        da = DataHelper()
80
        da = DataHelper()
57
        #BABUCHAK_URL2 = "http://www.shopping.babuchak.com/visitourstores.php"
81
        #BABUCHAK_URL2 = "http://www.shopping.babuchak.com/visitourstores.php"
58
        BABUCHAK_URL2 = get_code_word("BABUCHAK_URL2")
82
        BABUCHAK_URL2 = get_code_word("BABUCHAK_URL2")
59
        hxs = HtmlXPathSelector(response)
83
        hxs = HtmlXPathSelector(response)
60
        #BABUCHAK_XPATH4 = '//td[@class="mod-item-body-title"]/a/@href'
84
        #BABUCHAK_XPATH4 = '//td[@class="mod-item-body-title"]/a/@href'
Line 64... Line 88...
64
        for i in info:
88
        for i in info:
65
            url = i.extract()
89
            url = i.extract()
66
            url = url.strip()
90
            url = url.strip()
67
            url = BABUCHAK_URL2 + url 
91
            url = BABUCHAK_URL2 + url 
68
            da.add_babuchakphoneurl(url) 
92
            da.add_babuchakphoneurl(url) 
69
        session.remove()
-
 
70
       
93
       
71
SPIDER = babuchak2()
94
SPIDER = babuchak2()
72
95