Subversion Repositories SmartDukaan

Rev

Rev 221 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
221 ashish 1
'''
2
Created on 06-Jun-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
 
18
from datastore.DataAccessor import *
19
from datastore.DataCodeAccessor import *
20
from html2text import *
21
import urllib
22
 
23
class babuchak2(BaseSpider):
    """
    Spider that collects the URL for each individual phone and stores
    it (via DataHelper) in the table
    datastore_datadefinition_babuchak_phoneurls.
    """

    def __init__(self):
        """
        Build the paginated list of start URLs.

        initialize_table() makes all the tables known in the scope of
        this class.  The spider's external name (domain_name) and the
        pagination query fragment are read from the code-word store so
        they can change without a code change.
        """
        # NOTE(review): BaseSpider.__init__ is deliberately not called here,
        # matching the original code — confirm against the Scrapy version in use.
        initialize_table()
        # BABUCHAK_DOMAINNAME1 = "babuchak1"
        BABUCHAK_DOMAINNAME1 = get_code_word("BABUCHAK_DOMAINNAME1")
        # domain_name is the name by which this spider is invoked externally.
        self.domain_name = BABUCHAK_DOMAINNAME1
        # BABUCHAK_VAR1 = "&postPage="
        BABUCHAK_VAR1 = get_code_word("BABUCHAK_VAR1")
        # BUG FIX: give this instance its OWN list.  The original appended to
        # the inherited class-level start_urls, mutating state shared by every
        # instantiation (duplicating URLs if the spider is constructed twice).
        self.start_urls = []
        da = DataHelper()
        for item in da.get_allbabuchakurls():
            # One start URL per listing page: <url>&postPage=<n>,
            # for n = item.no_pages down to 1 (descending, as before).
            for page in range(item.no_pages, 0, -1):
                self.start_urls.append(item.url + BABUCHAK_VAR1 + str(page))

    def start_requests(self):
        """
        Build the initial requests, setting per-request properties
        such as the Referer header.

        @return: a list of well-formed Request objects for the spider
        to crawl; each response is handled by self.parse.
        """
        # Every request carries the same Referer, read from the code-word store.
        # BABUCHAK_REFERER = "www.google.com/search"
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        """
        Extract the individual-phone URLs from one listing page and
        store them in the database.

        @param response: response for a single paginated listing page.

        BABUCHAK_XPATH4 selects the href of each phone link;
        BABUCHAK_URL2 is prepended to each extracted href to form the
        full per-vendor URL before it is stored.
        """
        da = DataHelper()
        # BABUCHAK_URL2 = "http://www.shopping.babuchak.com/visitourstores.php"
        BABUCHAK_URL2 = get_code_word("BABUCHAK_URL2")
        hxs = HtmlXPathSelector(response)
        # BABUCHAK_XPATH4 = '//td[@class="mod-item-body-title"]/a/@href'
        BABUCHAK_XPATH4 = get_code_word("BABUCHAK_XPATH4")

        for node in hxs.select(BABUCHAK_XPATH4):
            href = node.extract().strip()
            da.add_babuchakphoneurl(BABUCHAK_URL2 + href)
93
 
94
SPIDER = babuchak2()