Subversion Repositories SmartDukaan

Rev

Rev 271 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
187 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
235 ashish 17
from datastore.DataCodeAccessor import *
18
from datastore.DataAccessor import *
187 ashish 19
 
235 ashish 20
from html2text.unescaping import *
187 ashish 21
 
22
class naaptol_spider(BaseSpider):
    """
    Spider that collects the URLs of the individual phone pages on Naaptol.

    The start URL is fetched, every URL selected by the configured XPath is
    inspected, and those whose trailing category path appears in the
    configured check list are handed to DataHelper.add_naaptolurl (per the
    original author this stores them in the table
    datastore_datadefinition_naaptol_urls).

    All site specific values (domain name, URLs, XPath, check list) are read
    through get_code_word instead of being hard-coded here.
    """

    def __init__(self):
        """
        Set up the spider's name and start URL.

        initialize_table() is called first to make all the tables known in
        the scope of this class.  The domain name is the name by which this
        spider is known outside, so it is what is used as the argument when
        calling this spider.
        """
        initialize_table()
        # Previously hard-coded as "naaptol".
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME")
        # Previously hard-coded as "http://www.naaptol.com/sitemap.xml".
        start_url = get_code_word("NAAPTOL_URL")
        # Nothing in this constructor assigns start_urls before this point,
        # so the attribute read here is the class-level list.  Appending to
        # it in place would modify that shared list for every instance;
        # extend a private copy instead.
        self.start_urls = list(self.start_urls) + [start_url]

    def start_requests(self):
        """
        Build the initial requests for the crawl.

        The supplier (its name and home page) is registered through
        DataHelper.add_supplier, then one Request per start URL is created
        with parse() as its callback.  Each request gets the configured
        Referer header unless it already carries one.

        @return a list of requests which will be crawled by the spider
        """
        # Previously hard-coded as "http://www.naaptol.com".
        homepage = get_code_word("NAAPTOL_HOMEPAGE")
        DataHelper().add_supplier(self.domain_name, homepage)

        # Previously hard-coded as "http://www.google.com".
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Extract the phone URLs from a response and store the valid ones.

        The configured XPath (previously hard-coded as '//url/loc/text()')
        selects the candidate URLs.  A URL is stored through
        DataHelper.add_naaptolurl when the part of it between its third-last
        and last slash equals one of the entries of the configured check
        list.  The check list is a single ';' separated string, previously
        hard-coded as the entries
        "mobile_phones/pdas_and_smartphones", "mobile_phones/gsm_handsets"
        and "mobile_phones/cdma_handsets".

        @param response response of an individual request
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        phone_urls = hxs.select(get_code_word("NAAPTOL_XPATH1"))

        raw_checklist = str(get_code_word("NAAPTOL_CHKLIST1"))
        # Drop empty entries (e.g. from a trailing ';'): an empty entry
        # would match every URL for which no category path can be extracted.
        valid_paths = [entry.strip()
                       for entry in raw_checklist.split(';')
                       if entry.strip()]

        for node in phone_urls:
            site = unescape(node.extract())
            # A single membership test stores each URL at most once, even if
            # the check list repeats an entry.
            if self._category_path(site) in valid_paths:
                da.add_naaptolurl(site)

    @staticmethod
    def _category_path(site):
        """
        Return the part of site between its third-last and last slash.

        For ".../mobile_phones/gsm_handsets/page.html" this is
        "mobile_phones/gsm_handsets".  An empty string is returned when
        the third-last slash is missing or is the very first character.

        @param site URL string to inspect
        @return the two trailing path segments before the final one, or ""
        """
        last = site.rfind('/')
        second = third = 0
        # Each backwards search stops one character short of the slash found
        # before it, so a slash directly preceding that one is not counted.
        if last != -1:
            second = site.rfind('/', 0, last - 1)
        if second != -1:
            third = site.rfind('/', 0, second - 1)
        if third > 0:
            return site[third + 1:last]
        return ""
290 gaurav 112
# Module-level spider instance, created as soon as this module is imported
# (which runs naaptol_spider.__init__ and therefore its datastore lookups).
# NOTE(review): presumably the crawler framework discovers the spider through
# this SPIDER name -- confirm against the Scrapy version in use.
SPIDER = naaptol_spider()