Subversion Repositories SmartDukaan

Rev

Rev 187 | Rev 261 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 187 Rev 235
Line 13... Line 13...
13
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
16
from scrapy.log import msg
17
from scrapy.http.response import Response
17
from scrapy.http.response import Response
-
 
18
from datastore.DataCodeAccessor import *
-
 
19
from datastore.DataAccessor import *
18
 
20
 
19
from datastore import DataAccessor
-
 
20
from datastore.DataAccessor import DataHelper
21
from html2text.unescaping import *
21
 
-
 
22
 
22
 
23
class naaptol_spider(BaseSpider):
23
class naaptol_spider(BaseSpider):
24
   
24
   
25
    def __init__(self): 
25
    def __init__(self):
-
 
26
       initialize_table() 
-
 
27
       #NAAPTOL_DOMAINNAME = "naaptol"   
-
 
28
       NAAPTOL_DOMAINNAME = get_code_word("NAAPTOL_DOMAINNAME")
26
       self.domain_name = "naaptolsites"
29
       self.domain_name = NAAPTOL_DOMAINNAME 
27
       str1 = "http://www.naaptol.com/sitemap.xml"
30
       #NAAPTOL_URL = "http://www.naaptol.com/sitemap.xml"
-
 
31
       NAAPTOL_URL = get_code_word("NAAPTOL_URL")
28
       self.start_urls.append(str1)
32
       self.start_urls.append(NAAPTOL_URL)
29
    
33
    
30
    
34
    
31
    def start_requests(self):
35
    def start_requests(self):
-
 
36
        #adding entry for the supplier i.e its name and site
-
 
37
        #NAAPTOL_HOMEPAGE = "http://www.naaptol.com"
-
 
38
        NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")
32
        da = DataHelper()
39
        da = DataHelper()
33
        da.add_supplier(self.domain_name, "www.naaptol.com")
40
        da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)
34
        listreq = []
41
        listreq = []
-
 
42
        
-
 
43
        #for each request a referer has to be set
-
 
44
        #NAAPTOL_REFERER = "http://www.google.com"
-
 
45
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
35
        for url1 in self.start_urls:
46
        for url1 in self.start_urls:
36
            request = Request(url = str(url1), callback=self.parse)
47
            request = Request(url = str(url1), callback=self.parse)
37
            request.headers.setdefault("Referer", "http://www.naaptol.com/sitemap.xml")
48
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
38
            listreq.append(request)
49
            listreq.append(request)
39
        return listreq
50
        return listreq
40
    
51
    
41
    def parse(self, response):
52
    def parse(self, response):
42
        da = DataHelper()
53
        da = DataHelper()
43
        hxs = HtmlXPathSelector(response)
54
        hxs = HtmlXPathSelector(response)
-
 
55
        #NAAPTOL_XPATH1 = '//url/loc/text()'
-
 
56
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
44
        phone_urls = hxs.select('//url/loc/text()')
57
        phone_urls = hxs.select(NAAPTOL_XPATH1)
-
 
58
        
-
 
59
        #elements in chk_list are specific to this site for determining valid sites 
-
 
60
        #NAAPTOL_CHKLIST1 = ["mobile_phones/pdas_and_smartphones" ,"mobile_phones/gsm_handsets" ,"mobile_phones/cdma_handsets"]
-
 
61
        #list separated by ';'
-
 
62
        NAAPTOL_CHKLIST1 = get_code_word("NAAPTOL_CHKLIST1")
-
 
63
        NAAPTOL_CHKLIST1 = NAAPTOL_CHKLIST1.split(';')
45
        for i in phone_urls:
64
        for i in phone_urls:
46
            site = i.extract()
65
            site = i.extract()
-
 
66
            site = unescape(site)
47
            pos1 = pos2 = pos3 = 0
67
            pos1 = pos2 = pos3 = 0
48
            temp =""
68
            temp =""
-
 
69
            
-
 
70
            # temp contains the string between the 3rd-last and the last slash (/), i.e. the two path segments before the final one
49
            pos1 = site.rfind('/')
71
            pos1 = site.rfind('/')
50
            if pos1 != -1:
72
            if pos1 != -1:
51
                pos2 = site.rfind('/',0,pos1-1)
73
                pos2 = site.rfind('/',0,pos1-1)
52
            if pos2 != -1:    
74
            if pos2 != -1:    
53
                pos3 = site.rfind('/',0,pos2-1)
75
                pos3 = site.rfind('/',0,pos2-1)
54
            if pos3 > 0:
76
            if pos3 > 0:
55
                temp = site[pos3+1:pos1]
77
                temp = site[pos3+1:pos1]
56
            if temp == "mobile_phones/pdas_and_smartphones" or temp == "mobile_phones/gsm_handsets" or temp == "mobile_phones/cdma_handsets":
-
 
57
                da.add_naaptolurl(site)
-
 
58
                #print str(ct) + " " + site
78
            for c in NAAPTOL_CHKLIST1:
59
                #print "\n"
-
 
60
                #ct = ct +1    
79
                if temp == c:
61
                           
80
                    da.add_naaptolurl(site)           
62
SPIDER = naaptol_spider()
81
SPIDER = naaptol_spider()
63
82