Subversion Repositories SmartDukaan

Rev

Rev 271 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 271 Rev 290
Line 2... Line 2...
2
Created on 27-May-2010
2
Created on 27-May-2010
3
 
3
 
4
@author: gaurav
4
@author: gaurav
5
'''
5
'''
6
 
6
 
7
 
-
 
8
from scrapy.spider import BaseSpider
7
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
8
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
9
from scrapy.http import Request
11
 
10
 
12
from demo.items import DemoItem
11
from demo.items import DemoItem
Line 81... Line 80...
81
        """
80
        """
82
        da = DataHelper()
81
        da = DataHelper()
83
        hxs = HtmlXPathSelector(response)
82
        hxs = HtmlXPathSelector(response)
84
        #NAAPTOL_XPATH1 = '//url/loc/text()'
83
        #NAAPTOL_XPATH1 = '//url/loc/text()'
85
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
84
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
-
 
85
	
86
        phone_urls = hxs.select(NAAPTOL_XPATH1)
86
	phone_urls = hxs.select(NAAPTOL_XPATH1)
87
        
87
 
88
        #elements in chk_list are specific to this site for determining valid sites 
88
        #elements in chk_list are specific to this site for determining valid sites 
89
        #NAAPTOL_CHKLIST1 = ["mobile_phones/pdas_and_smartphones" ,"mobile_phones/gsm_handsets" ,"mobile_phones/cdma_handsets"]
89
        #NAAPTOL_CHKLIST1 = ["mobile_phones/pdas_and_smartphones" ,"mobile_phones/gsm_handsets" ,"mobile_phones/cdma_handsets"]
90
        #list separated by ';'
90
        #list separated by ';'
91
        NAAPTOL_CHKLIST1 = str(get_code_word("NAAPTOL_CHKLIST1"))
91
        NAAPTOL_CHKLIST1 = str(get_code_word("NAAPTOL_CHKLIST1"))
-
 
92
		
92
        if len(NAAPTOL_CHKLIST1)>0:
93
        if len(NAAPTOL_CHKLIST1)>0:
93
            NAAPTOL_CHKLIST1 = NAAPTOL_CHKLIST1.split(';')
94
            NAAPTOL_CHKLIST1 = NAAPTOL_CHKLIST1.split(';')
94
        for i in phone_urls:
95
	for i in phone_urls:
95
            site = i.extract()
96
            site = i.extract()
96
            site = unescape(site)
97
            site = unescape(site)
97
            pos1 = pos2 = pos3 = 0
98
            pos1 = pos2 = pos3 = 0
98
            temp =""
99
            temp =""
99
            
100
            
Line 106... Line 107...
106
            if pos3 > 0:
107
            if pos3 > 0:
107
                temp = site[pos3+1:pos1]
108
                temp = site[pos3+1:pos1]
108
            for c in NAAPTOL_CHKLIST1:
109
            for c in NAAPTOL_CHKLIST1:
109
                if temp == c:
110
                if temp == c:
110
                    da.add_naaptolurl(site)           
111
                    da.add_naaptolurl(site)           
111
SPIDER = naaptol_spider()
-
 
112
112
SPIDER = naaptol_spider()
-
 
113