# Rev 271 (repository-viewer header: Blame | Compare with Previous | Last modification | View Log | RSS feed)
'''
Created on 27-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *
from html2text.unescaping import *


class naaptol_spider(BaseSpider):
    """Spider that collects the URLs of individual phone pages from the
    naaptol.com sitemap and stores them in the table
    datastore_datadefinition_naaptol_urls.
    """

    def __init__(self):
        """Set up the datastore tables and seed the crawl.

        initialize_table() makes all the datastore tables known in the
        scope of this class.  The spider's public name (domain_name) and
        its start URL are read from the code-word store rather than
        hard-coded (previous literals: "naaptol",
        "http://www.naaptol.com/sitemap.xml").
        """
        initialize_table()
        # Name by which this spider is known outside; used as the
        # argument when invoking this spider.
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME")
        # FIX: use a fresh instance-level list instead of appending to
        # the shared class attribute BaseSpider.start_urls -- appending
        # there leaks this URL into every other spider instance.
        self.start_urls = [get_code_word("NAAPTOL_URL")]

    def start_requests(self):
        """Build the initial Request objects for the crawl.

        Registers the supplier (name + homepage) in the table
        datastore_datadefinition_suppliers, then creates one Request per
        start URL with an explicit Referer header set.

        @return: list of well-formed requests that the engine will crawl.
        """
        # Supplier entry: the spider's name and the site's homepage
        # (code word resolves to e.g. "http://www.naaptol.com").
        homepage = get_code_word("NAAPTOL_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # Each request carries a referer (e.g. "http://www.google.com");
        # some sites refuse requests that arrive without one.
        referer_url = get_code_word("NAAPTOL_REFERER")
        requests = []
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """Extract phone-page URLs from the sitemap response and store them.

        @param response: response for one of the start requests
                         (the sitemap XML).

        NAAPTOL_XPATH1 (e.g. '//url/loc/text()') selects every URL in the
        sitemap.  NAAPTOL_CHKLIST1 is a ';'-separated list of category
        fragments specific to this site (e.g.
        "mobile_phones/gsm_handsets") that identify a URL as a phone page.
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        phone_urls = hxs.select(get_code_word("NAAPTOL_XPATH1"))

        # ';'-separated list of valid category path fragments.
        raw_chklist = str(get_code_word("NAAPTOL_CHKLIST1"))
        categories = raw_chklist.split(';') if len(raw_chklist) > 0 else []

        for node in phone_urls:
            site = unescape(node.extract())
            # temp = the text between the 3rd-last slash and the last
            # slash, i.e. the two category segments of the URL path.
            pos1 = site.rfind('/')
            if pos1 == -1:
                continue
            # FIX: the search end bound of str.rfind is already
            # exclusive, so it must be pos1 (not pos1 - 1) -- the old
            # bound skipped the character just before the slash and
            # mishandled adjacent slashes ("//").  Same for pos2 below.
            pos2 = site.rfind('/', 0, pos1)
            if pos2 == -1:
                continue
            pos3 = site.rfind('/', 0, pos2)
            # Original code required pos3 > 0 (a slash at index 0 does
            # not delimit a usable category prefix).
            if pos3 <= 0:
                continue
            temp = site[pos3 + 1:pos1]
            if temp in categories:
                da.add_naaptolurl(site)


# Module-level instance picked up by scrapy's spider discovery.
SPIDER = naaptol_spider()