WebSVN – SmartDukaan – Blame – //prototype/naaptolpass1/src/demo/spiders/spider1.py

Rev	Author	Line No.	Line
187	ashish	1	`'''`
		2	`Created on 27-May-2010`
		3
		4	`@author: gaurav`
		5	`'''`
		6
		7	`from scrapy.spider import BaseSpider`
		8	`from scrapy.selector import HtmlXPathSelector`
		9	`from scrapy.http import Request`
		10
		11	`from demo.items import DemoItem`
		12	`from scrapy.contrib.spidermiddleware import referer`
		13	`from scrapy.http.headers import Headers`
		14	`from scrapy.http.request.form import FormRequest`
		15	`from scrapy.log import msg`
		16	`from scrapy.http.response import Response`
235	ashish	17	`from datastore.DataCodeAccessor import *`
		18	`from datastore.DataAccessor import *`
187	ashish	19
235	ashish	20	`from html2text.unescaping import *`
187	ashish	21
		22	`class naaptol_spider(BaseSpider):`
261	ashish	23	`"""`
		24	`Documentation for class naaptol_spider`
		25	`This spider collects the url for the individual phones`
		26	`and store them in table datastore_datadefinition_naaptol_urls.`
		27	`"""`
235	ashish	28	`def __init__(self):`
261	ashish	29	`"""`
		30	`Documentation for constructor`
		31	`initialize_table is called to make all the tables known in`
		32	`the scope of this class.`
		33	`Also start url needs to be feeded to the spider through start_urls.append`
		34	`Domainname is name by which this spider is known outside`
		35	`So this will be used as an argument for calling this spider`
		36	`"""`
235	ashish	37	`initialize_table()`
		38	`#NAAPTOL_DOMAINNAME = "naaptol"`
		39	`NAAPTOL_DOMAINNAME = get_code_word("NAAPTOL_DOMAINNAME")`
		40	`self.domain_name = NAAPTOL_DOMAINNAME`
		41	`#NAAPTOL_URL = "http://www.naaptol.com/sitemap.xml"`
		42	`NAAPTOL_URL = get_code_word("NAAPTOL_URL")`
		43	`self.start_urls.append(NAAPTOL_URL)`
187	ashish	44
		45
		46	`def start_requests(self):`
261	ashish	47	`"""`
		48	`Documentation for method start_requests`
		49	`To set various properties of the request to be made`
		50	`like referer, headers and all.`
		51	`Also suppliers entry need to be done in the table`
		52	`datastore_datadefinition_suppliers.`
		53	`@return a list of well formed requests which will be`
		54	`crawled by spider and spider will return the response`
		55	`"""`
235	ashish	56	`#adding entry for the supplier i.e its name and site`
		57	`#NAAPTOL_HOMEPAGE = "http://www.naaptol.com"`
		58	`NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")`
187	ashish	59	`da = DataHelper()`
235	ashish	60	`da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)`
187	ashish	61	`listreq = []`
235	ashish	62
		63	`#for each request a referer has to be set`
		64	`#NAAPTOL_REFERER = "http://www.google.com"`
		65	`NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")`
187	ashish	66	`for url1 in self.start_urls:`
		67	`request = Request(url = str(url1), callback=self.parse)`
235	ashish	68	`request.headers.setdefault("Referer", NAAPTOL_REFERER)`
187	ashish	69	`listreq.append(request)`
		70	`return listreq`
		71
		72	`def parse(self, response):`
261	ashish	73	`"""`
		74	`Documentation for method parse`
		75	`@param response of individual requests`
		76	`Using Xpaths needed information is extracted out of the response`
		77	`and added to the database`
		78	`Xpath1 = Give us url for individual phones`
		79	`chklist1 = elements in chk_list are specific to this site for determining valid sites`
		80	`"""`
187	ashish	81	`da = DataHelper()`
		82	`hxs = HtmlXPathSelector(response)`
235	ashish	83	`#NAAPTOL_XPATH1 = '//url/loc/text()'`
		84	`NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")`
290	gaurav	85
		86	`phone_urls = hxs.select(NAAPTOL_XPATH1)`
		87
235	ashish	88	`#elements in chk_list are specific to this site for determining valid sites`
		89	`#NAAPTOL_CHKLIST1 = ["mobile_phones/pdas_and_smartphones" ,"mobile_phones/gsm_handsets" ,"mobile_phones/cdma_handsets"]`
		90	`#list separeated by ';'`
271	ashish	91	`NAAPTOL_CHKLIST1 = str(get_code_word("NAAPTOL_CHKLIST1"))`
290	gaurav	92
271	ashish	93	`if len(NAAPTOL_CHKLIST1)>0:`
		94	`NAAPTOL_CHKLIST1 = NAAPTOL_CHKLIST1.split(';')`
290	gaurav	95	`for i in phone_urls:`
187	ashish	96	`site = i.extract()`
235	ashish	97	`site = unescape(site)`
187	ashish	98	`pos1 = pos2 = pos3 = 0`
		99	`temp =""`
235	ashish	100
		101	`# temp contains string b/w 2nd last and 3rd last slash(/)`
187	ashish	102	`pos1 = site.rfind('/')`
		103	`if pos1 != -1:`
		104	`pos2 = site.rfind('/',0,pos1-1)`
		105	`if pos2 != -1:`
		106	`pos3 = site.rfind('/',0,pos2-1)`
		107	`if pos3 > 0:`
		108	`temp = site[pos3+1:pos1]`
235	ashish	109	`for c in NAAPTOL_CHKLIST1:`
		110	`if temp == c:`
		111	`da.add_naaptolurl(site)`
# Module-level instance of the spider, created when this module is imported.
# NOTE(review): presumably this is how the Scrapy version in use discovers
# the spider -- confirm.
SPIDER = naaptol_spider()

Subversion Repositories SmartDukaan

(root)//prototype/naaptolpass1/src/demo/spiders/spider1.py – Rev 290