Subversion Repositories SmartDukaan

Rev

Rev 235 | Rev 271 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
187 ashish 1
'''
2
Created on 27-May-2010
3
 
4
@author: gaurav
5
'''
6
 
7
 
8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
235 ashish 18
from datastore.DataCodeAccessor import *
19
from datastore.DataAccessor import *
187 ashish 20
 
235 ashish 21
from html2text.unescaping import *
187 ashish 22
 
23
class naaptol_spider(BaseSpider):
    """
    Spider that collects the URL of each individual phone on naaptol.com
    and stores it in the table datastore_datadefinition_naaptol_urls.
    """

    def __init__(self):
        """
        Configure the spider from the code-word store.

        initialize_table() makes all the datastore tables known in the
        scope of this class.  The spider's public name (domain_name,
        used as the argument when invoking this spider) and its start
        URL are both fetched from the code-word table so they can be
        changed without touching the code.
        """
        initialize_table()
        # e.g. "naaptol"
        self.domain_name = get_code_word("NAAPTOL_DOMAINNAME")
        # e.g. "http://www.naaptol.com/sitemap.xml"
        # BUGFIX: the original did self.start_urls.append(...), which
        # mutates the class-level list inherited from BaseSpider and is
        # therefore shared by every spider; give this instance its own
        # list instead.
        self.start_urls = [get_code_word("NAAPTOL_URL")]

    def start_requests(self):
        """
        Build the initial list of requests.

        Registers this supplier (name + home page) in the table
        datastore_datadefinition_suppliers, then creates one Request
        per start URL with an explicit Referer header.

        @return: list of well-formed Request objects which the spider
                 will crawl; their responses go to self.parse
        """
        # register the supplier, i.e. its name and site
        # e.g. "http://www.naaptol.com"
        NAAPTOL_HOMEPAGE = get_code_word("NAAPTOL_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, NAAPTOL_HOMEPAGE)

        # every request carries an explicit referer,
        # e.g. "http://www.google.com"
        NAAPTOL_REFERER = get_code_word("NAAPTOL_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", NAAPTOL_REFERER)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Extract the per-phone URLs from the sitemap response.

        NAAPTOL_XPATH1 (e.g. '//url/loc/text()') selects every URL in
        the sitemap.  NAAPTOL_CHKLIST1 is a ';'-separated list of
        category path fragments specific to this site (e.g.
        "mobile_phones/gsm_handsets") that identify a URL as an
        individual-phone page; matching URLs are stored via
        DataHelper.add_naaptolurl().

        @param response: response of one start-URL request
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        # e.g. '//url/loc/text()'
        NAAPTOL_XPATH1 = get_code_word("NAAPTOL_XPATH1")
        phone_urls = hxs.select(NAAPTOL_XPATH1)

        # category fragments that mark valid phone pages, stored as one
        # ';'-separated code word, e.g.
        # "mobile_phones/pdas_and_smartphones;mobile_phones/gsm_handsets;mobile_phones/cdma_handsets"
        chk_list = get_code_word("NAAPTOL_CHKLIST1").split(';')

        for sel in phone_urls:
            site = unescape(sel.extract())
            # membership test instead of the original equality loop
            if self._category_fragment(site) in chk_list:
                da.add_naaptolurl(site)

    @staticmethod
    def _category_fragment(site):
        """
        Return the substring of *site* between its third-last and last
        '/' (the two path segments preceding the final one, e.g.
        "mobile_phones/gsm_handsets"), or "" when the URL does not have
        three slashes after position 0.

        BUGFIX: the original used site.rfind('/', 0, pos - 1); rfind's
        end argument is already exclusive, so the extra -1 skipped one
        character and could miss a slash directly adjacent to the
        previous one.  It also left pos2 at its initial value 0 when
        pos1 was -1, so the subsequent "pos2 != -1" test could never
        detect a missing slash.  Early returns make each step explicit.
        """
        pos1 = site.rfind('/')
        if pos1 == -1:
            return ""
        pos2 = site.rfind('/', 0, pos1)
        if pos2 == -1:
            return ""
        pos3 = site.rfind('/', 0, pos2)
        # original rejected pos3 <= 0 as well (a leading-slash match)
        if pos3 <= 0:
            return ""
        return site[pos3 + 1:pos1]
187 ashish 110
# Module-level instance required by the old (pre-0.9) Scrapy spider-manager
# convention: the framework discovers spiders via the SPIDER symbol.
SPIDER = naaptol_spider()