'''
Created on 06-Jun-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response

from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
import urllib

class babuchak2(BaseSpider):
    """
    Documentation for class babuchak2
    This spider collects the URLs of the individual phones
    and stores them in the table datastore_datadefinition_babuchak_phoneurls.
    """
    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in
        the scope of this class.
        The start URLs also need to be fed to the spider through start_urls.append.
        Domainname1 is the name by which this spider is known outside,
        so it is used as the argument for invoking this spider.
        """
        initialize_table()
        #BABUCHAK_DOMAINNAME1 = "babuchak1"
        BABUCHAK_DOMAINNAME1 = get_code_word("BABUCHAK_DOMAINNAME1")
        self.domain_name = BABUCHAK_DOMAINNAME1
        #BABUCHAK_VAR1 = "&postPage="
        BABUCHAK_VAR1 = get_code_word("BABUCHAK_VAR1")
        # Build the start URL list on the instance so the appends below do not
        # mutate a class-level attribute shared between spider instances.
        self.start_urls = []
        da = DataHelper()
        for item in da.get_allbabuchakurls():
            ct = item.no_pages
            while ct > 0:
                url = item.url + BABUCHAK_VAR1 + str(ct)
                self.start_urls.append(url)
                ct = ct - 1

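    # Illustration of the seeding performed in __init__ (an assumption: it takes
    # BABUCHAK_VAR1 to resolve to "&postPage=" as the commented default suggests,
    # and uses a hypothetical stored url). An entry with
    # url = "http://example.com/list.php?cat=1" and no_pages = 3 would contribute
    # these start URLs, in descending page order:
    #   http://example.com/list.php?cat=1&postPage=3
    #   http://example.com/list.php?cat=1&postPage=2
    #   http://example.com/list.php?cat=1&postPage=1
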
    def start_requests(self):
        """
        Documentation for method start_requests
        Sets various properties of the requests to be made,
        such as the Referer header.
        @return a list of well-formed requests which will be
        crawled by the spider, which then processes the responses
        """
        listreq = []
        #for each request a referer has to be set
        #BABUCHAK_REFERER = "www.google.com/search"
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
            listreq.append(request)
        return listreq

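    # Note on start_requests: headers.setdefault only adds the Referer header when
    # one is not already present, so an existing Referer is never overwritten.
    # Assuming BABUCHAK_REFERER resolves to "www.google.com/search" as the commented
    # default suggests, every seeded request goes out with that Referer value.
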
    def parse(self, response):
        """
        Documentation for method parse
        @param response the response of an individual request
        Using XPaths, the needed information is extracted from the response
        and added to the database.
        Xpath4 = gives the URL of an individual phone
        Url2 = used to build the full URL for individual vendors
        """
        da = DataHelper()
        #BABUCHAK_URL2 = "http://www.shopping.babuchak.com/visitourstores.php"
        BABUCHAK_URL2 = get_code_word("BABUCHAK_URL2")
        hxs = HtmlXPathSelector(response)
        #BABUCHAK_XPATH4 = '//td[@class="mod-item-body-title"]/a/@href'
        BABUCHAK_XPATH4 = get_code_word("BABUCHAK_XPATH4")

        info = hxs.select(BABUCHAK_XPATH4)
        for i in info:
            url = i.extract().strip()
            url = BABUCHAK_URL2 + url
            da.add_babuchakphoneurl(url)

SPIDER = babuchak2()
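
# Usage sketch (an assumption based on the old Scrapy API this module targets,
# i.e. domain_name plus a module-level SPIDER instance): the spider would be run
# by its domain name, for example with something like
#   scrapy-ctl.py crawl babuchak1
# assuming BABUCHAK_DOMAINNAME1 resolves to "babuchak1" as the commented default suggests.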