WebSVN – SmartDukaan – Blame – /prototype/indiaplazaScrapypass1/src/demo/spiders/indiaplazaspider.py

Rev	Author	Line No.	Line
158	ashish	1	`'''`
		2	`Created on 16-May-2010`
		3
		4	`@author: gaurav`
		5	`'''`
		6
		7
		8	`from scrapy.spider import BaseSpider`
		9	`from scrapy.selector import HtmlXPathSelector`
		10	`from scrapy.http import Request`
		11
		12	`from demo.items import DemoItem`
		13	`from scrapy.contrib.spidermiddleware import referer`
		14	`from scrapy.http.headers import Headers`
		15	`from scrapy.http.request.form import FormRequest`
		16	`from scrapy.log import msg`
		17	`from scrapy.http.response import Response`
		18
226	ashish	19	`from html2text.unescaping import *`
		20	`from datastore.DataAccessor import *`
		21	`from datastore.DataCodeAccessor import *`
158	ashish	22
		23
226	ashish	24
class indiaplaza_spider(BaseSpider):
    """Spider that collects the URL of every phone listed on indiaplaza.

    Listing pages are fetched one by one; for each product block found on a
    page the phone's title and absolute URL are handed to
    ``DataHelper.add_ipbasic``.  (The original notes say the rows end up in
    table ``datastore_datadefinition_indiaplaza_data``; that mapping lives in
    the datastore package -- confirm there.)

    All site-specific strings (domain name, URLs, XPaths) are looked up by
    key through ``get_code_word`` instead of being hard-coded here.
    """

    def __init__(self):
        """Register the spider's name and build its list of start URLs.

        ``initialize_table`` is called first so the datastore tables are
        known before any ``DataHelper`` call.  The page count comes from the
        ``indiaplaza_count`` extra var:

        * count <= 18: pages 1..count are queued;
        * count > 18: only page ``count`` itself is queued.

        NOTE(review): 18 is a hard-coded threshold; presumably the number of
        listing pages known to exist, with higher counts used to probe for
        one additional page -- confirm against whatever updates
        ``indiaplaza_count``.
        """
        initialize_table()
        da = DataHelper()

        # Name by which this spider is addressed from outside
        # (historically the literal "indiaplaza").
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME")

        page_count = int(da.get_extra_vars('indiaplaza_count'))
        first_page = page_count if page_count > 18 else 1

        # Listing URL prefix ending in the page-number query parameter, e.g.
        # "http://www.indiaplaza.in/mobile-phones-Mobiles-1.htm?PageNo=".
        listing_url = get_code_word("INDIAPLAZA_URL")

        # Bind an instance-level copy before appending: the attribute is
        # never assigned on the instance in this class, so appending in place
        # would mutate whatever list the base class exposes (shared between
        # spiders if it is a class attribute).
        self.start_urls = list(self.start_urls)
        for page_no in range(first_page, page_count + 1):
            self.start_urls.append(listing_url + str(page_no))

    def start_requests(self):
        """Register the supplier and build one request per start URL.

        The supplier (spider name plus home page) is recorded through
        ``DataHelper.add_supplier``.  Every request is routed to ``parse``,
        created with ``dont_filter=True`` and given a ``Referer`` header
        taken from the ``INDIAPLAZA_REFERER`` code word.

        @return list of ``Request`` objects, one per entry in
                ``self.start_urls``
        """
        # Supplier home page, e.g. "www.indiaplaza.com".
        home_page = get_code_word("INDIAPLAZA_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, home_page)

        # Referer sent with every request, e.g. "www.google.com/search".
        referer_url = get_code_word("INDIAPLAZA_REFERER")

        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse,
                              dont_filter=True)
            # setdefault: an already present Referer header is left alone.
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """Extract every phone's title and URL from one listing page.

        @param response response for one listing-page request

        Three XPaths are read via ``get_code_word``:

        * ``INDIAPLAZA_XPATH1`` selects the block for each phone,
          e.g. ``//tr/td/table[@id="browsesku"]``;
        * ``INDIAPLAZA_XPATH2`` selects the phone's name inside a block,
          e.g. ``.//div[@class="skuimg"]/a/@title``;
        * ``INDIAPLAZA_XPATH3`` selects the phone's relative URL inside a
          block, e.g. ``.//div[@class="skuimg"]/a/@href``.

        ``INDIAPLAZA_URL1`` (e.g. "http://www.indiaplaza.in") is prepended to
        each relative URL before it is stored with ``add_ipbasic``.

        When the page has no phone blocks and ``indiaplaza_count`` is above
        18, the ``indiaplaza_fails`` counter is incremented and
        ``indiaplaza_flag`` is set to ``'FALSE'``.  Nothing is returned.
        """
        da = DataHelper()
        site_root = get_code_word("INDIAPLAZA_URL1")
        hxs = HtmlXPathSelector(response)
        phone_blocks = hxs.select(get_code_word("INDIAPLAZA_XPATH1"))
        title_xpath = get_code_word("INDIAPLAZA_XPATH2")
        url_xpath = get_code_word("INDIAPLAZA_XPATH3")

        if not phone_blocks:
            page_count = int(da.get_extra_vars('indiaplaza_count'))
            if page_count > 18:
                fails = int(da.get_extra_vars('indiaplaza_fails')) + 1
                da.set_extra_vars('indiaplaza_fails', str(fails), '')
                if fails > 0:
                    # NOTE(review): presumably tells the scheduler to stop
                    # probing for further pages -- confirm with the reader of
                    # 'indiaplaza_flag'.
                    da.set_extra_vars('indiaplaza_flag', 'FALSE', '')
            return

        # Collect first, store afterwards, as the original did.
        items = []
        for block in phone_blocks:
            titles = block.select(title_xpath)
            links = block.select(url_xpath)
            if not titles or not links:
                # A block without the expected anchor used to raise
                # IndexError and discard the whole page; skip it instead.
                continue
            items.append({
                'title': titles[0].extract(),
                'url': links[0].extract(),
            })

        for item in items:
            full_url = site_root + str(item['url'])
            da.add_ipbasic(item['title'], unescape(full_url))
		128
# Module-level spider instance.  Constructing it runs __init__, so the
# datastore lookups done there happen as soon as this module is imported.
# NOTE(review): presumably this is how the Scrapy version in use discovers
# the spider -- confirm before removing or renaming.
SPIDER = indiaplaza_spider()

Subversion Repositories SmartDukaan

(root)/prototype/indiaplazaScrapypass1/src/demo/spiders/indiaplazaspider.py – Rev 257