Rev 226 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 16-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from html2text.unescaping import *
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *


class indiaplaza_spider(BaseSpider):
    """
    Spider that collects the URL of each individual phone listed on
    indiaplaza and stores it in the table
    datastore_datadefinition_indiaplaza_data.

    All site-specific settings (domain name, URLs, XPaths, referer) are
    looked up through get_code_word(); the page counters live in the
    "extra vars" store reached through DataHelper.
    """

    # Stored page counts above this value make the spider request only the
    # single page numbered 'indiaplaza_count' instead of pages 1..count, and
    # make parse() record empty pages as failures.
    # NOTE(review): presumably the number of listing pages known when the
    # spider was written (a commented-out INDIAPLAZA_CT = 18 existed) - confirm.
    _PAGE_COUNT_THRESHOLD = 18

    def __init__(self):
        """
        Prepare the data store and build the list of listing pages to crawl.

        initialize_table() is called to make all the tables known in the
        scope of this class. domain_name is the name by which this spider is
        known outside, so it is used as the argument for calling this spider.

        The number of pages is not fixed: it is read from the
        'indiaplaza_count' extra var. Pages 1..count are requested, unless
        count exceeds _PAGE_COUNT_THRESHOLD, in which case only page 'count'
        is requested.
        """
        initialize_table()
        da = DataHelper()

        # Previously hard-coded as "indiaplaza".
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME")

        last_page = int(da.get_extra_vars('indiaplaza_count'))
        first_page = 1
        if last_page > self._PAGE_COUNT_THRESHOLD:
            first_page = last_page

        # Previously hard-coded as
        # "http://www.indiaplaza.in/mobile-phones-Mobiles-1.htm?PageNo=".
        listing_url = get_code_word("INDIAPLAZA_URL")

        # Assign a list owned by this instance instead of appending to the
        # inherited attribute: BaseSpider.__init__ is not called here, so the
        # inherited start_urls may be a class-level list shared with every
        # other spider, and appending to it would leak these URLs to them.
        self.start_urls = [listing_url + str(page_no)
                           for page_no in range(first_page, last_page + 1)]

    def start_requests(self):
        """
        Build the initial requests, one per entry in start_urls.

        Also registers this supplier (its name and site) through
        DataHelper.add_supplier; per the original notes this is the entry in
        the table datastore_datadefinition_suppliers.

        @return a list of Request objects, each with a Referer header and
                self.parse as callback, to be crawled by the spider
        """
        # Previously hard-coded as "www.indiaplaza.com".
        homepage = get_code_word("INDIAPLAZA_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # Every request carries a referer; previously hard-coded as
        # "www.google.com/search".
        referer_value = get_code_word("INDIAPLAZA_REFERER")

        requests = []
        for page_url in self.start_urls:
            # dont_filter=True is passed so the request is not dropped by
            # Scrapy's duplicate filtering.
            request = Request(url=str(page_url), callback=self.parse,
                              dont_filter=True)
            # setdefault leaves an already present Referer header untouched.
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Extract the phones listed on one page and store them.

        @param response response of an individual listing-page request

        XPath1 selects the section of each individual phone, XPath2 the name
        of the phone inside that section and XPath3 its (relative) url.
        URL1 is prefixed to that url to obtain the full url of the phone.

        When the page contains no phone section at all, the failure
        bookkeeping in _record_empty_page() runs instead. A section that
        lacks a title or a link is skipped (and logged) so that one malformed
        entry does not abort the whole page with an IndexError.
        """
        da = DataHelper()

        # Previously hard-coded as "http://www.indiaplaza.in".
        base_url = get_code_word("INDIAPLAZA_URL1")
        hxs = HtmlXPathSelector(response)

        # Previously hard-coded as '//tr/td/table[@id="browsesku"]'.
        section_xpath = get_code_word("INDIAPLAZA_XPATH1")
        phone_info = hxs.select(section_xpath)

        # Previously hard-coded as './/div[@class="skuimg"]/a/@title'.
        title_xpath = get_code_word("INDIAPLAZA_XPATH2")
        # Previously hard-coded as './/div[@class="skuimg"]/a/@href'.
        link_xpath = get_code_word("INDIAPLAZA_XPATH3")

        if not phone_info:
            self._record_empty_page(da)
            return

        # Extract everything first, then persist, so the data store is only
        # touched once the whole page has been read.
        items = []
        for section in phone_info:
            titles = section.select(title_xpath)
            links = section.select(link_xpath)
            if not titles or not links:
                msg("indiaplaza: skipping listing entry without title or "
                    "link on %s" % response.url)
                continue
            items.append({'title': titles[0].extract(),
                          'url': links[0].extract()})

        for item in items:
            full_url = base_url + str(item['url'])
            da.add_ipbasic(item['title'], unescape(full_url))

    def _record_empty_page(self, da):
        """
        Record that a listing page contained no phone sections.

        Nothing is recorded unless the stored 'indiaplaza_count' exceeds
        _PAGE_COUNT_THRESHOLD. Otherwise 'indiaplaza_fails' is incremented
        and, when the new value is positive, 'indiaplaza_flag' is set to
        'FALSE'.

        @param da DataHelper used to read and write the extra vars
        """
        page_count = int(da.get_extra_vars('indiaplaza_count'))
        if page_count <= self._PAGE_COUNT_THRESHOLD:
            return

        fails = int(da.get_extra_vars('indiaplaza_fails')) + 1
        da.set_extra_vars('indiaplaza_fails', str(fails), '')
        if fails > 0:
            da.set_extra_vars('indiaplaza_flag', 'FALSE', '')
            # NOTE(review): this repeats the write made just above with the
            # same value; kept because set_extra_vars is opaque from here -
            # confirm it is redundant and drop it.
            da.set_extra_vars('indiaplaza_fails', str(fails), '')


SPIDER = indiaplaza_spider()