# Rev 258 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 17-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
import urllib
from xml.dom import INDEX_SIZE_ERR
from html2text.unescaping import *


class indiaplaza_extra(BaseSpider):
    """Spider that collects the detail information of individual phones.

    For every product URL previously stored by the "basic" indiaplaza crawl
    (read through ``DataHelper.get_all_ipbasic``) the product page is
    fetched, name / price / shipping / guarantee texts are extracted with
    XPaths and the result is stored through ``DataHelper.add_ipextra``.

    All site specific values (spider name, referer, XPaths, strings to
    strip from prices) are looked up at runtime with ``get_code_word``.
    """

    def __init__(self):
        """Set the spider name and load the URLs to crawl.

        ``initialize_table`` is called first to make all the tables known
        in the scope of this class.  ``domain_name`` is the name by which
        this spider is known outside, i.e. the argument used for calling
        this spider (e.g. "indiaplaza1").
        """
        # NOTE(review): the base class constructor is not invoked, exactly
        # as in the original code -- confirm this is intended for the
        # Scrapy version in use.
        initialize_table()
        self.domain_name = get_code_word("INDIAPLAZA_DOMAINNAME1")

        # Build a fresh, per-instance list.  The previous code appended to
        # the inherited ``start_urls`` attribute without ever assigning it,
        # which mutates the list shared through the class and leaks these
        # URLs into every other spider using the same list.
        self.start_urls = [
            pitem.v_site.strip() for pitem in DataHelper().get_all_ipbasic()
        ]

    def start_requests(self):
        """Build the initial requests, one per start URL.

        Each request gets a ``Referer`` header (e.g.
        "www.google.com/search") taken from the ``INDIAPLAZA_REFERER`` code
        word, unless the request already carries one.

        @return a list of well formed requests which will be crawled by the
        spider; the responses are handed to ``parse``
        """
        referer_value = get_code_word("INDIAPLAZA_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    @staticmethod
    def _first_text(hxs, xpath, fallback_xpath=None):
        """Return the extracted first match of xpath, else of fallback_xpath.

        Raises IndexError when neither expression matches anything.
        """
        try:
            return hxs.select(xpath)[0].extract()
        except IndexError:
            if fallback_xpath is None:
                raise
            return hxs.select(fallback_xpath)[0].extract()

    @staticmethod
    def _strip_tokens(text, tokens):
        """Return text with every occurrence of each token removed.

        Removal of one token is repeated until it no longer occurs, so
        occurrences that only appear after a previous removal are dropped
        as well.
        """
        for token in tokens:
            if not token:
                # An empty token (e.g. from a trailing ';' in the configured
                # list) is "found" in every string and removing it changes
                # nothing, so the loop below would never terminate.
                continue
            while token in text:
                text = text.replace(token, "")
        return text

    def parse(self, response):
        """Extract the details of one phone from its page and store them.

        @param response of an individual product page request

        Code words used (the example values are the ones noted in the
        original source):
        INDIAPLAZA_REMOVELIST = ';' separated strings removed from prices so
            that they become integers, e.g. "Rs.;Rs;,;-;/"
        INDIAPLAZA_XPATH4  = name,
            e.g. './/div[@class="finDetHdr"]/h1/text()'
        INDIAPLAZA_XPATH5  = quoted price,
            e.g. './/div[@class="priceArea"]/span[1]/text()'
        INDIAPLAZA_XPATH6  = ship price,
            e.g. './/div[@class="priceArea"]/div[@class="row"][2]/text()'
        INDIAPLAZA_XPATH7  = ship price, if not gettable from XPATH6,
            e.g. './/div[@class="priceArea"]/div[@class="row"][2]/span/text()'
        INDIAPLAZA_XPATH8  = guarantee info,
            e.g. './/div[@class="priceArea"]/div[@class="row"][3]/text()'
        INDIAPLAZA_XPATH9  = guarantee info, if not gettable from XPATH8,
            e.g. './/div[@class="priceArea"]/div[@class="row"][4]/text()'
        INDIAPLAZA_XPATH10 = ship info,
            e.g. './/div[@class="priceArea"]/div[@class="row"][1]/text()'
        INDIAPLAZA_VAR1    = ship price text meaning no shipping charge,
            e.g. "Free shipping"

        Raises IndexError when a required XPath matches nothing and
        ValueError when a cleaned price is not an integer.
        """
        hxs = HtmlXPathSelector(response)

        remove_tokens = str(get_code_word("INDIAPLAZA_REMOVELIST")).split(';')

        name_xpath = get_code_word("INDIAPLAZA_XPATH4")
        price_xpath = get_code_word("INDIAPLAZA_XPATH5")
        ship_price_xpath = get_code_word("INDIAPLAZA_XPATH6")
        ship_price_alt_xpath = get_code_word("INDIAPLAZA_XPATH7")
        guarantee_xpath = get_code_word("INDIAPLAZA_XPATH8")
        guarantee_alt_xpath = get_code_word("INDIAPLAZA_XPATH9")
        ship_info_xpath = get_code_word("INDIAPLAZA_XPATH10")

        name = unescape(self._first_text(hxs, name_xpath))
        price = self._first_text(hxs, price_xpath)
        ship_price = self._first_text(
            hxs, ship_price_xpath, ship_price_alt_xpath)
        guarantee_info = self._first_text(
            hxs, guarantee_xpath, guarantee_alt_xpath)
        ship_info = self._first_text(hxs, ship_info_xpath)

        # NOTE: the original code called urllib.unquote() on each of the
        # five values here but discarded the results, so the calls had no
        # effect.  They are dropped; the stored values are unchanged.

        free_shipping_text = get_code_word("INDIAPLAZA_VAR1")
        if ship_price == free_shipping_text:
            ship_price = "0"
        else:
            ship_price = self._strip_tokens(ship_price, remove_tokens)
        price = self._strip_tokens(price, remove_tokens)

        shown_pr = int(price.strip())
        final_pr = shown_pr + int(ship_price.strip())

        DataHelper().add_ipextra(
            name.strip(),
            shown_pr,
            final_pr,
            guarantee_info.strip(),
            ship_info.strip(),
        )


SPIDER = indiaplaza_extra()