Rev 267 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 14-May-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataCodeAccessor import *
from datastore.DataAccessor import *
from html2text.unescaping import *


class univercell_price(BaseSpider):
    """
    Documentation for class univercell_price

    This spider collects the information for the individual phones
    and stores them in table datastore_datadefinition_univercell_items.

    All site-specific settings (domain name, referer, XPaths, the list of
    tokens to strip from prices) are read at run time through
    get_code_word(); the commented-out assignments next to each lookup show
    an example of the expected value.
    """

    # 4% additional VAT is added on top of the quoted price.
    VAT_PERCENT = 4

    def __init__(self):
        """
        Documentation for constructor

        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are read from the database and stored in
        self.start_urls.
        self.domain_name is the name by which this spider is known outside,
        so it is used as the argument for calling this spider.
        """
        initialize_table()
        #UNIVERCELL_DOMAINNAME1 = "univercell1"
        self.domain_name = get_code_word("UNIVERCELL_DOMAINNAME1")

        # get urls from the database for crawling.
        # A fresh per-instance list is assigned instead of appending to an
        # inherited one: appending would mutate a list object shared with the
        # base class if start_urls is defined at class level.
        # NOTE(review): confirm against the Scrapy version in use.
        da = DataHelper()
        self.start_urls = [
            pitem.v_site.strip() for pitem in da.get_all_univervendors()
        ]

    def start_requests(self):
        """
        Documentation for method start_requests

        Builds one Request per start url, with self.parse as callback and a
        default "Referer" header taken from the UNIVERCELL_REFERER code word.

        @return a list of well formed requests which will be
        crawled by spider and spider will return the response
        """
        #UNIVERCELL_REFERER = "www.google.com/search"
        referer_value = get_code_word("UNIVERCELL_REFERER")

        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # for each request a referer has to be set
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Documentation for method parse

        Using XPaths the needed information is extracted out of the response
        and added to the database through DataHelper.add_new_univerphone.

        UNIVERCELL_XPATH4 = gives the section for an individual phone
        UNIVERCELL_XPATH5 = gives the name of an individual phone
        UNIVERCELL_XPATH6 = gives the quoted price of an individual phone
        UNIVERCELL_REMOVELIST = ';'-separated tokens (for eg ',' or 'Rs')
        removed from the price so that it can be converted to an integer

        Sections without a title or price node, and phones whose cleaned
        price is not an integer, are skipped instead of aborting the page.

        @param response of an individual request
        """
        da = DataHelper()
        remove_tokens = self._load_remove_tokens()

        hxs = HtmlXPathSelector(response)
        #UNIVERCELL_XPATH4 = '//td[@class="gray-border"]'
        section_xpath = get_code_word("UNIVERCELL_XPATH4")
        # The per-phone XPaths do not change between sections, so they are
        # looked up once rather than once per section.
        #UNIVERCELL_XPATH5 = './/tr[2]/td/a/text()'
        title_xpath = get_code_word("UNIVERCELL_XPATH5")
        #UNIVERCELL_XPATH6 = './/tr[3]/th/label/text()'
        price_xpath = get_code_word("UNIVERCELL_XPATH6")

        for site in hxs.select(section_xpath):
            titles = site.select(title_xpath)
            prices = site.select(price_xpath)
            if not titles or not prices:
                # Not every matched section is a complete phone entry.
                continue

            name = str(titles[0].extract()).strip()
            amnt = self._clean_price(prices[0].extract(), remove_tokens)
            try:
                quoted_price = int(amnt)
            except ValueError:
                msg("univercell_price: skipping %r, unparsable price %r"
                    % (name, amnt))
                continue

            # Floor division keeps the whole-number VAT that "/" yields for
            # ints under Python 2, independent of the interpreter version.
            vatplustax = self.VAT_PERCENT * quoted_price // 100
            final_price = quoted_price + vatplustax

            #adding model-name,quotedprice and finalprice
            da.add_new_univerphone(unescape(name), amnt, final_price)

    @staticmethod
    def _load_remove_tokens():
        """Return the UNIVERCELL_REMOVELIST code word as a list of tokens."""
        #UNIVERCELL_REMOVELIST = ["Rs",",","-","/"]
        #list separated by ';'
        raw_list = str(get_code_word("UNIVERCELL_REMOVELIST"))
        if not raw_list:
            return []
        return raw_list.split(';')

    @staticmethod
    def _clean_price(raw_price, remove_tokens):
        """Remove every token in remove_tokens from raw_price and strip it."""
        cleaned = raw_price
        for token in remove_tokens:
            if not token:
                # An empty token (for eg from a trailing ';') is found in
                # every string and removing it changes nothing, so the loop
                # below would never terminate.
                continue
            # Repeat until stable: removing one occurrence can join the
            # surrounding characters into a new occurrence.
            while token in cleaned:
                cleaned = cleaned.replace(token, "")
        return cleaned.strip()


SPIDER = univercell_price()