# Rev 253 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 06-Jun-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
import urllib


def _parse_remove_list(raw):
    """
    Split the ';'-separated remove list into its non-empty tokens.

    Empty tokens (from an empty setting, a doubled ';;' or a trailing ';')
    are dropped: searching a string for '' always succeeds, so an empty
    token would make the removal loop in _clean_price spin forever.

    @param raw the remove list as one string, tokens separated by ';'
    @return list of non-empty token strings (possibly empty)
    """
    return [token for token in raw.split(';') if token]


def _clean_price(raw, remove_tokens):
    """
    Strip whitespace and every remove-list token from a scraped price.

    @param raw price text as extracted from the page
    @param remove_tokens non-empty substrings to delete from the text
    @return the cleaned text, stripped of surrounding whitespace
    """
    cleaned = raw.strip()
    for token in remove_tokens:
        # Repeat until gone: deleting one occurrence can join the
        # surrounding characters into a new occurrence of the token.
        while token in cleaned:
            cleaned = cleaned.replace(token, "")
    return cleaned.strip()


def _read_prices(hxs, primary_xpath, fallback_xpath, remove_tokens,
                 shown_index, final_index):
    """
    Extract the shown and final price from a page as integers.

    Both prices are taken from the matches of primary_xpath at the given
    indexes. If either index is missing, the first match of fallback_xpath
    is used for both prices.

    @param hxs selector built from the response
    @param primary_xpath XPath whose matches hold the prices
    @param fallback_xpath XPath used when primary_xpath has too few matches
    @param remove_tokens non-empty substrings to delete from the price text
    @param shown_index index of the shown price among the primary matches
    @param final_index index of the final price among the primary matches
    @return tuple (shown_price, final_price) of ints
    Raises IndexError if the fallback XPath matches nothing, and ValueError
    if a cleaned price is not an integer literal.
    """
    try:
        matches = hxs.select(primary_xpath)
        shown = matches[shown_index].extract()
        final = matches[final_index].extract()
    except IndexError:
        # Fewer matches than expected: fall back to the single-price layout.
        shown = final = hxs.select(fallback_xpath)[0].extract()

    shown = _clean_price(shown, remove_tokens)
    final = _clean_price(final, remove_tokens)

    dot = shown.find('.')
    if dot != -1:
        # Drop the fractional part so int() accepts the text.
        shown = shown[:dot]
        # NOTE(review): the final price is overwritten with the shown price
        # here, as in the original code, so any discount is lost whenever
        # the shown price has a decimal part. Confirm whether the final
        # price should be truncated independently instead.
        final = shown
    return int(shown), int(final)


class babuchak3(BaseSpider):
    """
    Documentation for class babuchak3
    This spider collects the name and prices of the individual phones and
    stores them through DataHelper.add_babuchakphone (per the original
    author, in table datastore_datadefinition_babuchak_phones).
    """

    def __init__(self):
        """
        Documentation for constructor
        initialize_table is called to make all the tables known in the
        scope of this class.
        The start urls are fed to the spider by appending every url
        returned by DataHelper.get_allbabuchakphoneurls to start_urls.
        domain_name is the name by which this spider is known outside, so
        it is used as the argument for calling this spider.
        """
        initialize_table()
        # Example value noted by the original author: "babuchak2"
        BABUCHAK_DOMAINNAME2 = get_code_word("BABUCHAK_DOMAINNAME2")
        self.domain_name = BABUCHAK_DOMAINNAME2
        da = DataHelper()
        # NOTE(review): this appends to whatever list start_urls resolves
        # to; if that is a class-level list inherited from BaseSpider it is
        # shared with other spiders -- confirm.
        for item in da.get_allbabuchakphoneurls():
            self.start_urls.append(item.url)

    def start_requests(self):
        """
        Documentation for method start_requests
        Builds one request per start url, with parse as the callback and
        the Referer header defaulted to the configured referer.
        @return a list of well formed requests which will be crawled by
        the spider
        """
        # Example value noted by the original author: "www.google.com/search"
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            # setdefault leaves an already present Referer untouched.
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        """
        Documentation for method parse
        @param response of an individual phone page request
        Using XPaths the needed information is extracted out of the
        response and added to the database.
        Xpath5 = gives the name of the individual phone
        Xpath6 = gives the quoted prices of the individual phone
        Xpath7 = gives the price when Xpath6 has too few matches
        Removelist = ';'-separated substrings (for eg ',' or 'Rs') removed
        from the prices so that they can be converted to integers
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        # Example values noted by the original author:
        #   BABUCHAK_XPATH5 = '//td[@class="text-header"]/text()'
        #   BABUCHAK_XPATH6 = '//td[@class="xl63"]//strong/span/text()'
        #   BABUCHAK_XPATH7 = '//td[@class="mod-item-body-title"]/b/text()'
        #   BABUCHAK_REMOVELIST = "Rs.;Rs;,;-;/"
        BABUCHAK_XPATH5 = get_code_word("BABUCHAK_XPATH5")
        BABUCHAK_XPATH6 = get_code_word("BABUCHAK_XPATH6")
        BABUCHAK_XPATH7 = get_code_word("BABUCHAK_XPATH7")
        BABUCHAK_REMOVELIST = _parse_remove_list(
            str(get_code_word("BABUCHAK_REMOVELIST")))

        name = hxs.select(BABUCHAK_XPATH5)[0].extract().strip()

        shown_price, final_price = _read_prices(
            hxs, BABUCHAK_XPATH6, BABUCHAK_XPATH7, BABUCHAK_REMOVELIST, 0, 2)

        # There were some phones on which discount was there so it had
        # marked price, quoted price and final price; in that case the
        # quoted price (second match) is stored as the shown price.
        if shown_price > final_price:
            shown_price, final_price = _read_prices(
                hxs, BABUCHAK_XPATH6, BABUCHAK_XPATH7, BABUCHAK_REMOVELIST,
                1, 2)

        da.add_babuchakphone(name, shown_price, final_price)


SPIDER = babuchak3()