Rev 221 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 06-Jun-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
import urllib


class babuchak2(BaseSpider):
    """
    Documentation for class babuchak2

    This spider collects the url for the individual phones
    and stores them in table datastore_datadefinition_babuchak_phoneurls.
    """

    def __init__(self):
        """
        Documentation for constructor

        initialize_table is called to make all the tables known in
        the scope of this class.
        The start urls are built here, one per result page of every
        listing returned by DataHelper.get_allbabuchakurls().
        domain_name is the name by which this spider is known outside,
        so it is used as the argument for calling this spider.
        """
        initialize_table()

        # Code word formerly hard-coded as "babuchak1".
        BABUCHAK_DOMAINNAME1 = get_code_word("BABUCHAK_DOMAINNAME1")
        self.domain_name = BABUCHAK_DOMAINNAME1

        # Code word formerly hard-coded as "&postPage=" (page-number
        # query parameter appended to each listing url).
        BABUCHAK_VAR1 = get_code_word("BABUCHAK_VAR1")

        # Use a list owned by this instance. The original appended to
        # self.start_urls without ever creating it, which can only resolve
        # to a list inherited from the base class and therefore shared by
        # every spider in the process.
        self.start_urls = []

        da = DataHelper()
        for item in da.get_allbabuchakurls():
            # Count down from the last page to page 1; nothing is added
            # when no_pages is not greater than zero.
            page = item.no_pages
            while page > 0:
                self.start_urls.append(item.url + BABUCHAK_VAR1 + str(page))
                page = page - 1

    def start_requests(self):
        """
        Documentation for method start_requests

        To set various properties of the request to be made
        like referer, headers and all.

        @return a list of well formed requests, one per entry of
                self.start_urls, each using self.parse as its callback
        """
        # Code word formerly hard-coded as "www.google.com/search".
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")

        listreq = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            # setdefault keeps a Referer that is already present on the
            # request and only fills it in when missing.
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        """
        Documentation for method parse

        @param response of individual requests

        Using XPaths, the link of every phone on the page is extracted
        from the response, prefixed with the site url and added to the
        database through DataHelper.add_babuchakphoneurl().

        BABUCHAK_XPATH4 = gives the url for an individual phone
        BABUCHAK_URL2   = prefix used to build the full url
        """
        da = DataHelper()

        # Code word formerly hard-coded as
        # "http://www.shopping.babuchak.com/visitourstores.php".
        BABUCHAK_URL2 = get_code_word("BABUCHAK_URL2")
        # Code word formerly hard-coded as
        # '//td[@class="mod-item-body-title"]/a/@href'.
        BABUCHAK_XPATH4 = get_code_word("BABUCHAK_XPATH4")

        hxs = HtmlXPathSelector(response)
        for link in hxs.select(BABUCHAK_XPATH4):
            # Strip surrounding whitespace from the extracted href before
            # prefixing it with the site url.
            phone_url = BABUCHAK_URL2 + link.extract().strip()
            da.add_babuchakphoneurl(phone_url)


# Module-level spider instance; note that constructing it runs __init__,
# which reads code words and listing urls from the datastore at import time.
SPIDER = babuchak2()