'''
Created on 06-Jun-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
import urllib


class babuchak1(BaseSpider):
    """Documentation for class babuchak1

    This spider collects the URLs of the individual vendors
    and stores them in the table datastore_datadefinition_babuchak_urls.
    """

    def __init__(self):
        """Documentation for the constructor

        initialize_table is called so that all the tables are known in
        the scope of this class.
        The start URL also needs to be fed to the spider through
        start_urls.append.
        domain_name is the name by which this spider is known outside,
        so it is used as the argument when calling this spider.
        """
        initialize_table()
        #BABUCHAK_DOMAINNAME = "babuchak"
        BABUCHAK_DOMAINNAME = get_code_word("BABUCHAK_DOMAINNAME")
        self.domain_name = BABUCHAK_DOMAINNAME
        #BABUCHAK_URL = "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108"
        BABUCHAK_URL = get_code_word("BABUCHAK_URL")
        self.start_urls.append(BABUCHAK_URL)

    def start_requests(self):
        """Documentation for method start_requests

        Sets various properties of the requests to be made, such as the
        Referer header.
        The supplier entry also needs to be added to the table
        datastore_datadefinition_suppliers.

        @return: a list of well-formed requests that the spider will crawl;
        their responses are handed to parse()
        """
        # add an entry for the supplier, i.e. its name and site
        #BABUCHAK_HOMEPAGE = "http://www.shopping.babuchak.com"
        BABUCHAK_HOMEPAGE = get_code_word("BABUCHAK_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, BABUCHAK_HOMEPAGE)
        listreq = []
        # a referer has to be set for each request
        #BABUCHAK_REFERER = "www.google.com/search"
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        """Documentation for method parse

        @param response: the response for an individual request

        Using XPaths, the needed information is extracted from the
        response and added to the database.
        XPath1 = gives the section for each individual vendor
        XPath2 = gives the number of pages for each individual vendor
        XPath3 = gives the URL for each individual vendor
        URL1   = used to build the full URL for each individual vendor
        """
        da = DataHelper()
        #BABUCHAK_URL1 = "http://www.shopping.babuchak.com/visitourstores.php"
        BABUCHAK_URL1 = get_code_word("BABUCHAK_URL1")
        hxs = HtmlXPathSelector(response)
        #BABUCHAK_XPATH1 = '//td[@class="mod-category-header"]'
        BABUCHAK_XPATH1 = get_code_word("BABUCHAK_XPATH1")
        info = hxs.select(BABUCHAK_XPATH1)
        for i in info:
            #BABUCHAK_XPATH2 = './/text()'
            BABUCHAK_XPATH2 = get_code_word("BABUCHAK_XPATH2")
            #BABUCHAK_XPATH3 = './/a/@href'
            BABUCHAK_XPATH3 = get_code_word("BABUCHAK_XPATH3")
            no_pages = i.select(BABUCHAK_XPATH2)[2].extract()
            url = i.select(BABUCHAK_XPATH3)[0].extract()
            # build the absolute vendor URL from the relative href
            url = BABUCHAK_URL1 + url
            # clean up the page count: unquote, trim whitespace and the
            # enclosing characters, then convert to an integer
            no_pages = urllib.unquote(no_pages)
            no_pages = no_pages.strip()
            no_pages = no_pages[1:len(no_pages) - 1]
            no_pages = int(no_pages)
            da.add_babuchakurl(url, no_pages)


SPIDER = babuchak1()
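
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of this file): the spider above relies
# on helpers star-imported from the project's own datastore package
# (DataAccessor / DataCodeAccessor), whose source is not shown here. Judging
# purely from the calls made above, their interface is assumed to look
# roughly like the stubs below; names, signatures and docstrings are
# assumptions, not the project's actual implementation, so the sketch is
# kept commented out.
#
# def initialize_table():
#     """Make the datastore_datadefinition_* tables known to the caller."""
#
# def get_code_word(key):
#     """Return the configured value (URL, XPath, referer, ...) stored
#     under `key`, e.g. get_code_word("BABUCHAK_URL")."""
#
# class DataHelper(object):
#     def add_supplier(self, domain_name, homepage):
#         """Insert the supplier's name and homepage into
#         datastore_datadefinition_suppliers."""
#
#     def add_babuchakurl(self, url, no_pages):
#         """Insert one vendor URL and its page count into
#         datastore_datadefinition_babuchak_urls."""
# ---------------------------------------------------------------------------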