'''
Created on 06-Jun-2010

@author: gaurav
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
from babel.messages.pofile import unescape
import urllib


class babuchak1(BaseSpider):

    def __init__(self):
        # get_code_word() (from datastore.DataCodeAccessor) looks up the
        # configured string for a key; the original literal values are kept
        # in the comments above each call.
        initialize_table()
        #BABUCHAK_DOMAINNAME = "babuchak"
        BABUCHAK_DOMAINNAME = get_code_word("BABUCHAK_DOMAINNAME")
        self.domain_name = BABUCHAK_DOMAINNAME
        #BABUCHAK_URL = "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108"
        BABUCHAK_URL = get_code_word("BABUCHAK_URL")
        self.start_urls.append(BABUCHAK_URL)

    def start_requests(self):
        # add an entry for the supplier, i.e. its name and site
        #BABUCHAK_HOMEPAGE = "http://www.shopping.babuchak.com"
        BABUCHAK_HOMEPAGE = get_code_word("BABUCHAK_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, BABUCHAK_HOMEPAGE)

        listreq = []
        # a referer has to be set for each request
        #BABUCHAK_REFERER = "www.google.com/search"
        BABUCHAK_REFERER = get_code_word("BABUCHAK_REFERER")
        for url1 in self.start_urls:
            request = Request(url=str(url1), callback=self.parse)
            request.headers.setdefault("Referer", BABUCHAK_REFERER)
            listreq.append(request)
        return listreq

    def parse(self, response):
        # BABUCHAK_URL1 is needed to build complete URLs from the relative hrefs
        da = DataHelper()
        #BABUCHAK_URL1 = "http://www.shopping.babuchak.com/visitourstores.php"
        BABUCHAK_URL1 = get_code_word("BABUCHAK_URL1")
        hxs = HtmlXPathSelector(response)
        #BABUCHAK_XPATH1 = '//td[@class="mod-category-header"]'
        BABUCHAK_XPATH1 = get_code_word("BABUCHAK_XPATH1")
        info = hxs.select(BABUCHAK_XPATH1)
        for i in info:
            #BABUCHAK_XPATH2 = './/text()'
            BABUCHAK_XPATH2 = get_code_word("BABUCHAK_XPATH2")
            #BABUCHAK_XPATH3 = './/a/@href'
            BABUCHAK_XPATH3 = get_code_word("BABUCHAK_XPATH3")
            no_pages = i.select(BABUCHAK_XPATH2)[2].extract()
            #print i.select(BABUCHAK_XPATH2)[1].extract() + " "
            url = i.select(BABUCHAK_XPATH3)[0].extract()
            url = BABUCHAK_URL1 + url
            # the extracted page count appears to be wrapped in a pair of
            # characters (e.g. "(12)"): unquote it, trim whitespace and drop
            # the enclosing pair before converting to int
            no_pages = urllib.unquote(no_pages)
            no_pages = no_pages.strip()
            no_pages = no_pages[1:len(no_pages) - 1]
            no_pages = int(no_pages)
            #print url + " "
            #print no_pages
            # store the category URL and its page count in the datastore
            da.add_babuchakurl(url, no_pages)


SPIDER = babuchak1()