Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 06-Jun-2010

@author: gaurav

Scrapy spider that walks the paginated Babuchak listing pages recorded in the
datastore, pulls one link per listed entry out of every page and stores the
resulting absolute URLs back into the datastore.

All site-specific strings (domain name, query parameter, referer, base URL,
XPath) are looked up by key through ``get_code_word``.
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
from babel.messages.pofile import unescape
import urllib


class babuchak2(BaseSpider):
    """Collect detail-page URLs from every page of each stored listing URL.

    ``initialize_table``, ``get_code_word``, ``DataHelper`` and ``session``
    are not defined in this module; they presumably arrive through the
    ``datastore`` star-imports above (TODO confirm).
    """

    def __init__(self):
        """Build ``start_urls``: one URL per page of every stored listing.

        For each row returned by ``DataHelper.get_allbabuchakurls()`` a URL of
        the form ``row.url + <BABUCHAK_VAR1> + <page number>`` is generated for
        page numbers ``row.no_pages`` down to 1.

        NOTE(review): ``BaseSpider.__init__`` is not invoked, exactly as in
        the original code - confirm that this is intentional.
        """
        initialize_table()
        urls = []
        try:
            # Original author's note: this key resolved to "babuchak1".
            # NOTE(review): the class is named babuchak2 but reads the
            # *_DOMAINNAME1 key - verify this is not a copy/paste slip.
            self.domain_name = get_code_word("BABUCHAK_DOMAINNAME1")

            # Original author's note: this key resolved to "&postPage=".
            page_param = get_code_word("BABUCHAK_VAR1")

            da = DataHelper()
            for item in da.get_allbabuchakurls():
                # An unset page count is treated as "no pages", which is what
                # the former ``while ct > 0`` test did under Python 2.
                page_count = int(item.no_pages or 0)
                # Count downwards so the URL order matches the original loop.
                for page in range(page_count, 0, -1):
                    urls.append(item.url + page_param + str(page))
        finally:
            # Release the datastore session even if a lookup above fails.
            session.close()

        # Bind a fresh list on the instance. Appending to the inherited
        # ``start_urls`` would mutate a class-level list shared with every
        # other spider that inherits it.
        self.start_urls = urls

    def start_requests(self):
        """Return one ``Request`` per start URL, each with a Referer header.

        Every request is routed to ``self.parse``. The Referer value comes
        from the ``BABUCHAK_REFERER`` key (original author's note: it resolved
        to "www.google.com/search"); ``setdefault`` leaves an existing Referer
        header untouched.
        """
        referer_value = get_code_word("BABUCHAK_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """Store the absolute URL of every link selected on a listing page.

        The XPath stored under ``BABUCHAK_XPATH4`` selects the hrefs (original
        author's note: '//td[@class="mod-item-body-title"]/a/@href'). Each
        extracted href is stripped of surrounding whitespace, prefixed with
        the value stored under ``BABUCHAK_URL2`` (original author's note:
        "http://www.shopping.babuchak.com/visitourstores.php") and passed to
        ``DataHelper.add_babuchakphoneurl``.

        Returns None: no follow-up requests or items are produced here.
        """
        try:
            da = DataHelper()
            base_url = get_code_word("BABUCHAK_URL2")
            hxs = HtmlXPathSelector(response)
            link_xpath = get_code_word("BABUCHAK_XPATH4")
            for node in hxs.select(link_xpath):
                href = node.extract().strip()
                da.add_babuchakphoneurl(base_url + href)
        finally:
            # Always hand the session back, even when extraction or the
            # datastore write raises part-way through the page.
            session.remove()


# Module-level spider instance; note that creating it runs __init__ (and its
# datastore queries) at import time. Presumably this is how the Scrapy
# version in use discovers the spider - TODO confirm.
SPIDER = babuchak2()