# Rev 140 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 11-May-2010

@author: gaurav

Scrapy spider that requests every phone URL returned by the datastore,
reads the price shown on each page and records it through DataHelper.
'''
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore import DataAccessor
from datastore.DataAccessor import DataHelper


def _to_price(text):
    '''Convert scraped price text such as u" 12,499 " to an int.

    Returns None when the text cannot be read as a whole number (for
    example non-ASCII characters under Python 2, or non-digit content).
    '''
    try:
        # UnicodeEncodeError is a subclass of ValueError, so one handler
        # covers both the str() conversion and the int() parse.
        return int(str(text.strip()).replace(",", "").strip())
    except ValueError:
        return None


class scrapy_price1(BaseSpider):
    '''Spider that collects the current price for every stored phone URL.'''

    def __init__(self):
        '''Set the spider's domain name and load start URLs from the datastore.'''
        self.domain_name = "price_collector"
        da = DataHelper()
        # Build a per-instance list instead of appending to the inherited
        # one: the original appended to a list it never assigned, i.e. the
        # list shared through the class, so every instantiation would have
        # grown it for all spiders.
        urls = list(getattr(self, "start_urls", []))
        urls.extend(pitem.url.strip() for pitem in da.get_all_phones())
        self.start_urls = urls

    def start_requests(self):
        '''Return one Request per start URL, each carrying a Referer header.

        Every request is routed to ``self.parse``.
        '''
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=url1, callback=self.parse)
            request.headers.setdefault("Referer", "www.google.com/search")
            listreq.append(request)
        return listreq

    def parse(self, response):
        '''Extract the price from ``response`` and store it in the datastore.

        The page is marked as crawled and its price saved only when a price
        could be found and parsed; otherwise a message is logged and the
        datastore is left untouched. Returns None in every case.
        '''
        hxs1 = HtmlXPathSelector(response)
        print("this is parse1")
        temp = hxs1.select('//span[@class="infiPrice amount"]/text()').extract()
        print("temp")

        # The XPath yields an empty list when the page has no price element;
        # indexing it blindly used to raise IndexError.
        if not temp:
            msg("price_collector: no price found on %s" % response.url)
            return

        price = _to_price(temp[0])
        if price is None:
            msg("price_collector: unreadable price %r on %s"
                % (temp[0], response.url))
            return
        print(price)

        da = DataHelper()
        da.set_crawled(response.url, True)
        da.add_price(response.url, price)


SPIDER = scrapy_price1()