# Rev 139 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 11-May-2010

@author: gaurav
'''
# Keeps the ``with`` block in parse() valid on Python 2.5; a no-op on later versions.
from __future__ import with_statement

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
#from datastore.DataAccessor import add_new_phone
from datastore import DataAccessor
from datastore.DataAccessor import DataHelper


class scrapy_price(BaseSpider):
    '''Spider that collects product titles and links from listing pages.

    The pages to crawl are the ``v_url`` values of the records returned by
    ``DataHelper.get_all_vendors()``. Each title/link pair found is passed to
    ``DataHelper.add_new_phone()`` and also written to a text file.
    '''

    def __init__(self):
        '''Set the spider's domain name and load its start URLs.'''
        self.domain_name = "infibeam.com"
        da = DataHelper()
        # Build an instance-level list rather than appending to the inherited
        # ``start_urls`` object: this class never assigns that attribute, so
        # appending in place would modify a list owned by the base class.
        urls = list(self.start_urls)
        for pitem in da.get_all_vendors():
            urls.append(pitem.v_url.strip())
        self.start_urls = urls

    def start_requests(self):
        '''Return one ``Request`` per start URL, each with a Referer header.

        Every request uses ``self.parse`` as its callback. The Referer header
        is only set when the request does not already carry one.
        '''
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=url1, callback=self.parse)
            request.headers.setdefault("Referer", "www.google.com/search")
            listreq.append(request)
        return listreq

    def parse(self, response):
        '''Extract title/link pairs from a listing page and persist them.

        For every ``<p class="box">`` element, the ``name`` attribute of the
        first ``<a class="nocol">`` is taken as the title and the ``href`` of
        the third ``<a>`` as the link. Each pair is sent to
        ``DataHelper.add_new_phone()`` with an absolute URL, then all pairs
        are written to the pricelinks text file.

        Raises:
            IndexError: if a box lacks the expected title or link anchor.
        '''
        hxs = HtmlXPathSelector(response)
        items = []
        for site in hxs.select('//p[@class="box"]'):
            title = site.select('.//a[@class="nocol"]/@name')[0].extract()
            link = site.select(".//a[3][@href]/@href")[0].extract()
            items.append({'title': title, 'link': link})

        base_url = "http://www.infibeam.com"
        da = DataHelper()
        da.set_all_crawled(False)
        for entry in items:
            url = str(entry['link'])
            # Links that do not already contain the site root are treated as
            # site-relative and get the root prepended.
            if base_url not in url:
                url = base_url + url
            # Single-argument parenthesised form prints the same on Python 2
            # and also parses on Python 3.
            print("name")
            print(entry['title'])
            print("site")
            print(url)
            da.add_new_phone(url, entry['title'], "vendor")

        # ``with`` guarantees the file is closed even if a write raises.
        # NOTE(review): the file receives the link as extracted (possibly
        # relative), not the absolute URL stored above; confirm that is intended.
        with open('/home/gaurav/twopassscrapy/pricelinks.txt', 'w') as f:
            for entry in items:
                f.write(entry['title'])
                f.write("\n")
                f.write(entry['link'])
                f.write("\n")


SPIDER = scrapy_price()