Rev 154 | Rev 240 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Created on 14-May-2010

@author: gaurav

Spider that scrapes phone titles and prices from the Univercell vendor
sites registered in the datastore, adds 4% VAT on top of the listed
price, and persists each phone through ``DataHelper``.
"""
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore import DataAccessor
from datastore.DataAccessor import DataHelper


class univercell_price(BaseSpider):
    """Scrape (title, price) pairs from every Univercell vendor page."""

    def __init__(self):
        self.domain_name = "univercellphones"
        # BUG FIX: BaseSpider declares ``start_urls`` as a *class*
        # attribute; appending to it directly would share (and keep
        # accumulating) URLs across every spider instance.  Give this
        # instance its own list before filling it.
        self.start_urls = []
        da = DataHelper()
        for pitem in da.get_all_univervendors():
            self.start_urls.append(pitem.v_site.strip())

    def start_requests(self):
        """Build one Request per start URL, with a Google search referer.

        Returns:
            list of scrapy Request objects, each routed to ``self.parse``.
        """
        requests = []
        for url in self.start_urls:
            request = Request(url=url, callback=self.parse)
            # Some vendor sites serve different content without a referer.
            request.headers.setdefault("Referer", "www.google.com/search")
            requests.append(request)
        return requests

    @staticmethod
    def _clean_amount(price):
        """Strip currency decorations (e.g. "Rs 12,490/-") to a digit string.

        Args:
            price: raw price text extracted from the page.

        Returns:
            The price with ",", "Rs", "/", "-" removed and whitespace
            trimmed, ready for ``int()``.
        """
        amnt = price.replace(",", "")
        amnt = amnt.replace("Rs", "")
        amnt = amnt.replace("/", "")
        amnt = amnt.replace("-", "")
        return amnt.strip()

    def parse(self, response):
        """Extract each phone's title and price, add 4% VAT, and save it.

        Args:
            response: the HTTP response for one vendor listing page.
        """
        hxs = HtmlXPathSelector(response)
        # Each phone sits in its own gray-bordered table cell.
        sites = hxs.select('//td[@class="gray-border"]')
        items = []
        for site in sites:
            item = {}
            item['title'] = site.select('.//tr[2]/td/a/text()')[0].extract()
            item['price'] = site.select('.//tr[3]/th/label/text()')[0].extract()
            items.append(item)

        # Single helper instance (the original created a second, unused one).
        da = DataHelper()
        for i in items:
            title = str(i['title']).strip()
            print(title)
            amnt = self._clean_amount(i['price'])
            # 4% VAT; ``//`` keeps the original Python 2 integer
            # (floor) division explicit under Python 3 as well.
            vatplustax = 4 * int(amnt) // 100
            pr = int(amnt) + vatplustax
            da.add_new_univerphone(title, amnt, pr)


SPIDER = univercell_price()