Subversion Repositories SmartDukaan

Rev

Rev 140 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
'''
Created on 11-May-2010

@author: gaurav
'''

8
from scrapy.spider import BaseSpider
9
from scrapy.selector import HtmlXPathSelector
10
from scrapy.http import Request
11
 
12
from demo.items import DemoItem
13
from scrapy.contrib.spidermiddleware import referer
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
17
from scrapy.http.response import Response
18
from datastore import DataAccessor
19
from datastore.DataAccessor import DataHelper
20
 
21
 
22
class scrapy_price1(BaseSpider):
    """Spider that revisits every phone URL stored in the datastore and
    records the currently listed price for each one.

    Flow:
      * ``__init__``  — loads one start URL per phone from the datastore.
      * ``start_requests`` — issues a Request per URL with a Referer header.
      * ``parse``     — scrapes the price, marks the URL crawled, and
                        writes the price back through ``DataHelper``.
    """

    def __init__(self):
        # BaseSpider declares ``start_urls`` as a *class* attribute; the
        # original appended to it, mutating the list shared by every
        # instance of the spider.  Bind a fresh list on this instance.
        self.start_urls = []
        self.domain_name = "price_collector"

        da = DataHelper()
        for pitem in da.get_all_phones():
            self.start_urls.append(pitem.url.strip())

    def start_requests(self):
        """Build one Request per start URL.

        Each request carries a Referer header (some sites reject requests
        that arrive without one) and routes its response to ``parse``.
        """
        listreq = []
        for url1 in self.start_urls:
            request = Request(url=url1, callback=self.parse)
            request.headers.setdefault("Referer", "www.google.com/search")
            listreq.append(request)
        return listreq

    def parse(self, response):
        """Extract the listed price from the page and persist it.

        The price is expected in ``<span class="infiPrice amount">``;
        thousands separators (commas) are stripped before int conversion.
        """
        hxs1 = HtmlXPathSelector(response)
        temp = hxs1.select('//span[@class="infiPrice amount"]/text()').extract()
        if not temp:
            # Page layout changed or no price listed: skip this URL
            # instead of crashing the whole crawl with an IndexError.
            return
        amnt = str(temp[0].strip()).replace(",", "")
        da = DataHelper()
        da.set_crawled(response.url, True)
        da.add_price(response.url, int(amnt.strip()))
# Legacy Scrapy (pre-0.9) spider registration: the framework discovers a
# module-level SPIDER instance.  NOTE(review): instantiating here runs the
# datastore query at import time — confirm that is intended before upgrading
# to class-based spider discovery.
SPIDER = scrapy_price1()