Subversion Repositories SmartDukaan

Rev

Rev 138 | Rev 169 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 138 Rev 152
Line 1... Line 1...
'''
Created on 14-May-2010

@author: gaurav
'''
-
 
6
 
-
 
7
 
-
 
8
from scrapy.spider import BaseSpider
-
 
9
from scrapy.selector import HtmlXPathSelector
-
 
10
from scrapy.http import Request
-
 
11
 
-
 
12
from demo.items import DemoItem
-
 
13
from scrapy.contrib.spidermiddleware import referer
-
 
14
from scrapy.http.headers import Headers
-
 
15
from scrapy.http.request.form import FormRequest
-
 
16
from scrapy.log import msg
-
 
17
from scrapy.http.response import Response
-
 
18
 
-
 
19
from datastore import DataAccessor
-
 
20
from datastore.DataAccessor import DataHelper
-
 
21
 
-
 
22
 
-
 
23
class vendor_links(BaseSpider):
-
 
24
    domain_name = "univercellvendors"
-
 
25
    start_urls = [
-
 
26
          "http://www.univercell.in/mobiles/populateStore.action"
-
 
27
    ]
-
 
28
    
-
 
29
    def start_requests(self):
-
 
30
        request = Request(url = "http://www.univercell.in/mobiles/populateStore.action", callback=self.parse)
-
 
31
        request.headers.setdefault("Referer", "www.google.com/search")
-
 
32
        return [request]
-
 
33
    
-
 
34
    def parse(self, response):
-
 
35
        str1 = "http://www.univercell.in"
-
 
36
        hxs = HtmlXPathSelector(response)
-
 
37
        vendor_info = hxs.select('//div[@id="mobilesTab"]/table/tr[1]/td/table/tr')
-
 
38
        print len(vendor_info)
-
 
39
        items = []
-
 
40
        for i in vendor_info:
-
 
41
            item = {}
-
 
42
            item['name'] = i.select('.//a/text()')[0].extract()
-
 
43
            temp = i.select('.//a/@href')[0].extract()
-
 
44
            a = str(temp).find(";")
-
 
45
            b = str(temp).find("?")
-
 
46
            temp1 = str(temp)[a:b]
-
 
47
            temp2 = str(temp).replace(temp1,"")
-
 
48
            item['site'] =  str(temp2).replace("populate","rePopulate")
-
 
49
            items.append(item)
-
 
50
            
-
 
51
        da = DataHelper()
-
 
52
        for item in items:
-
 
53
            str2 = str1 + str(item['site'])
-
 
54
            da.add_univervendor(item['name'].strip(), str2)
-
 
55
            print item['name']
-
 
56
            print str2
-
 
57
            
-
 
58
# Module-level spider instance — legacy Scrapy registration style: early
# Scrapy versions discovered spiders via a module-global named SPIDER.
# NOTE(review): confirm against the Scrapy version this project pins
# before modernizing.
SPIDER = vendor_links()
6
59