Subversion Repositories SmartDukaan

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
219 ashish 1
'''
2
Created on 06-Jun-2010
3
 
4
@author: gaurav
5
'''
6
 
7
from scrapy.spider import BaseSpider
8
from scrapy.selector import HtmlXPathSelector
9
from scrapy.http import Request
10
 
11
from demo.items import DemoItem
12
from scrapy.contrib.spidermiddleware import referer
13
from scrapy.http.headers import Headers
14
from scrapy.http.request.form import FormRequest
15
from scrapy.log import msg
16
from scrapy.http.response import Response
17
from datastore.DataAccessor import *
18
from datastore.DataCodeAccessor import *
19
 
20
from html2text import *
21
from babel.messages.pofile import unescape
22
import urllib
23
 
24
class babuchak1(BaseSpider):
    """Spider for shopping.babuchak.com category listing pages.

    Walks the configured start URL, and for every category header cell on
    the listing page records the category URL and its page count in the
    datastore via DataHelper.add_babuchakurl().

    All site-specific configuration (domain name, URLs, XPath expressions,
    referer) is resolved at runtime through get_code_word(), which reads
    the code table loaded by initialize_table().
    """

    def __init__(self):
        # Must run before any get_code_word() lookup below.
        initialize_table()
        # e.g. "babuchak"
        self.domain_name = get_code_word("BABUCHAK_DOMAINNAME")
        # e.g. "http://www.shopping.babuchak.com/visitourstores.php?view=productListPage&category=108"
        #
        # BUG FIX: the original did self.start_urls.append(...), which —
        # because start_urls resolves to a class-level list on BaseSpider —
        # mutates state shared by every spider instance.  Assigning a fresh
        # instance-level list keeps the same single start URL without the
        # shared-mutable-class-attribute pitfall.
        self.start_urls = [get_code_word("BABUCHAK_URL")]

    def start_requests(self):
        """Register this supplier in the datastore, then build the initial
        requests, each carrying the configured Referer header.

        Returns:
            list of scrapy Request objects, one per start URL, with
            callback=self.parse.
        """
        # e.g. "http://www.shopping.babuchak.com"
        homepage = get_code_word("BABUCHAK_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        # The site expects a referer on each request,
        # e.g. "www.google.com/search"
        referer = get_code_word("BABUCHAK_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            request.headers.setdefault("Referer", referer)
            requests.append(request)
        return requests

    def parse(self, response):
        """Parse one category listing page.

        For each category header cell, extracts the (relative) category
        link and its page count, resolves the link against the configured
        base URL, and stores the pair via DataHelper.add_babuchakurl().
        """
        da = DataHelper()
        # Base used to absolutize the relative hrefs,
        # e.g. "http://www.shopping.babuchak.com/visitourstores.php"
        base_url = get_code_word("BABUCHAK_URL1")
        hxs = HtmlXPathSelector(response)
        # Hoisted out of the loop — these code words are loop-invariant.
        xpath_header = get_code_word("BABUCHAK_XPATH1")  # '//td[@class="mod-category-header"]'
        xpath_text = get_code_word("BABUCHAK_XPATH2")    # './/text()'
        xpath_href = get_code_word("BABUCHAK_XPATH3")    # './/a/@href'
        for cell in hxs.select(xpath_header):
            # Third text node of the cell holds the page count,
            # first href holds the category link.
            no_pages = cell.select(xpath_text)[2].extract()
            url = base_url + cell.select(xpath_href)[0].extract()
            # Count arrives URL-quoted and parenthesized, e.g. "(12)":
            # unquote, trim whitespace, drop the surrounding parentheses,
            # then convert to int.
            no_pages = urllib.unquote(no_pages).strip()
            no_pages = int(no_pages[1:-1])
            da.add_babuchakurl(url, no_pages)
78
 
79
# Old-style Scrapy (0.x) spider registration: the framework discovers the
# spider through this module-level SPIDER instance.
# NOTE(review): instantiation runs at import time and touches the datastore
# (initialize_table()/get_code_word() in __init__) — confirm that side
# effect is intended on module load.
SPIDER = babuchak1()