Subversion Repositories SmartDukaan

Rev

Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 06-Jun-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response

from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *
from html2text import *
from babel.messages.pofile import unescape
import urllib

class babuchak2(BaseSpider):
    """Spider that walks stored Babuchak listing pages and records the links found.

    At construction time one start URL is generated per listing page from
    the records returned by ``DataHelper.get_allbabuchakurls()``.  Every
    crawled page is scanned with a configured XPath and each match is
    stored through ``DataHelper.add_babuchakphoneurl()``.

    All site-specific strings (spider name, page suffix, referer, base URL
    and XPath) are looked up by key with ``get_code_word()``.
    """

    def __init__(self):
        """Populate ``domain_name`` and ``start_urls`` from the datastore.

        For every stored record one URL is produced per page, counting down
        from ``no_pages`` to 1, by appending the ``BABUCHAK_VAR1`` suffix and
        the page number to the record's ``url``.
        """
        initialize_table()
        # Give this instance its own list.  Nothing here creates
        # ``start_urls`` on the instance, so appending directly would write
        # into whatever list the base class exposes (shared by every spider)
        # or fail if there is none.
        self.start_urls = []
        try:
            # Spider name, e.g. "babuchak1".
            self.domain_name = get_code_word("BABUCHAK_DOMAINNAME1")
            # Query-string fragment that precedes the page number,
            # e.g. "&postPage=".
            page_suffix = get_code_word("BABUCHAK_VAR1")
            for item in DataHelper().get_allbabuchakurls():
                page = item.no_pages
                while page > 0:
                    self.start_urls.append(item.url + page_suffix + str(page))
                    page = page - 1
        finally:
            # Release the datastore session even when a lookup above fails.
            session.close()

    def start_requests(self):
        """Return one request per start URL, each carrying a Referer header.

        Returns:
            A list of ``Request`` objects whose callback is ``self.parse``.
        """
        # Referer sent with every request, e.g. "www.google.com/search".
        referer_value = get_code_word("BABUCHAK_REFERER")
        requests = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", referer_value)
            requests.append(request)
        return requests

    def parse(self, response):
        """Store every link matched on a listing page, prefixed with the base URL.

        Args:
            response: the downloaded listing page.
        """
        try:
            da = DataHelper()
            # Prefix that turns the extracted hrefs into complete URLs,
            # e.g. "http://www.shopping.babuchak.com/visitourstores.php".
            base_url = get_code_word("BABUCHAK_URL2")
            # e.g. '//td[@class="mod-item-body-title"]/a/@href'
            link_xpath = get_code_word("BABUCHAK_XPATH4")
            for link in HtmlXPathSelector(response).select(link_xpath):
                da.add_babuchakphoneurl(base_url + link.extract().strip())
        finally:
            # Always hand the session back, including when extraction or
            # storage raises part-way through the page.
            session.remove()
       
# Module-level spider instance.  Creating it here runs babuchak2.__init__ at
# import time, which includes the datastore lookups that build start_urls.
# NOTE(review): presumably this is the name the crawler framework looks up to
# discover the spider -- confirm against the Scrapy version in use.
SPIDER = babuchak2()