Subversion Repositories SmartDukaan

Rev

Rev 183 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 20-May-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response

from datastore import DataAccessor
from datastore.DataAccessor import DataHelper
from html2text.unescaping import *

class mobilestore_spider(BaseSpider):
    '''
    Spider that downloads the sitemap of themobilestore.in and records the
    mobile-phone page URLs listed in it.

    Every <loc> entry of the sitemap is inspected; a URL is stored through
    DataHelper.add_mobstoreurl() when its third-from-last path segment is
    one of the names in the check list (currently only "Mobile_Phones").
    '''

    def __init__(self):
        '''
        Set the spider's domain name and its single start URL (the sitemap).
        '''
        MOBILESTORE_DOMAINNAME = "mobilestore"
        self.domain_name = MOBILESTORE_DOMAINNAME
        MOBILESTORE_URL = "http://www.themobilestore.in/sitemap.xml"
        # Bind a fresh list on the instance rather than appending to the
        # inherited one. The base initialiser is not called here, so
        # self.start_urls would resolve to a list reached through the class;
        # appending to it would leak this URL into every other spider that
        # shares that list (and pull their URLs into start_requests below).
        self.start_urls = [MOBILESTORE_URL]

    def start_requests(self):
        '''
        Register the supplier and build the initial requests.

        Returns a list with one Request per entry of self.start_urls, each
        carrying a Referer header and using self.parse as its callback.
        '''
        # adding entry for the supplier i.e. its name and site
        MOBILESTORE_HOMEPAGE = "www.themobilestore.in"
        da = DataHelper()
        da.add_supplier(self.domain_name, MOBILESTORE_HOMEPAGE)

        # for each request a referer has to be set
        MOBILESTORE_REFERER = "www.google.com/search"
        listreq = []
        for start_url in self.start_urls:
            request = Request(url=str(start_url), callback=self.parse)
            request.headers.setdefault("Referer", MOBILESTORE_REFERER)
            listreq.append(request)
        return listreq

    @staticmethod
    def _category_segment(site):
        '''
        Return the third-from-last '/'-separated segment of site.

        For "http://host/Mobile_Phones/Brand/product" this is
        "Mobile_Phones". An empty string is returned when the URL does not
        contain enough slashes, or when the slash opening the segment is the
        very first character of the string.
        '''
        last = site.rfind('/')
        if last == -1:
            return ""
        # Each search stops one character short of the slash found before,
        # so a slash immediately preceding it is not considered.
        second = site.rfind('/', 0, last - 1)
        if second == -1:
            return ""
        third = site.rfind('/', 0, second - 1)
        if third > 0:
            return site[third + 1:second]
        return ""

    def parse(self, response):
        '''
        Extract the URLs from the sitemap response and store the valid ones.

        Nothing is returned; matching URLs are passed to
        DataHelper.add_mobstoreurl().
        '''
        da = DataHelper()
        hxs = HtmlXPathSelector(response)
        MOBILESTORE_XPATH1 = '//url/loc/text()'
        # elements in the check list are specific to this site for
        # determining valid URLs
        MOBILESTORE_CHKLIST1 = ["Mobile_Phones"]
        for loc in hxs.select(MOBILESTORE_XPATH1):
            site = unescape(loc.extract())
            # adding valid urls to the DB from the site-map
            if self._category_segment(site) in MOBILESTORE_CHKLIST1:
                da.add_mobstoreurl(site)
# Module-level spider instance, created when this module is imported.
# NOTE(review): presumably SPIDER is the name the Scrapy version in use
# looks up to discover the spider -- confirm against the project's Scrapy.
SPIDER = mobilestore_spider()