# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 233 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 09-Jun-2010

@author: gaurav
'''

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request

from demo.items import DemoItem
from scrapy.contrib.spidermiddleware import referer
from scrapy.http.headers import Headers
from scrapy.http.request.form import FormRequest
from scrapy.log import msg
from scrapy.http.response import Response
from datastore.DataAccessor import *
from datastore.DataCodeAccessor import *

from html2text.unescaping import *
import urllib

class mobilestore_spider0(BaseSpider):
    """
    Spider that collects name, price and availability of individual phones.

    Every crawled product page that belongs to the "Mobile Phones>" category
    is stored through DataHelper.add_new_mobstorephone_new (per the original
    notes this is table datastore_datadefinition_themobilestorephones_new).
    """

    # When the stored 'mobilestore_count' exceeds this value, only the most
    # recent product IDs are crawled instead of the whole range.
    _FULL_CRAWL_LIMIT = 800
    # How many product IDs, counted back from 'mobilestore_count', are crawled
    # in that case.
    _RECENT_WINDOW = 50
    # Once 'mobilestore_fails' grows beyond this value, 'mobilestore_flag' is
    # set to 'FALSE'.
    _MAX_FAILS = 40
    # Code words holding the XPaths for the phone name, in fallback order.
    _NAME_XPATH_CODES = (
        "MOBILESTORE_XPATH3",
        "MOBILESTORE_XPATH5",
        "MOBILESTORE_XPATH6",
    )

    def __init__(self):
        """
        Set up the tables, the spider's domain name and its start URLs.

        initialize_table is called to make all the tables known in the scope
        of this class. The domain name (the name under which the spider is
        known outside) is read from code word MOBILESTORE_DOMAINNAME0.

        One start URL is generated per product ID:
        MOBILESTORE_URL1 + <id> + MOBILESTORE_URL2. The IDs run from 1 up to
        (but excluding) the stored 'mobilestore_count'; when that count is
        above 800 only the last 50 IDs below it are used.
        """
        initialize_table()
        self.domain_name = get_code_word("MOBILESTORE_DOMAINNAME0")

        count = int(DataHelper().get_extra_vars('mobilestore_count'))
        first_id = 1
        if count > self._FULL_CRAWL_LIMIT:
            first_id = count - self._RECENT_WINDOW

        # The code words do not depend on the product ID, so look them up once.
        url_prefix = get_code_word("MOBILESTORE_URL1")
        url_suffix = get_code_word("MOBILESTORE_URL2")

        # Assign a fresh list instead of appending to the inherited
        # start_urls: the base constructor is not called here, so the
        # inherited list is a class attribute shared by every spider
        # instance, and appending to it would leak URLs between them.
        self.start_urls = [
            url_prefix + str(product_id) + url_suffix
            for product_id in range(first_id, count)
        ]

    def start_requests(self):
        """
        Build the initial requests for all start URLs.

        Registers the supplier (domain name and homepage) through
        DataHelper.add_supplier, then creates one request per start URL with
        the Referer header taken from code word MOBILESTORE_REFERER.

        @return a list of Request objects whose responses are handled by parse
        """
        homepage = get_code_word("MOBILESTORE_HOMEPAGE")
        da = DataHelper()
        da.add_supplier(self.domain_name, homepage)

        referer_url = get_code_word("MOBILESTORE_REFERER")
        requests = []
        for url in self.start_urls:
            request = Request(url=str(url), callback=self.parse)
            # setdefault keeps a Referer that is already present on the request.
            request.headers.setdefault("Referer", referer_url)
            requests.append(request)
        return requests

    def parse(self, response):
        """
        Extract one phone from a product page and store it.

        @param response of an individual product page request

        XPaths are read from code words:
        Xpath8 = category of the item; only "Mobile Phones>" is processed
        Xpath7 = present when the phone can be bought
        Xpath3 = name of the phone
        Xpath5 = name of the phone, if it is not gettable from Xpath3
        Xpath6 = name of the phone, if it is not gettable from Xpath3 and Xpath5
        Xpath4 = price of the phone

        A page without a category element is recorded as a failure (see
        _record_missing_page). Nothing is returned; results are written
        through DataHelper.
        """
        da = DataHelper()
        hxs = HtmlXPathSelector(response)

        category = self._first_text(hxs, get_code_word("MOBILESTORE_XPATH8"))
        if category is None:
            self._record_missing_page(da)
            return
        category = unescape(category)
        print(category)
        if category != "Mobile Phones>":
            return

        # Only the presence of the node matters, not its text.
        buyable = self._first_text(hxs, get_code_word("MOBILESTORE_XPATH7"))
        if buyable is not None:
            availability = "can buy"
        else:
            availability = "can not buy"

        # Take the name from the first XPath that matches anything.
        name = None
        for code in self._NAME_XPATH_CODES:
            name = self._first_text(hxs, get_code_word(code))
            if name is not None:
                break
        if name is None:
            return

        price_text = self._first_text(hxs, get_code_word("MOBILESTORE_XPATH4"))
        if price_text is None:
            return
        try:
            price = int(price_text)
        except ValueError:
            # Price is not a plain integer; the phone is skipped.
            return

        try:
            da.add_new_mobstorephone_new(name, price, price, availability)
        except Exception:
            # Best-effort write: a storage error must not abort the crawl,
            # but it is logged rather than silently dropped.
            msg("%s: could not store phone %r" % (self.domain_name, name))

    def _first_text(self, hxs, xpath):
        """
        Return the stripped text of the first node matching xpath,
        or None when nothing matches.
        """
        nodes = hxs.select(xpath)
        if not nodes:
            return None
        return nodes[0].extract().strip()

    def _record_missing_page(self, da):
        """
        Count a page that has no category element.

        Only active when 'mobilestore_count' is above 800: increments
        'mobilestore_fails' and, once it exceeds 40, sets 'mobilestore_flag'
        to 'FALSE'.
        """
        count = int(da.get_extra_vars('mobilestore_count'))
        if count <= self._FULL_CRAWL_LIMIT:
            return
        fails = int(da.get_extra_vars('mobilestore_fails')) + 1
        da.set_extra_vars('mobilestore_fails', str(fails), '')
        if fails > self._MAX_FAILS:
            da.set_extra_vars('mobilestore_flag', 'FALSE', '')
                
# Module-level spider instance. Creating it runs __init__ at import time,
# which calls initialize_table() and reads code words and the
# 'mobilestore_count' extra var through the datastore helpers.
# NOTE(review): presumably Scrapy's spider manager discovers the spider
# through this SPIDER name -- confirm against the Scrapy version in use.
SPIDER = mobilestore_spider0()