Subversion Repositories SmartDukaan

Rev

Rev 178 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 178 Rev 226
Line 14... Line 14...
14
from scrapy.http.headers import Headers
14
from scrapy.http.headers import Headers
15
from scrapy.http.request.form import FormRequest
15
from scrapy.http.request.form import FormRequest
16
from scrapy.log import msg
16
from scrapy.log import msg
17
from scrapy.http.response import Response
17
from scrapy.http.response import Response
18
 
18
 
-
 
19
from html2text.unescaping import *
19
from datastore import DataAccessor
20
from datastore.DataAccessor import *
20
from datastore.DataAccessor import DataHelper
21
from datastore.DataCodeAccessor import *
-
 
22
 
21
 
23
 
22
 
24
 
23
class indiaplaza_spider(BaseSpider):
25
class indiaplaza_spider(BaseSpider):
24
   
26
   
25
    def __init__(self):
27
    def __init__(self):
-
 
28
       initialize_table()
-
 
29
       da = DataHelper() 
-
 
30
       #INDIAPLAZA_DOMAINNAME = "indiaplaza"
-
 
31
       INDIAPLAZA_DOMAINNAME = get_code_word("INDIAPLAZA_DOMAINNAME")   
26
       self.domain_name = "indiaplaza"
32
       self.domain_name = INDIAPLAZA_DOMAINNAME
-
 
33
       #INDIAPLAZA_CT = 18
-
 
34
       #INDIAPLAZA_CT = int(get_code_word("INDIAPLAZA_CT"))
-
 
35
       #INDIAPLAZA_NO = 1
-
 
36
       CT = int(da.get_extra_vars('indiaplaza_count'))
27
       ct = 18
37
       NO = 1
28
       no = 1
38
       if CT>18:
29
       while(no<=ct):
39
           NO = CT
30
            str1 = "http://www.indiaplaza.in/mobile-phones-Mobiles-1.htm?PageNo="+str(no)
40
       #INDIAPLAZA_URL = "http://www.indiaplaza.in/mobile-phones-Mobiles-1.htm?PageNo="
-
 
41
       INDIAPLAZA_URL = get_code_word("INDIAPLAZA_URL")
-
 
42
       while(NO<=CT):
-
 
43
            url1 = INDIAPLAZA_URL + str(NO)
31
            self.start_urls.append(str1)
44
            self.start_urls.append(url1)
32
            no=no+1
45
            NO=NO+1
33
    
46
        
34
    def start_requests(self):
47
    def start_requests(self):
-
 
48
        # add an entry for the supplier, i.e. its name and site
-
 
49
        #INDIAPLAZA_HOMEPAGE = "www.indiaplaza.com"
-
 
50
        INDIAPLAZA_HOMEPAGE = get_code_word("INDIAPLAZA_HOMEPAGE")
35
        da = DataHelper()
51
        da = DataHelper()
36
        da.add_supplier(self.domain_name, "www.indiaplaza.com")
52
        da.add_supplier(self.domain_name, INDIAPLAZA_HOMEPAGE)
37
        listreq = []
53
        listreq = []
-
 
54
        
-
 
55
        # a Referer header has to be set on each request
-
 
56
        #INDIAPLAZA_REFERER = "www.google.com/search"
-
 
57
        INDIAPLAZA_REFERER = get_code_word("INDIAPLAZA_REFERER")
38
        for url1 in self.start_urls:
58
        for url1 in self.start_urls:
39
            request = Request(url = str(url1), callback=self.parse,dont_filter=True,)
59
            request = Request(url = str(url1), callback=self.parse, dont_filter=True)
40
            request.headers.setdefault("Referer", "http://www.indiaplaza.in")
60
            request.headers.setdefault("Referer", INDIAPLAZA_REFERER)
41
            listreq.append(request)
61
            listreq.append(request)
42
        return listreq
62
        return listreq
43
    
63
        
44
    def parse(self, response):
64
    def parse(self, response):
-
 
65
        da = DataHelper()
45
        str1 = "http://www.indiaplaza.in"
66
        #INDIAPLAZA_URL1 = "http://www.indiaplaza.in"
-
 
67
        INDIAPLAZA_URL1 = get_code_word("INDIAPLAZA_URL1")
46
        hxs = HtmlXPathSelector(response)
68
        hxs = HtmlXPathSelector(response)
47
        phone_info = hxs.select('//tr/td/table[@id="browsesku"]')
69
        #INDIAPLAZA_XPATH1 = '//tr/td/table[@id="browsesku"]'
-
 
70
        INDIAPLAZA_XPATH1 = get_code_word("INDIAPLAZA_XPATH1")
-
 
71
        phone_info = hxs.select(INDIAPLAZA_XPATH1)
48
        items = []
72
        items = []
49
        #msg(len(phone_info))
73
        #INDIAPLAZA_XPATH2 = './/div[@class="skuimg"]/a/@title'
50
        #msg(response.body)
-
 
51
        for i in phone_info:
74
        INDIAPLAZA_XPATH2 = get_code_word("INDIAPLAZA_XPATH2")
52
            item = {}
-
 
53
            item['title'] = i.select('.//div[@class="skuimg"]/a/@title')[0].extract()
75
        #INDIAPLAZA_XPATH3 = './/div[@class="skuimg"]/a/@href'
54
            item['url'] = i.select('.//div[@class="skuimg"]/a/@href')[0].extract()
76
        INDIAPLAZA_XPATH3 = get_code_word("INDIAPLAZA_XPATH3")
55
            
-
 
56
            
77
        if not phone_info:
57
            items.append(item)
78
            ct = int(da.get_extra_vars('indiaplaza_count'))
58
            
79
            if ct>18:
59
        f = open('/home/gaurav/Desktop/indiaplaza_info.txt', 'a')
80
                fails = int(da.get_extra_vars('indiaplaza_fails'))
60
        da = DataHelper()
-
 
61
        for item in items:
81
                fails = fails+1
62
            str2 = str1 + str(item['url'])
82
                da.set_extra_vars('indiaplaza_fails',str(fails),'')
63
            #amnt = item['price'].replace(",","")
83
                if fails > 0:
64
            #amnt = amnt.replace("Rs.", "")
84
                    da.set_extra_vars('indiaplaza_flag','FALSE','')
65
            #amnt = amnt.strip()
85
                da.set_extra_vars('indiaplaza_fails',str(fails),'')
66
            
86
        else:            
67
            #pr = int(amnt) + vatplustax
87
            for i in phone_info:
68
            da.add_ipbasic(item['title'],str2)    
88
                item = {}
69
            print item['title']
89
                item['title'] = i.select(INDIAPLAZA_XPATH2)[0].extract()
70
            #print str(item['title'])
90
                item['url'] = i.select(INDIAPLAZA_XPATH3)[0].extract()
71
            print str(item['title']).strip()
91
                items.append(item)
72
            #print str2
92
                
73
            f.write(str(item['title']).strip())
-
 
74
            f.write("\n")
93
            for item in items:
75
            f.write(str2.strip())
94
                str2 = INDIAPLAZA_URL1 + str(item['url'])
76
            f.write("\n\n")
95
                da.add_ipbasic(item['title'],unescape(str2))
77
        f.close()            
96
                     
78
SPIDER = indiaplaza_spider()
97
SPIDER = indiaplaza_spider()
79
98