Subversion Repositories: SmartDukaan


Changes from Rev 3232 to Rev 4039 (unified diff):

--- Rev 3232
+++ Rev 4039
@@ -3,25 +3,27 @@
 
 @author: Varun Gupta
 '''
 import sched, time
 import ScraperLoader, Utils
-from Clients import GAEServletClient
+from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob
 
 class ScraperAgent:
 
     def __init__(self):
         self.schedular = sched.scheduler(time.time, time.sleep)
         self.time_to_sleep = 2
         self.current_job = None
         self.data = {'id': None, 'job_id': None, 'source': None, 'phone_prices': None, 'next_url': None}
+        self.counter_attempts = 0
         print "ScraperAgent initiated at %f" % time.time()
 
     def work(self):
 
         if Utils.isValidRule(self.current_job):
             print 'Working on new job'
+            self.counter_attempts = 0
 
             url = self.current_job['url'] if 'url' in self.current_job else None
             print 'URL: ', url
             scraper = ScraperLoader.getScraper(self.current_job['source'])
             scraper.setUrl(url)
@@ -33,25 +35,30 @@
             self.data['job_id'] = self.current_job['id']
             self.data['source'] = self.current_job['source']
             self.data['phone_prices'] = phone_prices
             self.data['next_url'] = next_url
 
-        print 'Posting data:', self.data
-
-        self.current_job = GAEServletClient.postDataAndGetNewJob(self.data)
-
-        self.data['job_id'] = None
-        self.data['source'] = None
-        self.data['phone_prices'] = None
-        self.data['next_url'] = None
-
-        print 'New job: ', self.current_job
-
-        self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())
+        else:
+            self.counter_attempts += 1
+
+        if self.counter_attempts < 5:       # Scraper Agent will die after receiving 5 consecutive null jobs
+            print 'Posting data:', self.data
+
+            self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
+
+            self.data['job_id'] = None
+            self.data['source'] = None
+            self.data['phone_prices'] = None
+            self.data['next_url'] = None
+
+            print 'New job: ', self.current_job
+
+            self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())
 
     def start(self):
         self.schedular.enter(self.time_to_sleep, 1, self.work, ())
         self.schedular.run()
 
-ScraperAgent().start()
+if __name__ == '__main__':
+    ScraperAgent().start()
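What changed between the two revisions: Rev 4039 imports postDataAndGetNewJob and the servlet url directly from Clients.GAEServletClient (passing the url explicitly) instead of calling through the GAEServletClient module, guards the whole post-and-reschedule step behind a new counter_attempts field so the agent re-arms work() only while it has seen fewer than 5 consecutive null jobs, and moves the startup call under an if __name__ == '__main__' guard. Below is a minimal, self-contained Python 3 sketch of that shutdown pattern in isolation; PollingAgent, fetch_job, null_streak, and MAX_NULL_JOBS are hypothetical names standing in for ScraperAgent, postDataAndGetNewJob, counter_attempts, and the hard-coded 5, and the actual scraping and posting steps are stubbed out.

import sched, time

MAX_NULL_JOBS = 5   # mirrors the hard-coded cap added in Rev 4039

def fetch_job(data):
    # Hypothetical stand-in for postDataAndGetNewJob(data, GAE_URL);
    # always returns a null job here so the exit path is exercised.
    return None

class PollingAgent:
    def __init__(self):
        self.scheduler = sched.scheduler(time.time, time.sleep)
        self.null_streak = 0    # plays the role of counter_attempts

    def work(self):
        job = fetch_job({})
        if job is not None:
            self.null_streak = 0                    # a valid job resets the streak
            delay = int(job.get('timetowait', 2))   # server-driven poll interval
        else:
            self.null_streak += 1
            delay = 1
        # Re-arm only while the streak is under the cap; once the event
        # queue drains, scheduler.run() returns and the process exits.
        if self.null_streak < MAX_NULL_JOBS:
            self.scheduler.enter(delay, 1, self.work)

    def start(self):
        self.scheduler.enter(0, 1, self.work)
        self.scheduler.run()

if __name__ == '__main__':
    PollingAgent().start()
    print('exited after %d consecutive null jobs' % MAX_NULL_JOBS)

The design point the guard encodes: because the counter resets on every valid job, the cap applies only to consecutive null responses, so a busy agent is never killed by occasional idle polls; once the streak reaches the cap, work() simply stops re-arming the scheduler and run() returns on its own.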