Subversion Repositories SmartDukaan

Rev

Rev 4039 | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4039 Rev 4198
Line 3... Line 3...
3
 
3
 
4
@author: Varun Gupta
4
@author: Varun Gupta
5
'''
5
'''
6
import sched, time
6
import sched, time
7
import ScraperLoader, Utils
7
import ScraperLoader, Utils
-
 
8
from URLQueue import URL, URLQueue
8
from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob
9
from ScrapedDataManager import ScrapedDataManager
9
 
10
 
10
class ScraperAgent:
11
class ScraperAgent:
11
    
12
    
12
    def __init__(self):
13
    def __init__(self):
13
        self.schedular = sched.scheduler(time.time, time.sleep)
14
        self.schedular = sched.scheduler(time.time, time.sleep)
14
        self.time_to_sleep = 2
15
        self.time_to_sleep = 2
15
        self.current_job = None
16
        self.current_url = None
16
        self.data = {'id': None, 'job_id': None, 'source': None, 'phone_prices': None, 'next_url': None}
-
 
17
        self.counter_attempts = 0
17
        self.counter_attempts = 0
18
        print "ScraperAgent initiated at %f" % time.time()
18
        print "ScraperAgent initiated at %f" % time.time()
-
 
19
        self.url_queue = URLQueue()
-
 
20
        self.data_manager = ScrapedDataManager()
19
        
21
        
20
    def work(self):
22
    def work(self):
21
        
23
        
-
 
24
        self.current_url = self.url_queue.get()
-
 
25
        
22
        if Utils.isValidRule(self.current_job):
26
        if self.current_url is not None:
23
            print 'Working on new job'
27
            print 'Working on new URL,', self.current_url
24
            self.counter_attempts = 0
28
            self.counter_attempts = 0
25
            
29
            
26
            url = self.current_job['url'] if 'url' in self.current_job else None
30
            scraper = ScraperLoader.getScraper(self.current_url.source)
27
            print 'URL: ', url
31
            print 'Scraper:', scraper
28
            scraper = ScraperLoader.getScraper(self.current_job['source'])
-
 
29
            scraper.setUrl(url)
32
            scraper.setUrl(self.current_url.url)
30
            scraper.scrape()
33
            scraper.scrape()
31
            phone_prices = scraper.getPhones()
34
            phone_prices = scraper.getPhones()
32
            next_url = scraper.getNextUrl()
35
            next_url = scraper.getNextUrl()
33
            
36
            
34
            self.data['id'] = self.current_job['assigneeId']
-
 
35
            self.data['job_id'] = self.current_job['id']
-
 
36
            self.data['source'] = self.current_job['source']
-
 
37
            self.data['phone_prices'] = phone_prices
37
            self.data_manager.save(phone_prices)
38
            self.data['next_url'] = next_url
38
            print 'Next URL:', next_url
39
            
39
            
-
 
40
            if next_url is not None:
-
 
41
                self.url_queue.enqueue(next_url)
40
        else:
42
        else:
41
            self.counter_attempts += 1
43
            self.counter_attempts += 1
42
        
44
        
43
        if self.counter_attempts < 5:       #Scraper Agent will die after receiving 5 consecutive null jobs
45
        if self.counter_attempts < 2:       #Scraper Agent will die after receiving 2 consecutive null URLs from the queue
44
            print 'Posting data:', self.data
46
            self.schedular.enter(1, 1, self.work, ())
45
        
-
 
46
            self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
-
 
47
        
47
        else:
48
            self.data['job_id'] = None
-
 
49
            self.data['source'] = None
48
            self.data_manager.dump()
50
            self.data['phone_prices'] = None
-
 
51
            self.data['next_url'] = None
49
            self.data_manager.persist()
52
        
-
 
53
            print 'New job: ', self.current_job
-
 
54
        
-
 
55
            self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())
-
 
56
    
50
    
57
    def start(self):
51
    def start(self):
58
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
52
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
59
        self.schedular.run()
53
        self.schedular.run()
60
 
54