Subversion Repositories SmartDukaan

Rev

Rev 3232 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
3232 varun.gupt 1
'''
2
Created on 25-Aug-2011
3
 
4
@author: Varun Gupta
5
'''
6
import sched, time
7
import ScraperLoader, Utils
4039 varun.gupt 8
from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob
3232 varun.gupt 9
 
10
class ScraperAgent:
    '''
    Polling worker: runs the scraper for the job it currently holds, posts
    the result to the job server and schedules itself again after the delay
    carried by the next job.

    Attributes:
        schedular       sched.scheduler driving the polling loop.
        time_to_sleep   Delay in seconds before the first poll; also used as
                        the fallback delay when a job carries no usable
                        'timetowait' value.
        current_job     Job most recently returned by postDataAndGetNewJob
                        (None until the first poll has completed).
        data            Payload posted on every poll. 'id' is kept between
                        polls; the remaining fields are cleared after each post.
        counter_attempts  Number of consecutive polls for which
                        Utils.isValidRule rejected the current job.
    '''

    # The agent stops rescheduling itself once this many consecutive polls
    # have produced no valid job.
    MAX_IDLE_ATTEMPTS = 5

    def __init__(self):
        '''Set up the scheduler and an empty result payload.'''
        self.schedular = sched.scheduler(time.time, time.sleep)
        self.time_to_sleep = 2
        self.current_job = None
        self.data = {'id': None, 'job_id': None, 'source': None, 'phone_prices': None, 'next_url': None}
        self.counter_attempts = 0
        print("ScraperAgent initiated at %f" % time.time())

    def work(self):
        '''
        Run one polling cycle.

        If the current job passes Utils.isValidRule it is scraped and the
        result stored in self.data; otherwise the idle counter is bumped.
        Unless the idle limit has been reached, self.data is then posted,
        the returned job becomes the current one, and this method is
        scheduled to run again.
        '''
        if Utils.isValidRule(self.current_job):
            print('Working on new job')
            self.counter_attempts = 0
            self._run_current_job()
        else:
            self.counter_attempts += 1

        # Scraper Agent will die after receiving continuous 5 null jobs:
        # nothing is rescheduled, so schedular.run() returns.
        if self.counter_attempts >= self.MAX_IDLE_ATTEMPTS:
            return

        print('Posting data: %s' % (self.data,))
        self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
        self._reset_payload()
        print('New job:  %s' % (self.current_job,))

        self.schedular.enter(self._next_delay(), 1, self.work, ())

    def _run_current_job(self):
        '''Scrape the current job's source/url and store the result in self.data.'''
        job = self.current_job
        url = job['url'] if 'url' in job else None
        print('URL:  %s' % (url,))

        scraper = ScraperLoader.getScraper(job['source'])
        scraper.setUrl(url)
        scraper.scrape()

        self.data['id'] = job['assigneeId']
        self.data['job_id'] = job['id']
        self.data['source'] = job['source']
        self.data['phone_prices'] = scraper.getPhones()
        self.data['next_url'] = scraper.getNextUrl()

    def _reset_payload(self):
        '''Clear the per-job fields of self.data; 'id' is deliberately kept.'''
        for field in ('job_id', 'source', 'phone_prices', 'next_url'):
            self.data[field] = None

    def _next_delay(self):
        '''
        Return the number of seconds to wait before the next poll.

        Uses the current job's 'timetowait' when it is present and
        convertible to int. A null job, a job without that key, or a
        non-numeric value falls back to self.time_to_sleep, so that null
        jobs are counted by work() instead of crashing the scheduler.
        '''
        try:
            return int(self.current_job['timetowait'])
        except (TypeError, KeyError, ValueError):
            return self.time_to_sleep

    def start(self):
        '''Schedule the first poll and block until the agent stops rescheduling itself.'''
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
        self.schedular.run()
60
 
4039 varun.gupt 61
if __name__ == '__main__':
    # Script entry point: build a single agent and hand control to its
    # scheduler loop, which returns once the agent stops rescheduling itself.
    agent = ScraperAgent()
    agent.start()