| 3232 |
varun.gupt |
1 |
'''
|
|
|
2 |
Created on 25-Aug-2011
|
|
|
3 |
|
|
|
4 |
@author: Varun Gupta
|
|
|
5 |
'''
|
|
|
6 |
import sched, time
|
|
|
7 |
import ScraperLoader, Utils
|
|
|
8 |
from Clients import GAEServletClient
|
|
|
9 |
|
|
|
10 |
class ScraperAgent:
|
|
|
11 |
|
|
|
12 |
def __init__(self):
|
|
|
13 |
self.schedular = sched.scheduler(time.time, time.sleep)
|
|
|
14 |
self.time_to_sleep = 2
|
|
|
15 |
self.current_job = None
|
|
|
16 |
self.data = {'id': None, 'job_id': None, 'source': None, 'phone_prices': None, 'next_url': None}
|
|
|
17 |
print "ScraperAgent initiated at %f" % time.time()
|
|
|
18 |
|
|
|
19 |
def work(self):
|
|
|
20 |
|
|
|
21 |
if Utils.isValidRule(self.current_job):
|
|
|
22 |
print 'Working on new job'
|
|
|
23 |
|
|
|
24 |
url = self.current_job['url'] if 'url' in self.current_job else None
|
|
|
25 |
print 'URL: ', url
|
|
|
26 |
scraper = ScraperLoader.getScraper(self.current_job['source'])
|
|
|
27 |
scraper.setUrl(url)
|
|
|
28 |
scraper.scrape()
|
|
|
29 |
phone_prices = scraper.getPhones()
|
|
|
30 |
next_url = scraper.getNextUrl()
|
|
|
31 |
|
|
|
32 |
self.data['id'] = self.current_job['assigneeId']
|
|
|
33 |
self.data['job_id'] = self.current_job['id']
|
|
|
34 |
self.data['source'] = self.current_job['source']
|
|
|
35 |
self.data['phone_prices'] = phone_prices
|
|
|
36 |
self.data['next_url'] = next_url
|
|
|
37 |
|
|
|
38 |
print 'Posting data:', self.data
|
|
|
39 |
|
|
|
40 |
self.current_job = GAEServletClient.postDataAndGetNewJob(self.data)
|
|
|
41 |
|
|
|
42 |
self.data['job_id'] = None
|
|
|
43 |
self.data['source'] = None
|
|
|
44 |
self.data['phone_prices'] = None
|
|
|
45 |
self.data['next_url'] = None
|
|
|
46 |
|
|
|
47 |
print 'New job: ', self.current_job
|
|
|
48 |
|
|
|
49 |
self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())
|
|
|
50 |
|
|
|
51 |
def start(self):
|
|
|
52 |
self.schedular.enter(self.time_to_sleep, 1, self.work, ())
|
|
|
53 |
self.schedular.run()
|
|
|
54 |
|
|
|
55 |
# Entry point: start the agent only when run as a script, not on import,
# so importing this module for reuse/testing has no side effects.
if __name__ == '__main__':
    ScraperAgent().start()
|