| 3232 |
varun.gupt |
1 |
'''
|
|
|
2 |
Created on 25-Aug-2011
|
|
|
3 |
|
|
|
4 |
@author: Varun Gupta
|
|
|
5 |
'''
|
|
|
6 |
import sched, time
|
|
|
7 |
import ScraperLoader, Utils
|
| 4039 |
varun.gupt |
8 |
from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob
|
| 3232 |
varun.gupt |
9 |
|
|
|
10 |
class ScraperAgent:
    '''
    Polling worker that scrapes one job per cycle and reports the result.

    Each call to work() scrapes the current job (when Utils.isValidRule
    accepts it), posts the collected data through postDataAndGetNewJob,
    stores the job that call returns and schedules the next cycle.
    Once MAX_IDLE_ATTEMPTS consecutive cycles have seen no valid job,
    nothing further is scheduled, so the scheduler run by start() drains
    and start() returns.
    '''

    # Consecutive cycles without a valid job after which work() stops
    # rescheduling itself.
    MAX_IDLE_ATTEMPTS = 5

    def __init__(self):
        '''Create the scheduler, the idle counter and the empty result payload.'''
        self.schedular = sched.scheduler(time.time, time.sleep)
        # Delay in seconds before the first cycle; also used as the fallback
        # delay when a job does not carry a usable 'timetowait'.
        self.time_to_sleep = 2
        self.current_job = None
        self.data = {'id': None, 'job_id': None, 'source': None, 'phone_prices': None, 'next_url': None}
        self.counter_attempts = 0
        print("ScraperAgent initiated at %f" % time.time())

    def work(self):
        '''
        Run one cycle: scrape the current job, post the data, fetch the next job.

        A valid job resets the idle counter; an invalid one increments it.
        While the counter is below MAX_IDLE_ATTEMPTS the payload is posted
        (even when it is empty), the returned job becomes the current job
        and the next cycle is scheduled.
        '''
        if Utils.isValidRule(self.current_job):
            print('Working on new job')
            self.counter_attempts = 0
            self._scrape_current_job()
        else:
            self.counter_attempts += 1

        # Scraper Agent will die after receiving continuous 5 null jobs:
        # returning without scheduling leaves the scheduler queue empty.
        if self.counter_attempts >= self.MAX_IDLE_ATTEMPTS:
            return

        print('Posting data: %s' % (self.data,))
        self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
        self._reset_payload()
        print('New job:  %s' % (self.current_job,))

        self.schedular.enter(self._next_delay(), 1, self.work, ())

    def start(self):
        '''Schedule the first cycle and block until the scheduler has no events left.'''
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
        self.schedular.run()

    def _scrape_current_job(self):
        '''Scrape self.current_job and copy the outcome into self.data.'''
        job = self.current_job
        url = job.get('url')  # A job without a 'url' key is scraped with url=None.
        print('URL:  %s' % (url,))

        scraper = ScraperLoader.getScraper(job['source'])
        scraper.setUrl(url)
        scraper.scrape()
        phone_prices = scraper.getPhones()
        next_url = scraper.getNextUrl()

        self.data['id'] = job['assigneeId']
        self.data['job_id'] = job['id']
        self.data['source'] = job['source']
        self.data['phone_prices'] = phone_prices
        self.data['next_url'] = next_url

    def _reset_payload(self):
        '''Clear the per-job fields of self.data; the 'id' field is kept.'''
        for field in ('job_id', 'source', 'phone_prices', 'next_url'):
            self.data[field] = None

    def _next_delay(self):
        '''
        Return the delay before the next cycle.

        Uses the current job's 'timetowait' converted with int(). Falls back
        to self.time_to_sleep when the job is None, lacks the key, or holds
        a value int() rejects, so that one unusable response does not stop
        the agent before the idle counter has a chance to.
        '''
        try:
            return int(self.current_job['timetowait'])
        except (TypeError, KeyError, ValueError):
            return self.time_to_sleep
|
|
|
60 |
|
| 4039 |
varun.gupt |
61 |
if __name__ == '__main__':
    # Script entry point: build one agent and run it until its scheduler drains.
    agent = ScraperAgent()
    agent.start()
|