Rev 3232 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""ScraperAgent: polls a GAE servlet for scraping jobs, runs the matching
scraper, and posts the harvested phone/price data back.

Created on 25-Aug-2011
@author: Varun Gupta
"""
import sched
import time

import ScraperLoader
import Utils
from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob


class ScraperAgent:
    """Self-rescheduling scraping agent.

    Each work() cycle posts the previous results to the servlet, receives
    the next job, and re-enters itself on the scheduler.  The agent stops
    after 5 consecutive invalid ("null") jobs.
    """

    def __init__(self):
        # NOTE: 'schedular' keeps the original (misspelled) attribute name so
        # any external code reading this instance keeps working.
        self.schedular = sched.scheduler(time.time, time.sleep)
        self.time_to_sleep = 2       # delay (seconds) before the first work() run
        self.current_job = None      # job dict from the servlet; None until first fetch
        # Payload posted back to the servlet on every cycle.
        self.data = {'id': None, 'job_id': None, 'source': None,
                     'phone_prices': None, 'next_url': None}
        self.counter_attempts = 0    # consecutive invalid/null jobs seen so far
        print("ScraperAgent initiated at %f" % time.time())

    def work(self):
        """Run one scrape cycle, post the results, and reschedule itself."""
        if Utils.isValidRule(self.current_job):
            print('Working on new job')
            self.counter_attempts = 0  # a valid job resets the give-up counter
            url = self.current_job['url'] if 'url' in self.current_job else None
            print('URL: ', url)
            # Pick the scraper implementation registered for this job's source.
            scraper = ScraperLoader.getScraper(self.current_job['source'])
            scraper.setUrl(url)
            scraper.scrape()
            phone_prices = scraper.getPhones()
            next_url = scraper.getNextUrl()
            self.data['id'] = self.current_job['assigneeId']
            self.data['job_id'] = self.current_job['id']
            self.data['source'] = self.current_job['source']
            self.data['phone_prices'] = phone_prices
            self.data['next_url'] = next_url
        else:
            self.counter_attempts += 1
        if self.counter_attempts < 5:  # Scraper Agent will die after receiving continuous 5 null jobs
            print('Posting data:', self.data)
            self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
            # Clear the per-job fields; 'id' (the assignee) intentionally persists.
            self.data['job_id'] = None
            self.data['source'] = None
            self.data['phone_prices'] = None
            self.data['next_url'] = None
            print('New job: ', self.current_job)
            # NOTE(review): assumes the servlet always returns a job dict
            # containing 'timetowait' -- a None reply would raise TypeError
            # here.  Confirm against GAEServletClient before hardening.
            self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())

    def start(self):
        """Seed the scheduler with the first work() run and block on it."""
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
        self.schedular.run()


if __name__ == '__main__':
    ScraperAgent().start()