Rev 4039 | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""Scraper agent: pulls URLs from a URLQueue and scrapes them one at a time.

Created on 25-Aug-2011

@author: Varun Gupta
"""
import sched
import time

import ScraperLoader
import Utils  # not referenced in this module; import retained as in the original
from URLQueue import URL, URLQueue
from ScrapedDataManager import ScrapedDataManager


class ScraperAgent:
    """Poll the URL queue on a scheduler and scrape each URL it yields.

    Each call to :meth:`work` takes one entry from the queue, runs the scraper
    selected by ``ScraperLoader.getScraper`` for that entry's ``source``, saves
    the result through the data manager and enqueues the follow-up URL if the
    scraper reports one.  After ``MAX_EMPTY_POLLS`` consecutive polls that
    return ``None`` the agent stops rescheduling itself and asks the data
    manager to ``dump()`` and ``persist()``.
    """

    # Consecutive empty polls (queue returned None) after which the agent
    # stops.  The original inline comment claimed 5, but the code has always
    # compared against 2; the value is kept and the comment corrected.
    MAX_EMPTY_POLLS = 2

    def __init__(self):
        """Create the scheduler, the URL queue and the data manager."""
        # NOTE: attribute name keeps the original spelling ("schedular") so
        # any external code reading it continues to work.
        self.schedular = sched.scheduler(time.time, time.sleep)
        # Delay, in seconds, before the first call to work() (see start()).
        self.time_to_sleep = 2
        self.current_url = None
        # Number of consecutive polls that found the queue empty.
        self.counter_attempts = 0
        print("ScraperAgent initiated at %f" % time.time())
        self.url_queue = URLQueue()
        self.data_manager = ScrapedDataManager()

    def work(self):
        """Process one queue entry, then either reschedule or shut down.

        When the queue yields a URL, the empty-poll counter is reset and the
        URL is scraped; otherwise the counter is incremented.  While the
        counter is below ``MAX_EMPTY_POLLS`` the method re-enters itself on
        the scheduler with a one-second delay; once the limit is reached it
        calls ``dump()`` and ``persist()`` on the data manager and schedules
        nothing further, which lets ``scheduler.run()`` return.
        """
        self.current_url = self.url_queue.get()

        if self.current_url is not None:
            # Single-argument print with %-formatting emits the same text as
            # the original two-argument print statement and parses on both
            # Python 2 and Python 3.
            print('Working on new URL, %s' % (self.current_url,))
            self.counter_attempts = 0

            scraper = ScraperLoader.getScraper(self.current_url.source)
            print('Scraper: %s' % (scraper,))
            scraper.setUrl(self.current_url.url)
            scraper.scrape()
            phone_prices = scraper.getPhones()
            next_url = scraper.getNextUrl()

            self.data_manager.save(phone_prices)

            print('Next URL: %s' % (next_url,))
            if next_url is not None:
                self.url_queue.enqueue(next_url)
        else:
            self.counter_attempts += 1

        if self.counter_attempts < self.MAX_EMPTY_POLLS:
            # Keep polling: run work() again in one second.
            self.schedular.enter(1, 1, self.work, ())
        else:
            # Queue stayed empty for MAX_EMPTY_POLLS polls in a row: finish up.
            self.data_manager.dump()
            self.data_manager.persist()

    def start(self):
        """Schedule the first work() call and block until the agent finishes."""
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
        self.schedular.run()


if __name__ == '__main__':
    ScraperAgent().start()