Rev 3232 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
"""ScraperAgent: polls a GAE servlet for scraping jobs, runs the matching
scraper, and posts the harvested phone/price data back.

Created on 25-Aug-2011
@author: Varun Gupta
"""
import sched
import time

import ScraperLoader
import Utils
from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob


class ScraperAgent:
    """Self-rescheduling scraping agent.

    Each work() cycle posts the previous results to the servlet, receives
    the next job, and re-enters itself on the scheduler.  The agent stops
    after 5 consecutive invalid ("null") jobs.
    """

    def __init__(self):
        # NOTE: 'schedular' keeps the original (misspelled) attribute name so
        # any external code reading this instance keeps working.
        self.schedular = sched.scheduler(time.time, time.sleep)
        self.time_to_sleep = 2       # delay (seconds) before the first work() run
        self.current_job = None      # job dict from the servlet; None until first fetch
        # Payload posted back to the servlet on every cycle.
        self.data = {'id': None, 'job_id': None, 'source': None,
                     'phone_prices': None, 'next_url': None}
        self.counter_attempts = 0    # consecutive invalid/null jobs seen so far
        print("ScraperAgent initiated at %f" % time.time())

    def work(self):
        """Run one scrape cycle, post the results, and reschedule itself."""
        if Utils.isValidRule(self.current_job):
            print('Working on new job')
            self.counter_attempts = 0  # a valid job resets the give-up counter
            url = self.current_job['url'] if 'url' in self.current_job else None
            print('URL: ', url)
            # Pick the scraper implementation registered for this job's source.
            scraper = ScraperLoader.getScraper(self.current_job['source'])
            scraper.setUrl(url)
            scraper.scrape()
            phone_prices = scraper.getPhones()
            next_url = scraper.getNextUrl()
            self.data['id'] = self.current_job['assigneeId']
            self.data['job_id'] = self.current_job['id']
            self.data['source'] = self.current_job['source']
            self.data['phone_prices'] = phone_prices
            self.data['next_url'] = next_url
        else:
            self.counter_attempts += 1
        if self.counter_attempts < 5:  # Scraper Agent will die after receiving continuous 5 null jobs
            print('Posting data:', self.data)
            self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
            # Clear the per-job fields; 'id' (the assignee) intentionally persists.
            self.data['job_id'] = None
            self.data['source'] = None
            self.data['phone_prices'] = None
            self.data['next_url'] = None
            print('New job: ', self.current_job)
            # NOTE(review): assumes the servlet always returns a job dict
            # containing 'timetowait' -- a None reply would raise TypeError
            # here.  Confirm against GAEServletClient before hardening.
            self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())

    def start(self):
        """Seed the scheduler with the first work() run and block on it."""
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
        self.schedular.run()


if __name__ == '__main__':
    ScraperAgent().start()