Subversion Repositories: SmartDukaan


Changes from Rev 3232 to Rev 4039 (unified diff):

--- Rev 3232
+++ Rev 4039
@@ -3,25 +3,27 @@
 
 @author: Varun Gupta
 '''
 import sched, time
 import ScraperLoader, Utils
-from Clients import GAEServletClient
+from Clients.GAEServletClient import url as GAE_URL, postDataAndGetNewJob
 
 class ScraperAgent:
 
     def __init__(self):
         self.schedular = sched.scheduler(time.time, time.sleep)
         self.time_to_sleep = 2
         self.current_job = None
         self.data = {'id': None, 'job_id': None, 'source': None, 'phone_prices': None, 'next_url': None}
+        self.counter_attempts = 0
         print "ScraperAgent initiated at %f" % time.time()
 
     def work(self):
 
         if Utils.isValidRule(self.current_job):
             print 'Working on new job'
+            self.counter_attempts = 0
 
             url = self.current_job['url'] if 'url' in self.current_job else None
             print 'URL: ', url
             scraper = ScraperLoader.getScraper(self.current_job['source'])
             scraper.setUrl(url)
@@ -33,25 +35,30 @@
             self.data['job_id'] = self.current_job['id']
             self.data['source'] = self.current_job['source']
             self.data['phone_prices'] = phone_prices
             self.data['next_url'] = next_url
 
-        print 'Posting data:', self.data
-
-        self.current_job = GAEServletClient.postDataAndGetNewJob(self.data)
-
-        self.data['job_id'] = None
-        self.data['source'] = None
-        self.data['phone_prices'] = None
-        self.data['next_url'] = None
-
-        print 'New job: ', self.current_job
-
-        self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())
+        else:
+            self.counter_attempts += 1
+
+        if self.counter_attempts < 5:       # Scraper Agent will die after receiving 5 consecutive null jobs
+            print 'Posting data:', self.data
+
+            self.current_job = postDataAndGetNewJob(self.data, GAE_URL)
+
+            self.data['job_id'] = None
+            self.data['source'] = None
+            self.data['phone_prices'] = None
+            self.data['next_url'] = None
+
+            print 'New job: ', self.current_job
+
+            self.schedular.enter(int(self.current_job['timetowait']), 1, self.work, ())
 
     def start(self):
         self.schedular.enter(self.time_to_sleep, 1, self.work, ())
         self.schedular.run()
 
-ScraperAgent().start()
+if __name__ == '__main__':
+    ScraperAgent().start()
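What changed between the two revisions: Rev 4039 imports postDataAndGetNewJob and the servlet url directly from Clients.GAEServletClient (passing the url explicitly) instead of calling through the GAEServletClient module, guards the whole post-and-reschedule step behind a new counter_attempts field so the agent re-arms work() only while it has seen fewer than 5 consecutive null jobs, and moves the startup call under an if __name__ == '__main__' guard. Below is a minimal, self-contained Python 3 sketch of that shutdown pattern in isolation; PollingAgent, fetch_job, null_streak, and MAX_NULL_JOBS are hypothetical names standing in for ScraperAgent, postDataAndGetNewJob, counter_attempts, and the hard-coded 5, and the actual scraping and posting steps are stubbed out.

import sched, time

MAX_NULL_JOBS = 5   # mirrors the hard-coded cap added in Rev 4039

def fetch_job(data):
    # Hypothetical stand-in for postDataAndGetNewJob(data, GAE_URL);
    # always returns a null job here so the exit path is exercised.
    return None

class PollingAgent:
    def __init__(self):
        self.scheduler = sched.scheduler(time.time, time.sleep)
        self.null_streak = 0    # plays the role of counter_attempts

    def work(self):
        job = fetch_job({})
        if job is not None:
            self.null_streak = 0                    # a valid job resets the streak
            delay = int(job.get('timetowait', 2))   # server-driven poll interval
        else:
            self.null_streak += 1
            delay = 1
        # Re-arm only while the streak is under the cap; once the event
        # queue drains, scheduler.run() returns and the process exits.
        if self.null_streak < MAX_NULL_JOBS:
            self.scheduler.enter(delay, 1, self.work)

    def start(self):
        self.scheduler.enter(0, 1, self.work)
        self.scheduler.run()

if __name__ == '__main__':
    PollingAgent().start()
    print('exited after %d consecutive null jobs' % MAX_NULL_JOBS)

The design point the guard encodes: because the counter resets on every valid job, the cap applies only to consecutive null responses, so a busy agent is never killed by occasional idle polls; once the streak reaches the cap, work() simply stops re-arming the scheduler and run() returns on its own.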