Subversion Repositories SmartDukaan

Rev 4039

'''
Created on 25-Aug-2011

@author: Varun Gupta
'''
import sched, time
import ScraperLoader, Utils
from URLQueue import URL, URLQueue
from ScrapedDataManager import ScrapedDataManager

class ScraperAgent:
    '''Polls the URL queue on a fixed schedule, runs the source-specific
    scraper on each URL, saves the scraped phone prices and enqueues any
    follow-up page the scraper discovers.'''

    def __init__(self):
        self.schedular = sched.scheduler(time.time, time.sleep)
        self.time_to_sleep = 2
        self.current_url = None
        self.counter_attempts = 0
        print "ScraperAgent initiated at %f" % time.time()
        self.url_queue = URLQueue()
        self.data_manager = ScrapedDataManager()

    def work(self):

        self.current_url = self.url_queue.get()

        if self.current_url is not None:
            print 'Working on new URL,', self.current_url
            self.counter_attempts = 0

            # Pick the scraper that matches this URL's source and run it.
            scraper = ScraperLoader.getScraper(self.current_url.source)
            print 'Scraper:', scraper
            scraper.setUrl(self.current_url.url)
            scraper.scrape()
            phone_prices = scraper.getPhones()
            next_url = scraper.getNextUrl()

            self.data_manager.save(phone_prices)
            print 'Next URL:', next_url

            if next_url is not None:
                self.url_queue.enqueue(next_url)
        else:
            self.counter_attempts += 1

        if self.counter_attempts < 2:       # The agent dies after two consecutive empty (None) jobs from the queue
            self.schedular.enter(1, 1, self.work, ())   # poll the queue again in one second
        else:
            self.data_manager.dump()
            self.data_manager.persist()

    def start(self):
        self.schedular.enter(self.time_to_sleep, 1, self.work, ())
        self.schedular.run()

if __name__ == '__main__':
    ScraperAgent().start()
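
The agent depends on three collaborators defined elsewhere in the repository: URLQueue, ScraperLoader and ScrapedDataManager. The sketch below is a minimal, hypothetical stand-in for those modules, inferred only from the calls ScraperAgent makes (get/enqueue on the queue, getScraper plus setUrl/scrape/getPhones/getNextUrl on the scraper, save/dump/persist on the data manager). The real implementations will differ; every body here is an assumption for illustration.

import collections

# Hypothetical stand-ins for URLQueue, ScraperLoader and ScrapedDataManager,
# inferred from how ScraperAgent uses them. Not the repository's real code.

URL = collections.namedtuple('URL', ['url', 'source'])

class URLQueue:
    '''FIFO of URL records; get() returns None when the queue is empty,
    which is the signal ScraperAgent counts towards shutting down.'''

    def __init__(self):
        self._items = collections.deque()

    def enqueue(self, url):
        self._items.append(url)

    def get(self):
        if self._items:
            return self._items.popleft()
        return None

class ScrapedDataManager:
    '''Accumulates scraped phone/price rows until persist() is called.'''

    def __init__(self):
        self._rows = []

    def save(self, rows):
        self._rows.extend(rows)

    def dump(self):
        for row in self._rows:
            print row

    def persist(self):
        pass  # e.g. write self._rows to a database or CSV file

class _DummyScraper:
    '''Illustrative scraper exposing the interface ScraperAgent expects.'''

    def setUrl(self, url):
        self.url = url

    def scrape(self):
        pass  # fetch and parse self.url

    def getPhones(self):
        return []  # list of scraped phone/price records

    def getNextUrl(self):
        return None  # URL of the next page to crawl, or None when done

def getScraper(source):
    '''ScraperLoader.getScraper(source) presumably maps a source name
    (e.g. a shop identifier) to the matching scraper implementation.'''
    return _DummyScraper()

With stubs like these in place, ScraperAgent().start() runs end to end: it polls the queue roughly once a second, and after two consecutive empty polls it dumps and persists whatever data was collected before the scheduler drains and the process exits.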