Subversion Repositories SmartDukaan

Rev

Rev 5507 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 16-Sep-2011

@author: Varun Gupta
'''
import sys, json
from Clients.GAEServletClient import url, clearPriceData, initJobQueue, getPhonePricesJSON
from ScraperAgent import ScraperAgent
from ScraperLoader import getScraper
from PyLucene.IndexBuilder import IndexBuilder

def startScraper():
    '''
    Create a ScraperAgent and start it.
    '''
    agent = ScraperAgent()
    agent.start()

def buildIndex():
    '''
    Build a new index from the primary crawl dump.

    Loads the JSON stored at /usr/price-comp-dashboard/primary-crawl.json,
    prints the number of top-level entries it contains, and hands the data
    to IndexBuilder (with new_index = True) before calling build().

    Raises whatever open() / json.load() raise when the file is missing or
    is not valid JSON.
    '''
    # Disabled alternative data source:
    #price_data = getPhonePricesJSON(url)

    # 'with' guarantees the file handle is closed even if parsing fails.
    with open('/usr/price-comp-dashboard/primary-crawl.json') as f:
        price_data = json.load(f)

    print(len(price_data))
    indexer = IndexBuilder(price_data = price_data, new_index = True)
    indexer.build()

if __name__ == '__main__':
    # Command-line dispatcher. The first argument selects the action:
    #   clean   -> clearPriceData(url)
    #   init    -> initJobQueue(url)
    #   scrape  -> startScraper()
    #   scrapep -> scrape every product page listed in urls.json and dump
    #              the results to secondary-crawl.json
    #   index   -> buildIndex()
    try:
        cmd = sys.argv[1].strip()
    except IndexError as e:
        # Only the argv lookup is guarded: an IndexError raised deeper in a
        # scraper or the indexer must not be reported as a missing argument.
        print(e)
        print('ERROR: Command line param must be specified. Options: clean, init, scrape, scrapep, index')
    else:
        print(cmd)

        if cmd == 'clean':
            clearPriceData(url)

        elif cmd == 'init':
            initJobQueue(url)

        elif cmd == 'scrape':
            startScraper()

        elif cmd == 'scrapep':
            # urls.json is iterated as {entityId: {source: product page URL}}.
            with open('/usr/price-comp-dashboard/urls.json') as f:
                urlsByEntity = json.load(f)

            data = {}

            for entityId, sourcenurl in urlsByEntity.iteritems():
                # Named productUrl (not url) so the module-level `url`
                # imported from GAEServletClient is not rebound.
                for source, productUrl in sourcenurl.iteritems():
                    print('%s %s %s' % (entityId, source, productUrl))
                    scraper = getScraper(source)
                    productData = scraper.getDataFromProductPage(productUrl)
                    print(productData)

                    # Result layout: {entityId: {source: productData}}.
                    data.setdefault(entityId, {})[source] = productData

            # 'with' flushes and closes the output file deterministically.
            with open('/usr/price-comp-dashboard/secondary-crawl.json', 'w') as fw:
                json.dump(data, fw, indent = 4)

        elif cmd == 'index':
            buildIndex()

        else:
            # Previously an unrecognised command was silently ignored.
            print('ERROR: Unknown command. Options: clean, init, scrape, scrapep, index')