# Rev 5507 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Created on 16-Sep-2011

@author: Varun Gupta

Command-line driver for the price-comparison crawl. The first command-line
argument selects one action:

    clean    call clearPriceData(url)
    init     call initJobQueue(url)
    scrape   start a ScraperAgent
    scrapep  scrape every product page listed in urls.json and write the
             results to secondary-crawl.json
    index    build the index from primary-crawl.json
'''
import json
import sys

from Clients.GAEServletClient import url, clearPriceData, initJobQueue, getPhonePricesJSON
from ScraperAgent import ScraperAgent
from ScraperLoader import getScraper
from PyLucene.IndexBuilder import IndexBuilder

# Fixed locations of the crawl input/output files used by this script.
PRIMARY_CRAWL_PATH = '/usr/price-comp-dashboard/primary-crawl.json'
PRODUCT_URLS_PATH = '/usr/price-comp-dashboard/urls.json'
SECONDARY_CRAWL_PATH = '/usr/price-comp-dashboard/secondary-crawl.json'

COMMAND_OPTIONS = 'clean, init, scrape, scrapep, index'


def startScraper():
    '''Create a ScraperAgent and call its start() method.'''
    ScraperAgent().start()


def buildIndex():
    '''Load the primary crawl JSON file and build a new index from it.

    Prints the number of top-level entries in the loaded data, then hands the
    data to IndexBuilder with new_index = True and calls build().
    '''
    # NOTE: the original kept an alternative data source commented out:
    #   price_data = getPhonePricesJSON(url)
    # 'with' closes the file even if json.load raises.
    with open(PRIMARY_CRAWL_PATH) as f:
        price_data = json.load(f)

    print(len(price_data))
    indexer = IndexBuilder(price_data = price_data, new_index = True)
    indexer.build()


def _scrapeProductPages():
    '''Scrape every product page listed in urls.json into secondary-crawl.json.

    urls.json is read as a two-level mapping: entity id -> {source -> page URL}.
    The output keeps the same two-level shape, with each URL replaced by
    whatever scraper.getDataFromProductPage(pageUrl) returned.
    '''
    with open(PRODUCT_URLS_PATH) as f:
        urlsByEntity = json.load(f)

    data = {}
    for entityId, sourceToUrl in urlsByEntity.items():
        # The loop variable is named pageUrl so it does not rebind the
        # module-level 'url' imported from Clients.GAEServletClient.
        for source, pageUrl in sourceToUrl.items():
            print('%s %s %s' % (entityId, source, pageUrl))
            scraper = getScraper(source)
            productData = scraper.getDataFromProductPage(pageUrl)
            print(productData)
            data.setdefault(entityId, {})[source] = productData

    # The output file is opened only after all scraping has finished, and is
    # closed (and therefore flushed) by the 'with' block.
    with open(SECONDARY_CRAWL_PATH, 'w') as fw:
        json.dump(data, fw, indent = 4)


def _main(argv):
    '''Dispatch on the command named in argv[1]; print usage help if missing.'''
    # Only the argv lookup is guarded, so an IndexError raised while scraping
    # or indexing is no longer misreported as a missing command-line param.
    try:
        cmd = argv[1].strip()
    except IndexError as e:
        print(e)
        print('ERROR: Command line param must be specified. Options: ' + COMMAND_OPTIONS)
        return

    print(cmd)

    if cmd == 'clean':
        clearPriceData(url)
    elif cmd == 'init':
        initJobQueue(url)
    elif cmd == 'scrape':
        startScraper()
    elif cmd == 'scrapep':
        _scrapeProductPages()
    elif cmd == 'index':
        buildIndex()
    else:
        # Previously an unrecognised command was silently ignored.
        print('ERROR: Unknown command. Options: ' + COMMAND_OPTIONS)


if __name__ == '__main__':
    _main(sys.argv)