Subversion Repositories SmartDukaan

Rev

Rev 5291 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4039 varun.gupt 1
'''
2
Created on 16-Sep-2011
3
 
4
@author: Varun Gupta
5
'''
5291 varun.gupt 6
import sys, json
4039 varun.gupt 7
from Clients.GAEServletClient import url, clearPriceData, initJobQueue, getPhonePricesJSON
8
from ScraperAgent import ScraperAgent
5507 varun.gupt 9
from ScraperLoader import getScraper
4039 varun.gupt 10
from PyLucene.IndexBuilder import IndexBuilder
11
 
12
def startScraper():
    """Create a ScraperAgent and kick off the crawl."""
    agent = ScraperAgent()
    agent.start()
14
 
15
def buildIndex():
5291 varun.gupt 16
    #price_data = getPhonePricesJSON(url)
17
    f = open('/tmp/price-comp-dashboard/primary-crawl.json')
18
    price_data = json.load(f)
19
    print len(price_data)
20
    indexer = IndexBuilder(price_data = price_data, new_index = True)
4039 varun.gupt 21
    indexer.build()
22
 
23
if __name__ == '__main__':
24
    try:
25
        cmd = sys.argv[1].strip()
26
        print cmd
27
 
28
        if cmd == 'clean':
29
            clearPriceData(url)
30
 
31
        elif cmd == 'init':
32
            initJobQueue(url)
33
 
34
        elif cmd == 'scrape':
35
            startScraper()
36
 
5507 varun.gupt 37
        elif cmd == 'scrapep':
38
            f = open('/tmp/price-comp-dashboard/urls.json')
39
            data = {}
40
 
41
            for entityId, sourcenurl in json.load(f).iteritems():
42
                for source, url in sourcenurl.iteritems():
43
                    print entityId, source, url
44
                    scraper = getScraper(source)
45
                    productData = scraper.getDataFromProductPage(url)
46
                    print productData
47
 
48
                    if entityId in data:
49
                        data[entityId][source] = productData
50
                    else:
51
                        data[entityId] = {source: productData}
52
 
53
            fw = open('/tmp/price-comp-dashboard/secondary-crawl.json', 'w')
54
            json.dump(data, fw, indent = 4)
55
 
56
        elif cmd == 'index':
4039 varun.gupt 57
            buildIndex()
58
 
59
    except IndexError as e:
60
        print e
61
        print 'ERROR: Command line param must be specified. Options: clean, init, scrape, index'