Subversion Repositories SmartDukaan

Rev

Rev 5291 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 5291 Rev 5507
Line 4... Line 4...
4
@author: Varun Gupta
4
@author: Varun Gupta
5
'''
5
'''
6
import sys, json
6
import sys, json
7
from Clients.GAEServletClient import url, clearPriceData, initJobQueue, getPhonePricesJSON
7
from Clients.GAEServletClient import url, clearPriceData, initJobQueue, getPhonePricesJSON
8
from ScraperAgent import ScraperAgent
8
from ScraperAgent import ScraperAgent
-
 
9
from ScraperLoader import getScraper
9
from PyLucene.IndexBuilder import IndexBuilder
10
from PyLucene.IndexBuilder import IndexBuilder
10
 
11
 
11
def startScraper():
12
def startScraper():
12
    ScraperAgent().start()
13
    ScraperAgent().start()
13
 
14
 
Line 31... Line 32...
31
            initJobQueue(url)
32
            initJobQueue(url)
32
        
33
        
33
        elif cmd == 'scrape':
34
        elif cmd == 'scrape':
34
            startScraper()
35
            startScraper()
35
        
36
        
-
 
37
        elif cmd == 'scrapep':
-
 
38
            f = open('/tmp/price-comp-dashboard/urls.json')
-
 
39
            data = {}
-
 
40
            
-
 
41
            for entityId, sourcenurl in json.load(f).iteritems():
-
 
42
                for source, url in sourcenurl.iteritems():
-
 
43
                    print entityId, source, url
-
 
44
                    scraper = getScraper(source)
-
 
45
                    productData = scraper.getDataFromProductPage(url)
-
 
46
                    print productData
-
 
47
                    
-
 
48
                    if entityId in data:
-
 
49
                        data[entityId][source] = productData
-
 
50
                    else:
-
 
51
                        data[entityId] = {source: productData}
-
 
52
            
-
 
53
            fw = open('/tmp/price-comp-dashboard/secondary-crawl.json', 'w')
-
 
54
            json.dump(data, fw, indent = 4)
-
 
55
        
36
        elif cmd == 'index': 
56
        elif cmd == 'index':
37
            buildIndex()
57
            buildIndex()
38
        
58
        
39
    except IndexError as e:
59
    except IndexError as e:
40
        print e
60
        print e
41
        print 'ERROR: Command line param must be specified. Options: clean, init, scrape, index'
61
        print 'ERROR: Command line param must be specified. Options: clean, init, scrape, index'
42
62