Subversion Repositories SmartDukaan

Rev

Rev 4039 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4039 Rev 4198
Line 2... Line 2...
2
Created on 30-Aug-2011
2
Created on 30-Aug-2011
3
 
3
 
4
@author: Varun Gupta
4
@author: Varun Gupta
5
'''
5
'''
6
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
6
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
7
from Clients.GAEServletClient import getPhonePricesJSON, url
-
 
8
from PhonePriceDoc import PhonePriceDoc
7
from PhonePriceDoc import PhonePriceDoc
-
 
8
from ScrapedDataManager import ScrapedDataManager
9
import lucene, Utils
9
import lucene, Utils, json
10
 
10
 
11
class IndexBuilder:
11
class IndexBuilder:
12
 
12
 
13
    def __init__(self, price_data, new_index = True):
13
    def __init__(self, price_data, new_index = True):
14
        self.indexDir = "/tmp/lucene-index-dir"
14
        self.indexDir = "/tmp/price-comp-dashboard/lucene-index-dir"
15
        lucene.initVM()
15
        lucene.initVM()
16
        dir = SimpleFSDirectory(File(self.indexDir))
16
        dir = SimpleFSDirectory(File(self.indexDir))
17
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
17
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
18
        self.writer = IndexWriter(dir, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
18
        self.writer = IndexWriter(dir, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
19
        self.price_data = price_data
19
        self.price_data = price_data
Line 24... Line 24...
24
        count = 0
24
        count = 0
25
        
25
        
26
        for phone_price in self.price_data:
26
        for phone_price in self.price_data:
27
            print phone_price
27
            print phone_price
28
            #doc = PhonePriceDoc(phone_price)
28
            #doc = PhonePriceDoc(phone_price)
29
            count += 1
-
 
30
            
29
            
31
            brand, name = self.brand_model_extracter.extract(phone_price['name'])
30
            brand, name = self.brand_model_extracter.extract(phone_price['name'])
32
            print "Brand: %s, Model: %s" % (brand, name)
31
            print "Brand: %s, Model: %s" % (brand, name)
-
 
32
            try:
33
            doc = Document()
33
                doc = Document()
34
            doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
34
                doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
35
            doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
35
                doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
36
            doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
36
                doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
37
            doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
37
                doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
38
            doc.add(Field("in_stock", str(phone_price['in_stock']), Field.Store.YES, Field.Index.NO))
38
                doc.add(Field("in_stock", str(phone_price['in_stock']), Field.Store.YES, Field.Index.NO))
39
            doc.add(Field("url", str(phone_price['url']), Field.Store.YES, Field.Index.NO))
39
                doc.add(Field("url", str(phone_price['product_url']), Field.Store.YES, Field.Index.NO))
-
 
40
                self.writer.addDocument(doc)
40
            
41
                count += 1
-
 
42
            except KeyError as e:
-
 
43
                print 'KeyError:', e
41
            self.writer.addDocument(doc)
44
                print phone_price
42
            
45
            
43
        print "Indexed lines from stdin (%d documents in index)" % (self.writer.numDocs())
46
        print "Indexed lines from stdin (%d documents in index)" % (self.writer.numDocs())
44
        print "About to optimize index of %d documents..." % self.writer.numDocs()
47
        print "About to optimize index of %d documents..." % self.writer.numDocs()
45
        self.writer.optimize()
48
        self.writer.optimize()
46
        print "...done optimizing index of %d documents" % self.writer.numDocs()
49
        print "...done optimizing index of %d documents" % self.writer.numDocs()
47
        print "Closing index of %d documents..." % self.writer.numDocs()
50
        print "Closing index of %d documents..." % self.writer.numDocs()
48
        print "%d docs added" % count
51
        print "%d docs added" % count
49
        self.writer.close()
52
        self.writer.close()
50
        
53
        
51
if __name__ == '__main__':
54
if __name__ == '__main__':
-
 
55
    data_manager = ScrapedDataManager()
-
 
56
    fp = open(data_manager.price_data_file, 'r')
52
    phones = getPhonePricesJSON(url)
57
    phones = json.load(fp)
53
#    print phones
58
    print phones.__len__()
54
    indexer = IndexBuilder(price_data = phones, new_index = False)
59
    indexer = IndexBuilder(price_data = phones, new_index = True)
55
    indexer.build()
60
    indexer.build()
56
#    catalog_client = InventoryClient().get_client()
61
#    catalog_client = InventoryClient().get_client()
57
#    items = catalog_client.getAllItems(True)
62
#    items = catalog_client.getAllItems(True)
58
#    print phones
63
#    print phones
59
#    for item in items:  print item.id, item.brand, item.modelName, item.modelNumber
64
#    for item in items:  print item.id, item.brand, item.modelName, item.modelNumber
60
65