Subversion Repositories SmartDukaan

Rev

Rev 4198 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 30-Aug-2011

@author: Varun Gupta
'''
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
from PhonePriceDoc import PhonePriceDoc
from ScrapedDataManager import ScrapedDataManager
import lucene, Utils, json

class IndexBuilder:
    '''
    Builds a Lucene index from scraped phone price records.

    Every record in price_data becomes one document with the fields
    "name", "brand" and "source" (stored and analyzed) plus "price",
    "in_stock" and "url" (stored only, not indexed).
    '''

    def __init__(self, price_data, new_index = True):
        '''
        Starts the Lucene VM and opens an IndexWriter on the index directory.

        price_data -- iterable of dict-like records; build() reads the keys
                      'name', 'source', 'price', 'in_stock' and 'product_url'.
        new_index  -- forwarded to IndexWriter as its third ("create")
                      argument; per the Lucene docs, True creates or
                      overwrites the index and False appends to it.
        '''
        self.indexDir = "/usr/price-comp-dashboard/lucene-index-dir"
        lucene.initVM()
        directory = SimpleFSDirectory(File(self.indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.writer = IndexWriter(directory, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
        self.price_data = price_data
        self.brand_model_extracter = Utils.BrandAndModelExtracter()

    def build(self):
        '''
        Adds one document per price record, then optimizes and closes the index.

        A record that lacks any of the required keys is reported on stdout
        and skipped. If indexing fails for any other reason the writer is
        rolled back (closed without committing) and the exception is
        re-raised, so a failed run does not leave the writer open.
        '''
        try:
            print("Currently there are %d documents in the index..." % self.writer.numDocs())
            count = 0

            for phone_price in self.price_data:
                print(phone_price)

                # Read every required key up front so that a missing 'name'
                # is handled exactly like any other missing key.
                # NOTE(review): under Python 2, str() raises
                # UnicodeEncodeError for non-ASCII unicode values; confirm
                # the scraped values are ASCII-only.
                try:
                    raw_name = phone_price['name']
                    source = str(phone_price['source'])
                    price = str(phone_price['price'])
                    in_stock = str(phone_price['in_stock'])
                    url = str(phone_price['product_url'])
                except KeyError as e:
                    print('KeyError: %s' % e)
                    print(phone_price)
                    continue

                brand, name = self.brand_model_extracter.extract(raw_name)
                print("Brand: %s, Model: %s" % (brand, name))

                doc = Document()
                doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("source", source, Field.Store.YES, Field.Index.ANALYZED))
                doc.add(Field("price", price, Field.Store.YES, Field.Index.NO))
                doc.add(Field("in_stock", in_stock, Field.Store.YES, Field.Index.NO))
                doc.add(Field("url", url, Field.Store.YES, Field.Index.NO))
                self.writer.addDocument(doc)
                count += 1

            print("Indexed lines from stdin (%d documents in index)" % (self.writer.numDocs()))
            print("About to optimize index of %d documents..." % self.writer.numDocs())
            self.writer.optimize()
            print("...done optimizing index of %d documents" % self.writer.numDocs())
            print("Closing index of %d documents..." % self.writer.numDocs())
            print("%d docs added" % count)
        except Exception:
            # Release the writer without committing a partially built index,
            # then let the caller see the original failure.
            self.writer.rollback()
            raise

        self.writer.close()
        
if __name__ == '__main__':
    # Script entry point: load the scraped price records from the JSON file
    # named by ScrapedDataManager and rebuild the index from them.
    data_manager = ScrapedDataManager()
    # "with" closes the data file as soon as it has been parsed.
    with open(data_manager.price_data_file, 'r') as fp:
        phones = json.load(fp)
    print(len(phones))
    indexer = IndexBuilder(price_data = phones, new_index = True)
    indexer.build()
#    catalog_client = InventoryClient().get_client()
#    items = catalog_client.getAllItems(True)
#    print phones
#    for item in items:  print item.id, item.brand, item.modelName, item.modelNumber