Rev 4198 | Blame | Compare with Previous | Last modification | View Log | RSS feed
'''
Build a Lucene index from scraped phone-price records.

Created on 30-Aug-2011

@author: Varun Gupta
'''
import json

import lucene
from lucene import (Document, Field, File, IndexWriter, SimpleFSDirectory,
                    StandardAnalyzer, Version)

import Utils
from PhonePriceDoc import PhonePriceDoc  # only referenced by the commented-out line in _index_records()
from ScrapedDataManager import ScrapedDataManager


class IndexBuilder:
    '''
    Index phone-price records with Lucene.

    Each record is read by key ('name', 'source', 'price', 'in_stock',
    'product_url'); the brand and model are derived from 'name' through
    Utils.BrandAndModelExtracter.

    print calls use the single-argument call form, so their output is
    unchanged under the Python 2 print statement.
    '''

    # Directory used when the caller does not supply index_dir.
    DEFAULT_INDEX_DIR = "/usr/price-comp-dashboard/lucene-index-dir"

    def __init__(self, price_data, new_index=True, index_dir=None):
        '''
        Start the JVM and open an IndexWriter.

        price_data -- iterable of records to index.
        new_index  -- passed to IndexWriter as its "create" flag.
        index_dir  -- filesystem path of the index; defaults to
                      DEFAULT_INDEX_DIR when None.
        '''
        if index_dir is None:
            index_dir = self.DEFAULT_INDEX_DIR
        self.indexDir = index_dir
        lucene.initVM()
        # Named "directory" rather than "dir" to avoid shadowing the builtin.
        directory = SimpleFSDirectory(File(self.indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.writer = IndexWriter(directory, self.analyzer, new_index,
                                  IndexWriter.MaxFieldLength(512))
        self.price_data = price_data
        self.brand_model_extracter = Utils.BrandAndModelExtracter()

    def build(self):
        '''
        Index every record in price_data, optimize the index and close the writer.

        Records missing a required key are reported and skipped. If anything
        else fails, the writer is rolled back (closed without committing this
        run's changes) so the index lock is not left held, and the exception
        is re-raised.
        '''
        try:
            print("Currently there are %d documents in the index..." % self.writer.numDocs())
            count = self._index_records()
            print("Indexed lines from stdin (%d documents in index)" % self.writer.numDocs())
            print("About to optimize index of %d documents..." % self.writer.numDocs())
            self.writer.optimize()
            print("...done optimizing index of %d documents" % self.writer.numDocs())
            print("Closing index of %d documents..." % self.writer.numDocs())
            print("%d docs added" % count)
        except Exception:
            # rollback() closes the writer and discards uncommitted changes.
            self.writer.rollback()
            raise
        self.writer.close()

    def _index_records(self):
        '''Add one document per usable record and return the number added.'''
        count = 0
        for phone_price in self.price_data:
            print(phone_price)
            #doc = PhonePriceDoc(phone_price)
            try:
                raw_name = phone_price['name']
            except KeyError as e:
                # A missing 'name' is treated like any other missing key:
                # report the record and move on instead of aborting the run.
                self._report_skipped(e, phone_price)
                continue
            brand, name = self.brand_model_extracter.extract(raw_name)
            print("Brand: %s, Model: %s" % (brand, name))
            try:
                doc = self._make_document(brand, name, phone_price)
            except KeyError as e:
                self._report_skipped(e, phone_price)
                continue
            self.writer.addDocument(doc)
            count += 1
        return count

    def _make_document(self, brand, name, phone_price):
        '''
        Build the Lucene Document for one record.

        Raises KeyError when the record lacks 'source', 'price', 'in_stock'
        or 'product_url'.
        '''
        # NOTE(review): under Python 2, str() raises UnicodeEncodeError for
        # non-ASCII unicode values -- confirm these fields are always ASCII.
        doc = Document()
        # name, brand and source are searchable; the rest are stored only.
        doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
        doc.add(Field("in_stock", str(phone_price['in_stock']), Field.Store.YES, Field.Index.NO))
        doc.add(Field("url", str(phone_price['product_url']), Field.Store.YES, Field.Index.NO))
        return doc

    def _report_skipped(self, error, phone_price):
        '''Print the KeyError and the record that caused it.'''
        print("KeyError: %s" % (error,))
        print(phone_price)


def main():
    '''Load the scraped price data from its JSON file and rebuild the index.'''
    data_manager = ScrapedDataManager()
    # "with" guarantees the data file is closed even if parsing fails.
    with open(data_manager.price_data_file, 'r') as fp:
        phones = json.load(fp)
    print(len(phones))
    indexer = IndexBuilder(price_data=phones, new_index=True)
    indexer.build()
    # catalog_client = InventoryClient().get_client()
    # items = catalog_client.getAllItems(True)
    # print phones
    # for item in items: print item.id, item.brand, item.modelName, item.modelNumber


if __name__ == '__main__':
    main()