| Line 2... |
Line 2... |
| 2 |
Created on 30-Aug-2011
|
2 |
Created on 30-Aug-2011
|
| 3 |
|
3 |
|
| 4 |
@author: Varun Gupta
|
4 |
@author: Varun Gupta
|
| 5 |
'''
|
5 |
'''
|
| 6 |
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
|
6 |
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
|
| 7 |
from Clients.GAEServletClient import getPhonePricesJSON, url
|
- |
|
| 8 |
from PhonePriceDoc import PhonePriceDoc
|
7 |
from PhonePriceDoc import PhonePriceDoc
|
| - |
|
8 |
from ScrapedDataManager import ScrapedDataManager
|
| 9 |
import lucene, Utils
|
9 |
import lucene, Utils, json
|
| 10 |
|
10 |
|
| 11 |
class IndexBuilder:
|
11 |
class IndexBuilder:
|
| 12 |
|
12 |
|
| 13 |
def __init__(self, price_data, new_index = True):
|
13 |
def __init__(self, price_data, new_index = True):
|
| 14 |
self.indexDir = "/tmp/lucene-index-dir"
|
14 |
self.indexDir = "/tmp/price-comp-dashboard/lucene-index-dir"
|
| 15 |
lucene.initVM()
|
15 |
lucene.initVM()
|
| 16 |
dir = SimpleFSDirectory(File(self.indexDir))
|
16 |
dir = SimpleFSDirectory(File(self.indexDir))
|
| 17 |
self.analyzer = StandardAnalyzer(Version.LUCENE_30)
|
17 |
self.analyzer = StandardAnalyzer(Version.LUCENE_30)
|
| 18 |
self.writer = IndexWriter(dir, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
|
18 |
self.writer = IndexWriter(dir, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
|
| 19 |
self.price_data = price_data
|
19 |
self.price_data = price_data
|
| Line 24... |
Line 24... |
| 24 |
count = 0
|
24 |
count = 0
|
| 25 |
|
25 |
|
| 26 |
for phone_price in self.price_data:
|
26 |
for phone_price in self.price_data:
|
| 27 |
print phone_price
|
27 |
print phone_price
|
| 28 |
#doc = PhonePriceDoc(phone_price)
|
28 |
#doc = PhonePriceDoc(phone_price)
|
| 29 |
count += 1
|
- |
|
| 30 |
|
29 |
|
| 31 |
brand, name = self.brand_model_extracter.extract(phone_price['name'])
|
30 |
brand, name = self.brand_model_extracter.extract(phone_price['name'])
|
| 32 |
print "Brand: %s, Model: %s" % (brand, name)
|
31 |
print "Brand: %s, Model: %s" % (brand, name)
|
| - |
|
32 |
try:
|
| 33 |
doc = Document()
|
33 |
doc = Document()
|
| 34 |
doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
|
34 |
doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
|
| 35 |
doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
|
35 |
doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
|
| 36 |
doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
|
36 |
doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
|
| 37 |
doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
|
37 |
doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
|
| 38 |
doc.add(Field("in_stock", str(phone_price['in_stock']), Field.Store.YES, Field.Index.NO))
|
38 |
doc.add(Field("in_stock", str(phone_price['in_stock']), Field.Store.YES, Field.Index.NO))
|
| 39 |
doc.add(Field("url", str(phone_price['url']), Field.Store.YES, Field.Index.NO))
|
39 |
doc.add(Field("url", str(phone_price['product_url']), Field.Store.YES, Field.Index.NO))
|
| - |
|
40 |
self.writer.addDocument(doc)
|
| 40 |
|
41 |
count += 1
|
| - |
|
42 |
except KeyError as e:
|
| - |
|
43 |
print 'KeyError:', e
|
| 41 |
self.writer.addDocument(doc)
|
44 |
print phone_price
|
| 42 |
|
45 |
|
| 43 |
print "Indexed lines from stdin (%d documents in index)" % (self.writer.numDocs())
|
46 |
print "Indexed lines from stdin (%d documents in index)" % (self.writer.numDocs())
|
| 44 |
print "About to optimize index of %d documents..." % self.writer.numDocs()
|
47 |
print "About to optimize index of %d documents..." % self.writer.numDocs()
|
| 45 |
self.writer.optimize()
|
48 |
self.writer.optimize()
|
| 46 |
print "...done optimizing index of %d documents" % self.writer.numDocs()
|
49 |
print "...done optimizing index of %d documents" % self.writer.numDocs()
|
| 47 |
print "Closing index of %d documents..." % self.writer.numDocs()
|
50 |
print "Closing index of %d documents..." % self.writer.numDocs()
|
| 48 |
print "%d docs added" % count
|
51 |
print "%d docs added" % count
|
| 49 |
self.writer.close()
|
52 |
self.writer.close()
|
| 50 |
|
53 |
|
| 51 |
if __name__ == '__main__':
|
54 |
if __name__ == '__main__':
|
| - |
|
55 |
data_manager = ScrapedDataManager()
|
| - |
|
56 |
fp = open(data_manager.price_data_file, 'r')
|
| 52 |
phones = getPhonePricesJSON(url)
|
57 |
phones = json.load(fp)
|
| 53 |
# print phones
|
58 |
print phones.__len__()
|
| 54 |
indexer = IndexBuilder(price_data = phones, new_index = False)
|
59 |
indexer = IndexBuilder(price_data = phones, new_index = True)
|
| 55 |
indexer.build()
|
60 |
indexer.build()
|
| 56 |
# catalog_client = InventoryClient().get_client()
|
61 |
# catalog_client = InventoryClient().get_client()
|
| 57 |
# items = catalog_client.getAllItems(True)
|
62 |
# items = catalog_client.getAllItems(True)
|
| 58 |
# print phones
|
63 |
# print phones
|
| 59 |
# for item in items: print item.id, item.brand, item.modelName, item.modelNumber
|
64 |
# for item in items: print item.id, item.brand, item.modelName, item.modelNumber
|
| 60 |
|
65 |
|