Subversion Repositories SmartDukaan

Rev

Rev 3313 | Rev 4198 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 3313 Rev 4039
Line 2... Line 2...
2
Created on 30-Aug-2011
2
Created on 30-Aug-2011
3
 
3
 
4
@author: Varun Gupta
4
@author: Varun Gupta
5
'''
5
'''
6
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
6
from lucene import File, StandardAnalyzer, IndexWriter, Version, SimpleFSDirectory, Document, Field
7
from Clients import GAEServletClient
7
from Clients.GAEServletClient import getPhonePricesJSON, url
8
from PhonePriceDoc import PhonePriceDoc
8
from PhonePriceDoc import PhonePriceDoc
9
import lucene, Utils
9
import lucene, Utils
10
 
10
 
11
class IndexBuilder:
11
class IndexBuilder:
12
 
12
 
Line 15... Line 15...
15
        lucene.initVM()
15
        lucene.initVM()
16
        dir = SimpleFSDirectory(File(self.indexDir))
16
        dir = SimpleFSDirectory(File(self.indexDir))
17
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
17
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
18
        self.writer = IndexWriter(dir, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
18
        self.writer = IndexWriter(dir, self.analyzer, new_index, IndexWriter.MaxFieldLength(512))
19
        self.price_data = price_data
19
        self.price_data = price_data
-
 
20
        self.brand_model_extracter = Utils.BrandAndModelExtracter()
20
    
21
    
21
    def build(self):
22
    def build(self):
22
        print "Currently there are %d documents in the index..." % self.writer.numDocs()
23
        print "Currently there are %d documents in the index..." % self.writer.numDocs()
23
        count = 0
24
        count = 0
24
        
25
        
25
        for phone_price in self.price_data:
26
        for phone_price in self.price_data:
26
            print phone_price
27
            print phone_price
27
            #doc = PhonePriceDoc(phone_price)
28
            #doc = PhonePriceDoc(phone_price)
28
            count += 1
29
            count += 1
29
            
30
            
30
            brand, name = Utils.extractBrandAndName(str(phone_price['name']))
31
            brand, name = self.brand_model_extracter.extract(phone_price['name'])
-
 
32
            print "Brand: %s, Model: %s" % (brand, name)
31
            doc = Document()
33
            doc = Document()
32
            doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
34
            doc.add(Field("name", name, Field.Store.YES, Field.Index.ANALYZED))
33
            doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
35
            doc.add(Field("brand", brand, Field.Store.YES, Field.Index.ANALYZED))
34
            doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
36
            doc.add(Field("source", str(phone_price['source']), Field.Store.YES, Field.Index.ANALYZED))
35
            doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
37
            doc.add(Field("price", str(phone_price['price']), Field.Store.YES, Field.Index.NO))
Line 45... Line 47...
45
        print "Closing index of %d documents..." % self.writer.numDocs()
47
        print "Closing index of %d documents..." % self.writer.numDocs()
46
        print "%d docs added" % count
48
        print "%d docs added" % count
47
        self.writer.close()
49
        self.writer.close()
48
        
50
        
49
if __name__ == '__main__':
51
if __name__ == '__main__':
50
    phones = GAEServletClient.getPhonePricesJSON()
52
    phones = getPhonePricesJSON(url)
51
#    print phones
53
#    print phones
52
    indexer = IndexBuilder(price_data = phones, new_index = False)
54
    indexer = IndexBuilder(price_data = phones, new_index = False)
53
    indexer.build()
55
    indexer.build()
54
#    catalog_client = InventoryClient().get_client()
56
#    catalog_client = InventoryClient().get_client()
55
#    items = catalog_client.getAllItems(True)
57
#    items = catalog_client.getAllItems(True)