# Subversion Repositories SmartDukaan
#
# Rev
#
# Rev 5291 | Blame | Compare with Previous | Last modification | View Log | RSS feed

'''
Created on 30-Aug-2011

@author: Varun Gupta
'''
import lucene
from lucene import SimpleFSDirectory, File, StandardAnalyzer, IndexSearcher, Version, MultiFieldQueryParser, QueryParser

class Retriever:
    """Look up phones in a Lucene index by brand, model name and model number.

    Each hit is returned as a plain dict with the keys ``name``, ``price``,
    ``score``, ``in_stock``, ``url`` and ``source``; all values except
    ``score`` are the ``str()`` of the corresponding document field.
    """

    def __init__(self, index_dir="/usr/price-comp-dashboard/lucene-index-dir", max_results=1000):
        """Start the Lucene VM and open a searcher on the index directory.

        index_dir   -- filesystem path of the Lucene index to search.
        max_results -- upper bound on the number of hits fetched per query.
        """
        self.indexDir = index_dir
        lucene.initVM()

        directory = SimpleFSDirectory(File(self.indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(directory)
        self.MAX_RESULTS = max_results

    @staticmethod
    def _build_query(model_number, model_name=None, brand=None, synonyms=None):
        """Return the query string for the given product attributes.

        The result has the form ``brand:<brand> AND (<terms>)`` where
        ``<terms>`` is every word of the model name and model number joined
        with ``OR``, followed by the space-separated synonyms.  For each
        hyphenated model-number word (e.g. ``M-6868``) two extra alternatives
        are added: the word with hyphens removed (``M6868``) and a conjunction
        of its parts (``(M AND 6868)``).

        The brand clause is omitted when no brand is given.  An empty string
        is returned when there are no model terms at all, because an empty
        ``( )`` group is not a parseable query.

        NOTE(review): values are inserted verbatim; query-syntax characters in
        the inputs are not escaped.
        """
        name_terms = model_name.split() if model_name else []
        number_terms = model_number.split() if model_number else []

        # Alternatives for hyphenated words go after all the literal words.
        variants = []
        for term in number_terms:
            if '-' not in term:
                continue
            # Drop empty pieces so "M-" cannot produce the invalid "(M AND )".
            parts = [part for part in term.split('-') if part]
            if parts:
                variants.append(''.join(parts))
            if len(parts) > 1:
                variants.append("(%s)" % ' AND '.join(parts))

        group = ' OR '.join(name_terms + number_terms + variants)

        synonym_terms = ' '.join(synonyms).strip() if synonyms else ''
        if synonym_terms:
            group = ("%s %s" % (group, synonym_terms)).strip()

        if not group:
            return ''

        brand = brand.strip() if brand else ''
        if brand:
            return "brand:%s AND (%s)" % (brand, group)
        return "(%s)" % group

    def _search(self, query_str):
        """Parse query_str against the name and brand fields and run it."""
        qp = MultiFieldQueryParser(Version.LUCENE_30, ["name", "brand"], self.analyzer)
        # The parser instance is passed explicitly, exactly as the original
        # code did.  NOTE(review): presumably this selects the inherited
        # single-string parse overload in PyLucene -- confirm before changing.
        query = qp.parse(qp, query_str)
        return self.searcher.search(query, self.MAX_RESULTS)

    def _to_phones(self, hits):
        """Convert the scored documents in hits into a list of phone dicts."""
        phones = []

        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            phones.append({
                    'name': str(doc.get("name")),
                    'price': str(doc.get("price")),
                    'score': hit.score,
                    'in_stock': str(doc.get("in_stock")),
                    'url': str(doc.get("url")),
                    'source': str(doc.get("source"))
                })
        return phones

    def retrieve(self, model_number, model_name = None, brand = None, synonyms = None):
        """Search for phones matching the given product attributes.

        model_number -- model number text; ``None`` disables the search.
        model_name   -- optional model name; each word is an alternative.
        brand        -- optional brand; restricts hits to that brand.
        synonyms     -- optional iterable of extra alternative terms.

        Returns ``None`` when model_number is ``None``, an empty list when the
        inputs contain no searchable model terms, and otherwise the list of
        matching phone dicts.  The inputs, the hit count and the results are
        printed to standard output.
        """
        if model_number is None:
            return None
        print("%s %s %s" % (model_number, model_name, brand))

        query_str = self._build_query(model_number, model_name, brand, synonyms)
        if not query_str:
            # Nothing to match on; skip the search instead of handing the
            # parser an empty group.
            return []

        hits = self._search(query_str)
        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query_str))

        phones = self._to_phones(hits)
        print(phones)
        return phones

    def retrieveForQuery(self, query_str):
        """Run a ready-made query string and return the matching phone dicts."""
        return self._to_phones(self._search(query_str))

if __name__ == "__main__":
    # Ad-hoc smoke run: send one hand-written query to the index, then print
    # the number of hits followed by the hits themselves.
    # Alternative entry point (currently disabled):
    #     print retriever.retrieve(model_number="E2230", model_name = "Hero", brand="Samsung")
    sample_query = 'brand:Spice AND (M-6868 OR M6868 OR (M AND 6868) )'

    retriever = Retriever()
    matches = retriever.retrieveForQuery(sample_query)
    print(len(matches))
    print(matches)

    # Disabled legacy experiment, kept as an inert string literal.  It refers
    # to names (CatalogClient, getItemsWithTopScore, isPriceSame) that are not
    # defined in this file.
    '''
    catalog_client = CatalogClient().get_client()
    items = catalog_client.getAllItems(True)
    products = {}
    
    for item in items:
        if item.category in (10002, 10003, 10004, 10005):  products[item.catalogItemId] = item

    for key, item in products.iteritems():
        try:
            name = "%s %s" % (item.modelName, item.modelNumber)
            matching_phones_with_top_score = getItemsWithTopScore(retriever.retrieve(name))
            print name, '\n', isPriceSame(matching_phones_with_top_score), matching_phones_with_top_score
        except Exception as e:
            print e
    '''