| 3232 |
varun.gupt |
1 |
'''
|
|
|
2 |
Created on 26-Aug-2011
|
|
|
3 |
|
|
|
4 |
@author: Varun Gupta
|
|
|
5 |
'''
|
| 3350 |
varun.gupt |
6 |
import json, sys, os
|
| 3232 |
varun.gupt |
7 |
|
| 3350 |
varun.gupt |
8 |
# Put the parent directory of $HOME/code/trunk/PyProj/src/shop2020/ at the
# front of sys.path (once) so the `shop2020` imports below can be resolved.
cmd_folder = os.path.dirname(
    os.path.abspath(os.environ["HOME"] + "/code/trunk/PyProj/src/shop2020/")
)
if cmd_folder not in sys.path:
    sys.path.insert(0, cmd_folder)
|
|
|
11 |
|
|
|
12 |
from shop2020.clients.CatalogClient import CatalogClient
|
| 5401 |
varun.gupt |
13 |
from shop2020.thriftpy.model.v1.catalog.ttypes import status
|
| 3350 |
varun.gupt |
14 |
|
| 3453 |
varun.gupt |
15 |
CHARACTER_ENCODING = 'ISO-8859-1'
|
|
|
16 |
|
|
|
17 |
class BrandAndModelExtracter:
    '''
    Splits a product's full name into a (brand, model) pair.

    The known brands are loaded from the catalog service for categories
    10001 and 11001. If that lookup raises, a built-in list of brands is
    used instead.
    '''

    def __init__(self):
        try:
            client = CatalogClient().get_client()
            self.brands = client.getAllBrandsByCategory(10001)
            self.brands.extend(client.getAllBrandsByCategory(11001))
        except Exception:
            # Deliberate best-effort: any failure while talking to the
            # catalog falls back to this hard-coded list of brands.
            self.brands = ['Micromax', 'BlackBerry', 'Blackberry', 'Motorola', 'Alcatel', 'Sony Ericsson', 'Apple',
                           'Spice', 'Nokia', 'HTC', 'Samsung', 'LG', 'Dell', 'Karbonn', 'Lava', 'Canon', 'Nikon', 'Sony']

        self.brands.append('Blackberry')    # To resolve issue of 'BlackBerry' and 'Blackberry'

    def extract(self, full_name):
        '''
        Return (brand, remainder) for the given product name.

        The name is stripped of surrounding whitespace first. When several
        known brands are a prefix of the name (e.g. 'Sony' and
        'Sony Ericsson'), the longest one is chosen. Only the leading
        occurrence of the brand is removed, so a brand repeated later in
        the name is left intact. When no brand matches, the brand is the
        empty string and the remainder is the stripped name.
        '''
        full_name = full_name.strip()

        matching_brands = [brand for brand in self.brands if full_name.startswith(brand)]
        if not matching_brands:
            return ("", full_name)

        brand = max(matching_brands, key=len)
        # Slice off the prefix instead of str.replace(), which would also
        # delete later occurrences of the brand inside the model text.
        return (brand, full_name[len(brand):].strip())
|
|
|
38 |
|
| 5291 |
varun.gupt |
39 |
class DuplicateMappingTracker:
    '''
    Records which entity ids have been mapped to each URL and reports the
    URLs that were mapped more than once.
    '''

    def __init__(self):
        # url -> list of entity ids tracked for that url, in call order
        self.mapping = {}

    def track(self, url, entity_id):
        '''Remember that entity_id was mapped to url.'''
        self.mapping.setdefault(url, []).append(entity_id)

    def getDuplicateMappings(self):
        '''
        Return a dict of url -> list of entity ids, containing only the
        URLs that have been tracked with more than one entity id.
        '''
        duplicate_mappings = {}

        # Iterate the tracked mapping (not the empty result dict), otherwise
        # no duplicate can ever be reported.
        for url, entities in self.mapping.items():
            if len(entities) > 1:
                duplicate_mappings[url] = entities

        return duplicate_mappings
|
|
|
59 |
|
| 4198 |
varun.gupt |
60 |
def getURLSource(url):
    '''
    Return the second dot-separated piece of the URL, stripped and converted
    with str() (e.g. 'flipkart' for 'http://www.flipkart.com/...').

    Returns None when that piece cannot be produced for any reason, for
    instance when the URL contains no dot or is not a string.
    '''
    try:
        pieces = url.split('.')
        second_piece = pieces[1]
        return str(second_piece.strip())
    except Exception:
        return None
|
|
|
65 |
|
| 3232 |
varun.gupt |
66 |
def isValidRule(rule):
    '''
    Tell whether a rule is usable: it must not be None and must carry
    non-None values under both the 'url' and the 'source' keys.

    A missing key counts as invalid. 'source' is only looked up when 'url'
    is present and not None.
    '''
    if rule is None:
        return False

    try:
        return rule['url'] is not None and rule['source'] is not None
    except KeyError:
        return False
|
|
|
82 |
|
|
|
83 |
def getItemsWithTopScore(items):
    '''
    Return the leading run of items whose 'score' never drops below the
    score of the item before it (the first item is compared with -1.0).

    Scanning stops at the first item that scores lower than its
    predecessor; that item and everything after it are left out.
    NOTE(review): presumably the caller passes items ordered by descending
    score, so the result is the group tied for the top score - confirm.
    '''
    leaders = []
    previous_score = -1.0

    for candidate in items:
        score = candidate['score']
        if not (score >= previous_score):
            break
        leaders.append(candidate)
        previous_score = score

    return leaders
|
|
|
95 |
|
|
|
96 |
def isPriceSame(items):
    '''
    Return True when every item has the same 'price' as the item that
    follows it; an empty or single-item list is therefore True.
    '''
    for current, following in zip(items, items[1:]):
        if current['price'] != following['price']:
            return False

    return True
|
|
|
101 |
|
|
|
102 |
def getProductClusters(products):
    '''
    Group a list of products (as returned from search results) by their
    'source'.

    The result always has one list per known source, empty when no product
    came from it. A product whose source is not one of the known sources
    raises KeyError.
    '''
    known_sources = ('adexmart', 'flipkart', 'homeshop18', 'infibeam', 'snapdeal', 'tradus', 'sulekha')
    clusters = dict((source_name, []) for source_name in known_sources)

    for entry in products:
        bucket = clusters[entry['source']]
        bucket.append(entry)

    return clusters
|
|
|
114 |
|
|
|
115 |
def getFilteredClustersWithTopScores(product_clusters):
    '''
    Apply getItemsWithTopScore to the product list of every source and
    return the results as a new dict keyed by the same sources.
    '''
    return dict(
        (source, getItemsWithTopScore(product_clusters[source]))
        for source in product_clusters
    )
|
|
|
122 |
|
|
|
123 |
def removePriceFormatting(price_string):
    '''
    Reduce a displayed price to its bare whole-number text: surrounding
    whitespace, 'Rs.' / 'Rs' markers, commas and spaces are removed, and
    anything from the first remaining '.' onwards is dropped.
    '''
    cleaned = price_string.strip()

    # Order matters: 'Rs.' must go before 'Rs' so its dot is not left behind.
    # NOTE(review): the last two tokens look identical here; one of them may
    # originally have been a different whitespace character - confirm.
    for token in ('Rs.', 'Rs', ',', ' ', ' '):
        cleaned = cleaned.replace(token, '')

    return cleaned.split('.')[0]
|
| 3313 |
varun.gupt |
125 |
|
| 4198 |
varun.gupt |
126 |
def getSearchURL(source, name):
    '''
    Return the search-page URL of the given source with every '$$'
    placeholder in its template replaced by name.

    The name is inserted as given. An unknown source raises KeyError.
    '''
    url_templates = {
        'adexmart': 'http://adexmart.com/search.php?orderby=position&orderway=desc&search_query=$$',
        'flipkart': 'http://www.flipkart.com/search/a/all?query=$$',
        'homeshop18': 'http://www.homeshop18.com/search:$$',
        'infibeam': 'http://www.infibeam.com/search?q=$$',
        'snapdeal': 'http://www.snapdeal.com/search?catId=&categoryId=12&locUsed=false&vertical=p&keyword=$$',
        'sulekha': 'http://mobiles.sulekha.com/search.htm?cx=partner-pub-3470583419345383%3A8ggsimfcaaa&cof=FORID%3A10&ie=ISO-8859-1&q=$$&sa=Go',
        'tradus': 'http://www.tradus.com/search/tradus_search/?query=$$',
    }

    template = url_templates[source]
    return template.replace('$$', name)
|
|
|
139 |
|
|
|
140 |
def getDisplayInfo(filtered_cluster, product_name):
    '''
    Build, for every source in filtered_cluster, the record to display for
    one product. Three outcomes are possible per source:

    * no products: 'text' is 'Not Found' and 'url' is the source's search
      URL for product_name;
    * products that all share one price: 'price', 'url' and 'title' come
      from the first product and 'text' is its price without formatting;
    * products with differing prices: 'text' is 'Conflict' and 'data'
      holds the products serialised as JSON.

    Sources that do not appear in filtered_cluster keep an empty record.
    '''
    display_info = {'adexmart': {}, 'flipkart': {}, 'homeshop18': {}, 'infibeam': {}, 'snapdeal': {}, 'tradus': {}, 'sulekha': {}}

    for source in filtered_cluster:
        products = filtered_cluster[source]

        if len(products) == 0:
            record = display_info[source]
            record['price'] = None
            record['data'] = None
            record['url'] = getSearchURL(source, product_name)
            record['text'] = 'Not Found'
        elif isPriceSame(products):
            first = products[0]
            record = display_info[source]
            record['price'] = first['price']
            record['data'] = None
            record['url'] = first['url']
            record['text'] = removePriceFormatting(first['price'])
            record['title'] = first['name']
        else:
            record = display_info[source]
            record['price'] = None
            record['data'] = json.dumps(products)
            record['url'] = None
            record['text'] = 'Conflict'

    return display_info
|
|
|
164 |
|
|
|
165 |
def getSynonyms(file_path='/usr/price-comp-dashboard/synonyms.json'):
    '''
    Load the synonyms JSON file and return a dict that maps each top-level
    key, converted to int, to a single list of synonyms.

    For every entry the values under 'MODEL_NAME' (if present) come first,
    followed by the values under 'MODEL_NUMBER' (if present). An entry
    with neither key maps to an empty list.

    file_path defaults to the dashboard's synonyms file; pass another path
    to read a different file. Errors from opening or parsing the file, and
    ValueError for a key that is not an integer, propagate to the caller.
    '''
    # The context manager closes the file even when parsing fails.
    with open(file_path, 'r') as synonyms_file:
        raw_synonyms = json.load(synonyms_file)

    synonyms = {}
    for key, value in raw_synonyms.items():
        list_synonyms = []

        for field in ('MODEL_NAME', 'MODEL_NUMBER'):
            if field in value:
                list_synonyms.extend(value[field])

        synonyms[int(key)] = list_synonyms

    return synonyms
|
|
|
183 |
|
| 5401 |
varun.gupt |
184 |
def getValidItems():
    '''
    Fetch the catalog items whose status is ACTIVE, PAUSED or
    PAUSED_BY_RISK, keep those in the accepted categories, and return them
    sorted by brand.

    Items are keyed by catalogItemId while filtering, so when several items
    share a catalogItemId only the last one fetched is kept.
    '''
    client = CatalogClient().get_client()

    fetched = client.getAllItemsByStatus(status.ACTIVE)
    fetched += client.getAllItemsByStatus(status.PAUSED)
    fetched += client.getAllItemsByStatus(status.PAUSED_BY_RISK)

    accepted_categories = (10002, 10003, 10004, 10005, 10010, 11002, 11003)
    by_catalog_id = {}

    for item in fetched:
        if item.category in accepted_categories:
            by_catalog_id[item.catalogItemId] = item

    return sorted(by_catalog_id.values(), key=lambda item: item.brand)
|
|
|
195 |
|
|
|
196 |
def getProductName(item):
    '''
    Compose the display name of an item as "<brand> <model name> <model
    number>".

    Model name and model number are stripped, and a part that is empty
    after stripping is left out. The brand and the model name are each
    followed by a single space, so the result ends with a space when the
    model number is left out.
    '''
    model_name = item.modelName.strip()
    model_number = item.modelNumber.strip()

    parts = ["%s " % item.brand]
    if model_name:
        parts.append("%s " % model_name)
    if model_number:
        parts.append(model_number)

    return ''.join(parts)
|
|
|
205 |
|
| 3453 |
varun.gupt |
206 |
if __name__ == '__main__':
    # Manual smoke run: build the extracter (its result is not used below)
    # and print the parsed synonyms.
    extracter = BrandAndModelExtracter()
    # Example: extracter.extract('Nokia X5-01 (Pink)')
    print(getSynonyms())
|