Subversion Repositories SmartDukaan

Rev

Rev 4198 | Rev 5377 | Go to most recent revision | Show entire file | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 4198 Rev 5291
Line 8... Line 8...
8
from PyLucene.Retriever import Retriever
8
from PyLucene.Retriever import Retriever
9
from Utils import getItemsWithTopScore, isPriceSame, getProductClusters, getFilteredClustersWithTopScores, \
9
from Utils import getItemsWithTopScore, isPriceSame, getProductClusters, getFilteredClustersWithTopScores, \
10
    getDisplayInfo, getSynonyms
10
    getDisplayInfo, getSynonyms
11
from ScraperLoader import getScraper
11
from ScraperLoader import getScraper
12
from PyLucene.IndexBuilder import IndexBuilder
12
from PyLucene.IndexBuilder import IndexBuilder
-
 
13
from DataStore.WatchListManager import WatchListManager
13
 
14
 
14
cmd_folder = os.path.dirname(os.path.abspath(os.environ["HOME"] + "/code/trunk/PyProj/src/shop2020/"))
15
cmd_folder = os.path.dirname(os.path.abspath(os.environ["HOME"] + "/code/trunk/PyProj/src/shop2020/"))
15
if cmd_folder not in sys.path:
16
if cmd_folder not in sys.path:
16
    sys.path.insert(0, cmd_folder)
17
    sys.path.insert(0, cmd_folder)
17
 
18
 
Line 40... Line 41...
40
            self.redirect('/')
41
            self.redirect('/')
41
        else:
42
        else:
42
            self.redirect('/login')
43
            self.redirect('/login')
43
 
44
 
44
class URLFeedbackHandler(BaseHandler):
45
class URLFeedbackHandler(BaseHandler):
-
 
46
    url_feedback_file = '/tmp/price-comp-dashboard/urls.json'
-
 
47
    secondary_crawl_file = '/tmp/price-comp-dashboard/secondary-crawl.json'
45
    
48
    
46
    def post(self):
49
    def post(self):
47
        self.url_feedback_file = '/tmp/price-comp-dashboard/urls.json'
-
 
48
        
-
 
49
        try:
50
        try:
50
            fp_read = open(self.url_feedback_file, 'r')
51
            fp_read = open(URLFeedbackHandler.url_feedback_file, 'r')
51
            urls = json.load(fp_read)
52
            urls = json.load(fp_read)
52
            
53
            
53
        except ValueError as e:
54
        except ValueError as e:
54
            print e
55
            print e
55
            urls = {}
56
            urls = {}
56
        finally:
57
        finally:
57
            fp_read.close()
58
            fp_read.close()
-
 
59
        
58
        print urls
60
        print 'Existing URLs: ', urls
59
        
61
        
60
        entity = self.get_argument('entity')
62
        entity = self.get_argument('entity')
61
        source = self.get_argument('source')
63
        source = self.get_argument('source')
62
        url = self.get_argument('url')
64
        url = self.get_argument('url')
63
        
65
        
64
        if entity in urls:
66
        if entity in urls:
65
            urls[entity][source] = url
67
            urls[entity][source] = url
66
        else:
68
        else:
67
            urls[entity] = {source: url}
69
            urls[entity] = {source: url}
68
        
70
        
-
 
71
        print 'New set of URLs: ', urls
69
        fp_write = open(self.url_feedback_file, 'w')
72
        fp_write = open(URLFeedbackHandler.url_feedback_file, 'w')
70
        json.dump(urls, fp_write, indent = 4)
73
        json.dump(urls, fp_write, indent = 4)
71
        fp_write.close()
74
        fp_write.close()
72
        
75
        
73
        #Scraping the page
76
        #Scraping the page
74
        scraper = getScraper(source)
77
        scraper = getScraper(source)
75
        data = scraper.getDataFromProductPage(url)
78
        productData = scraper.getDataFromProductPage(url)
-
 
79
        
-
 
80
        #Storing the data
-
 
81
        try:
-
 
82
            fp_read = open(URLFeedbackHandler.secondary_crawl_file, 'r')
-
 
83
            data = json.load(fp_read)
-
 
84
            
-
 
85
        except ValueError as e:
-
 
86
            print e
-
 
87
            data = {}
-
 
88
        finally:
-
 
89
            fp_read.close()
-
 
90
        
-
 
91
        if entity in data:
-
 
92
            data[entity][source] = productData
-
 
93
        else:
-
 
94
            data[entity] = {source: productData}
-
 
95
        
-
 
96
        print 'Secondary crawled data:', data
-
 
97
        
-
 
98
        fp_write = open(URLFeedbackHandler.secondary_crawl_file, 'w')
76
        index_builder = IndexBuilder([data], new_index = False)
99
        json.dump(data, fp_write, indent = 4)
77
        index_builder.build()
100
        fp_write.close()
-
 
101
        
-
 
102
        productData['entityId'] = entity
78
        self.write(data)
103
        self.write(productData)
79
    
104
    
80
    def get(self):
105
    def get(self):
-
 
106
        try:
81
        self.url_feedback_file = '/tmp/price-comp-dashboard/urls.json'
107
            fp_read = open(URLFeedbackHandler.secondary_crawl_file, 'r')
-
 
108
            data = json.load(fp_read)
-
 
109
            
-
 
110
        except ValueError as e:
-
 
111
            print e
-
 
112
            data = {}
-
 
113
        finally:
-
 
114
            fp_read.close()
-
 
115
        self.write(data)
82
 
116
 
83
class FeedbackHandler(BaseHandler):
117
class FeedbackHandler(BaseHandler):
84
    
118
    
85
    def save(self, entity, source, feedback_type, selected_item = None):
119
    def save(self, entity, source, feedback_type, selected_item = None):
86
        self.feedback_file = '/tmp/price-comp-dashboard/feedback.json'
120
        self.feedback_file = '/tmp/price-comp-dashboard/feedback.json'
Line 96... Line 130...
96
        feedbacks[entity][source] = {'type': feedback_type}
130
        feedbacks[entity][source] = {'type': feedback_type}
97
        
131
        
98
        if selected_item is not None:   feedbacks[entity][source]['selected_item'] = selected_item
132
        if selected_item is not None:   feedbacks[entity][source]['selected_item'] = selected_item
99
        
133
        
100
        file_to_write = open(self.feedback_file, 'w')
134
        file_to_write = open(self.feedback_file, 'w')
101
        file_to_write.write(json.dumps(feedbacks))
135
        json.dump(feedbacks, file_to_write, indent = 4)
102
        file_to_write.close()
136
        file_to_write.close()
103
        
137
        
104
    def post(self):
138
    def post(self):
105
        feedback_type = self.get_argument('type')
139
        feedback_type = self.get_argument('type')
106
        entity_id = self.get_argument('entityId')
140
        entity_id = self.get_argument('entityId')
Line 132... Line 166...
132
        search_urls = {
166
        search_urls = {
133
            'flipkart': 'http://www.flipkart.com/search-mobiles?query=$$&from=all&searchGroup=mobiles',
167
            'flipkart': 'http://www.flipkart.com/search-mobiles?query=$$&from=all&searchGroup=mobiles',
134
            'homeshop18': 'http://www.homeshop18.com/nokia%20n97/search:$$/categoryid:3024',
168
            'homeshop18': 'http://www.homeshop18.com/nokia%20n97/search:$$/categoryid:3024',
135
            'adexmart': 'http://adexmart.com/search.php?orderby=position&orderway=desc&search_query=$$',
169
            'adexmart': 'http://adexmart.com/search.php?orderby=position&orderway=desc&search_query=$$',
136
            'infibeam': 'http://www.infibeam.com/Mobiles/search?q=$$',
170
            'infibeam': 'http://www.infibeam.com/Mobiles/search?q=$$',
137
            'letsbuy': 'http://www.letsbuy.com/advanced_search_result.php?cPath=254&keywords=$$'
171
            'letsbuy': 'http://www.letsbuy.com/advanced_search_result.php?cPath=254&keywords=$$',
-
 
172
            'snapdeal': '$$'
138
        }
173
        }
139
        
174
        
140
        for key in search_urls.iterkeys():
175
        for key in search_urls.iterkeys():
141
            try:
176
            try:
142
                if map[key]['url'] == 'Not Found':
177
                if map[key]['url'] == 'Not Found':
Line 151... Line 186...
151
        self.loader = tornado.template.Loader('HTMLTemplates')
186
        self.loader = tornado.template.Loader('HTMLTemplates')
152
        catalog_client = CatalogClient().get_client()
187
        catalog_client = CatalogClient().get_client()
153
        items = catalog_client.getAllItemsByStatus(status.ACTIVE)
188
        items = catalog_client.getAllItemsByStatus(status.ACTIVE)
154
        items.extend(catalog_client.getAllItemsByStatus(status.PAUSED))
189
        items.extend(catalog_client.getAllItemsByStatus(status.PAUSED))
155
        items.extend(catalog_client.getAllItemsByStatus(status.PAUSED_BY_RISK))
190
        items.extend(catalog_client.getAllItemsByStatus(status.PAUSED_BY_RISK))
156
        synonyms = getSynonyms()
191
#        synonyms = getSynonyms()
157
        print synonyms
192
#        print synonyms
158
        retriever = Retriever()
193
        retriever = Retriever()
159
        products = {}
194
        products = {}
160
        
195
        
161
        for item in items:
196
        for item in items:
162
            if item.category in (10002, 10003, 10004, 10005, 10010):  products[item.catalogItemId] = item
197
            if item.category in (10002, 10003, 10004, 10005, 10010):  products[item.catalogItemId] = item
163
        
198
        
164
        comparative_prices = []
199
        comparative_prices = []
165
 
200
        
166
        for item in sorted(products.itervalues(), key = lambda item: item.brand):
201
        for item in sorted(products.itervalues(), key = lambda item: item.brand):
167
            try:
202
            try:
168
                model_name = item.modelName.strip() if len(item.modelName.strip()) > 0 else None
203
                model_name = item.modelName.strip() if len(item.modelName.strip()) > 0 else None
169
                model_number = item.modelNumber.strip() if len(item.modelNumber.strip()) > 0 else None
204
                model_number = item.modelNumber.strip() if len(item.modelNumber.strip()) > 0 else None
170
                
205
                
171
                synonyms_for_this_model = synonyms[item.catalogItemId] if item.catalogItemId in synonyms else None
206
                #synonyms_for_this_model = synonyms[item.catalogItemId] if item.catalogItemId in synonyms else None
172
                
207
                
173
                search_results = retriever.retrieve(model_number = model_number, model_name = model_name, brand = item.brand, synonyms = synonyms_for_this_model)
208
                search_results = retriever.retrieve(model_number = model_number, model_name = model_name, brand = item.brand, synonyms = None)
174
                
209
                
175
                clusters = getProductClusters(search_results)
210
                clusters = getProductClusters(search_results)
176
                filtered_clusters = getFilteredClustersWithTopScores(clusters)
211
                filtered_clusters = getFilteredClustersWithTopScores(clusters)
177
                
212
                
178
                product_name = "%s " % item.brand
213
                product_name = "%s " % item.brand
179
                product_name += "%s " % model_name if model_name is not None else ''
214
                product_name += "%s " % model_name if model_name is not None else ''
180
                product_name += model_number if model_number is not None else ''
215
                product_name += model_number if model_number is not None else ''
181
                
216
                
182
                display_info = getDisplayInfo(filtered_clusters, product_name)
217
                display_info = getDisplayInfo(filtered_clusters, product_name)
-
 
218
                print 'Display Info: ', display_info
183
                
219
                
184
                display_info['entity_id'] = item.catalogItemId
220
                display_info['entity_id'] = item.catalogItemId
185
                display_info['product_name'] = product_name
221
                display_info['product_name'] = product_name
186
                display_info['saholic'] = {'price': item.sellingPrice}
222
                display_info['saholic'] = {'price': item.sellingPrice}
187
                comparative_prices.append(display_info)
223
                comparative_prices.append(display_info)
188
            except Exception as e:
224
            except Exception as e:
189
                print 'Exception:', e
225
                print 'Exception:', e
190
        
226
        
-
 
227
        json.dump(comparative_prices, open('/tmp/pcd_log', 'w'), indent = 4)
191
        self.write(self.loader.load('PriceChart.html').generate(data = comparative_prices))
228
        self.write(self.loader.load('PriceChart.html').generate(data = comparative_prices))
-
 
229
        
-
 
230
class WatchlistHandler(BaseHandler):
192
 
231
    
-
 
232
    @tornado.web.authenticated
-
 
233
    def get(self):
-
 
234
        watchlistManager = WatchListManager()
-
 
235
        watchlist = watchlistManager.getWatchlist()
-
 
236
        print 'Getting watchlist: ', watchlist
-
 
237
        entityIds = []
-
 
238
        
-
 
239
        for id in watchlist:
-
 
240
            entityIds.append(int(id))
-
 
241
        
-
 
242
        self.write(str(entityIds))
-
 
243
    
-
 
244
    def post(self):
-
 
245
        watchlistManager = WatchListManager()
-
 
246
        
-
 
247
        requestType = self.get_argument('type').strip()
-
 
248
        entityId = self.get_argument('entity')
-
 
249
        
-
 
250
        print 'Request Type:', requestType, ', Entity Id: ', entityId
-
 
251
        
-
 
252
        if requestType == 'save':
-
 
253
            watchlistManager.save(entity = entityId)
-
 
254
            
-
 
255
        elif requestType == 'delete':
-
 
256
            watchlistManager.remove(entity = entityId)
-
 
257
        
-
 
258
        self.write("{}")
193
 
259
 
194
settings  = {
260
settings  = {
195
        'static_path': os.path.join(os.path.dirname(__file__), 'static'),
261
        'static_path': os.path.join(os.path.dirname(__file__), 'static'),
196
        'login_url': '/login', 
262
        'login_url': '/login', 
197
        'cookie_secret' :"61oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo="
263
        'cookie_secret' :"61oETzKXQAGaYdkL5gEmGeJJFuYh7EQnp2XdTP1o/Vo="
Line 200... Line 266...
200
application = tornado.web.Application([
266
application = tornado.web.Application([
201
                (r"/", MainHandler),
267
                (r"/", MainHandler),
202
                (r"/login", LoginHandler),
268
                (r"/login", LoginHandler),
203
                (r"/feedback", FeedbackHandler),
269
                (r"/feedback", FeedbackHandler),
204
                (r"/feedback-url", URLFeedbackHandler),
270
                (r"/feedback-url", URLFeedbackHandler),
-
 
271
                (r"/watchlist", WatchlistHandler),
205
                (r"/(jquery-1.6.2.min\.js)", tornado.web.StaticFileHandler, dict(path=settings['static_path']))
272
                (r"/(jquery-1.6.2.min\.js)", tornado.web.StaticFileHandler, dict(path=settings['static_path']))
206
            ], **settings)
273
            ], **settings)
207
 
274
 
208
if __name__ == '__main__':
275
if __name__ == '__main__':
209
    http_server = tornado.httpserver.HTTPServer(application)
276
    http_server = tornado.httpserver.HTTPServer(application)