| 4198 |
varun.gupt |
1 |
'''
|
|
|
2 |
Created on 05-Dec-2011
|
|
|
3 |
@author: Varun Gupta
|
|
|
4 |
'''
|
| 5639 |
amar.kumar |
5 |
from Scrapers.TradusScraper import TradusScraper
|
|
|
6 |
from Scrapers.SulekhaScraper import SulekhaScraper
|
| 4198 |
varun.gupt |
7 |
|
|
|
8 |
class URL:
    '''
    A crawl target: the raw address together with a short site label.

    Attributes:
        url    -- the address exactly as it was handed to the constructor.
        source -- the label computed by getSource(url), or None when no
                  label could be derived.
    '''

    def __init__(self, url):
        '''
        Store the address and derive its site label once, up front.
        '''
        self.url = url
        self.source = URL.getSource(url)

    @staticmethod
    def getSource(url):
        '''
        Return the site label of an address, e.g. 'tradus' for
        'http://www.tradus.com/search/?page=0'.

        The label is taken from the host part only:
          * three or more host labels ('www.site.com', 'mobiles.site.com')
            -> the second label, as before;
          * exactly two host labels ('site.com') -> the first label.
            Splitting the whole address on '.' used to return text such as
            'com/mobiles' for these addresses.

        Returns None when the argument is not a string or its host has no
        dot in it.
        '''
        try:
            scheme, separator, remainder = url.partition('://')
            # A '://' that only shows up inside the path or query string
            # (e.g. a redirect parameter) is not a scheme separator.
            if not separator or '/' in scheme or '?' in scheme:
                remainder = url

            # Cut the path, query string and fragment away from the host.
            authority = remainder
            for delimiter in '/?#':
                authority = authority.split(delimiter, 1)[0]

            # Drop optional 'user:password@' and ':port' decorations.
            host = authority.rsplit('@', 1)[-1].split(':', 1)[0]
            labels = [label.strip() for label in host.split('.')]

            if len(labels) < 2:
                return None
            if len(labels) == 2:
                return str(labels[0]) or None
            return str(labels[1])
        except (AttributeError, TypeError, UnicodeError):
            # Non-string input (None, numbers, ...) or a label that cannot be
            # converted with str(): report "unknown" instead of raising.
            return None

    def __str__(self):
        '''
        Return '<url> (<source>)'.
        '''
        return '%s (%s)' % (self.url, self.source)

    def __unicode__(self):
        '''
        Return the same '<url> (<source>)' text for unicode() callers.
        '''
        return '%s (%s)' % (self.url, self.source)

    def __repr__(self):
        '''
        Return a readable form so that printed lists of URL objects show the
        addresses rather than anonymous instance identifiers.
        '''
        return 'URL(%r)' % (self.url,)
|
| 4198 |
varun.gupt |
26 |
|
|
|
27 |
class URLQueue:
    '''
    First-in, first-out queue of URL objects, pre-filled with seed addresses.

    Attributes:
        urls -- plain list of pending URL objects; the head of the queue is
                index 0.
    '''

    # Seed addresses, in the order in which they are handed out by get().
    SEED_URLS = (
        'http://www.tradus.com/search/tradus_search/?query=camera&cat=7668&page=0',
        'http://www.flipkart.com/mobiles/all/',
        'http://www.snapdeal.com/json/product/get/search/175/0/20?q=&sort=plrty&keyword=',
        'http://www.tradus.com/search/tradus_search/?query=mobile&cat=7844&page=0',
        'http://www.infibeam.com/Mobiles/search?page=1',
        'http://www.homeshop18.com/gsm-handsets/category:3027/',
        'http://www.tradus.com/search/tradus_search/?query=camera&cat=7671&page=0',
        'http://www.flipkart.com/mobiles/tablet-20278',
        'http://www.homeshop18.com/ipads-2f-tablets/category:8937/',
        'http://www.flipkart.com/cameras/all-camcorder/',
        'http://www.tradus.com/search/tradus_search/?query=mobile&cat=7759&page=0',
        'http://www.snapdeal.com/json/product/get/search/133/0/20?q=&sort=plrty&keyword=',
        'http://mobiles.sulekha.com/common/common.aspx?type=mobileofferslist&makeId=0&modelId=0&pageNo=',
        'http://www.tradus.com/search/tradus_search/?query=camera&cat=7670&page=0',
        'http://www.homeshop18.com/digital-cameras/category:3178/',
        'http://www.flipkart.com/cameras/all-slr',
        'http://www.infibeam.com/Cameras/search?page=1',
        'http://www.tradus.com/search/tradus_search/?query=tablets&cat=7762&page=0',
        'http://www.flipkart.com/cameras/all-point-shoot',
        'http://www.homeshop18.com/digital-slrs/category:3188/',
        # NOTE(review): same address as the cat=7759 entry above, so it is
        # queued twice. Kept to preserve behaviour; confirm it is intended.
        'http://www.tradus.com/search/tradus_search/?query=mobile&cat=7759&page=0',
        # Disabled seed:
        # 'http://www.adexmart.com/modules/coremanager/modules/filtersearch/filtersearch.json.php?act=filter&ident=16&page=1&perpage=1000&orderby=newest&orderway=desc',
    )

    def __init__(self):
        '''
        Reset the page counters of the paginated scrapers and load the seeds.

        Side effect: sets the class attribute currentPage to 1 on both
        TradusScraper and SulekhaScraper.
        '''
        TradusScraper.currentPage = 1
        SulekhaScraper.currentPage = 1
        self.urls = [URL(address) for address in self.SEED_URLS]

    def enqueue(self, url):
        '''
        Append an address to the tail of the queue.

        url may be a URL instance (appended as is) or anything URL() accepts
        (wrapped first). None is ignored.
        '''
        if url is None:
            return

        # isinstance also accepts subclasses of URL; an exact class comparison
        # would wrap such an instance in a second URL object.
        new_url = url if isinstance(url, URL) else URL(url)

        # Single-argument print(...) emits the same text under Python 2's
        # print statement and Python 3's print function.
        print('Enqueue %s' % (new_url,))
        self.urls.append(new_url)
        print('New URL set: %s' % (self.urls,))

    def get(self):
        '''
        Remove and return the entry at the head of the queue.

        Returns None when the queue is empty.
        '''
        print('Count of URLs in queue: %s' % len(self.urls))

        # One emptiness check is enough; pop(0) cannot fail on a non-empty list.
        url = self.urls.pop(0) if self.urls else None

        print('Poping %s' % (url,))
        print('New URL set: %s' % (self.urls,))
        return url
|