Subversion Repositories SmartDukaan

Rev

Rev 4110 | Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4106 varun.gupt 1
'''
2
Created on 19-Nov-2011
3
 
4
@author: Varun Gupta
5
'''
6
from BaseScraper import BaseScraper
7
from BeautifulSoup import BeautifulSoup
8
from FileUtils import Writer
9
 
10
import json
11
 
12
class MySmartPrice(BaseScraper):
    """Scraper for mysmartprice.com mobile-phone price listings.

    Walks the per-brand price-list pages, collects the URL of each phone's
    detail page, and extracts the price offered by every whitelisted retailer.
    """

    def __init__(self):
        # Index page that links to every brand's price list.
        self.url_brand_list = 'http://www.mysmartprice.com/mobile/pricelist/'
        # Offers from any retailer not in this list are silently dropped.
        self.source_whitelist = ['adexmart', 'flipkart', 'homeshop18', 'infibeam', 'letsbuy', 'saholic']

    def getSourceName(self, url):
        """Return the whitelisted retailer name embedded in *url*, or None.

        A retailer matches when its name occurs anywhere in the URL string.
        """
        for source in self.source_whitelist:
            if source in url:
                return source
        return None

    def getSaholicEntityId(self, map):
        """Return the Saholic entity id parsed from the Saholic offer URL.

        The id is the last '-'-separated token of the offer URL (as produced
        by getPhonePrices).  Returns None when there is no Saholic entry, the
        URL is missing, or the offer was unavailable.
        """
        try:
            url = map['saholic']['url']
        except KeyError:
            return None
        # BUG FIX: unavailable offers store the sentinel string 'Not Found'
        # as the URL; previously that sentinel passed the None check and was
        # returned verbatim as a bogus entity id (the caller then wrote a
        # file literally named 'Not Found').
        if url is None or url == 'Not Found':
            return None
        return url.split('-')[-1]

    def getBrandURLs(self):
        """Return the relative URL of every brand price-list page."""
        urls = []
        html = BaseScraper.read(self, self.url_brand_list)
        soup = BeautifulSoup(html)
        # Brand links live in a table nested inside the left-hand column.
        listing = soup.find('div', {'class': 'msp_left'}).find('table').find('table')
        for td in listing.findAll('td', {'width': "300px"}):
            urls.append(str(td.find('a')['href']))
        return urls

    def getPhoneURLsForBrand(self, url_brand):
        """Return the phone detail-page URLs listed on one brand page.

        *url_brand* is the relative path returned by getBrandURLs.
        """
        urls = []
        url_brand = 'http://www.mysmartprice.com' + url_brand
        html = BaseScraper.read(self, url_brand)
        soup = BeautifulSoup(html)
        for div in soup.findAll('div', {'class': 'item'}):
            a = div.find('a')
            # Some 'item' divs are placeholders without a link; skip them.
            if a is not None:
                urls.append(str(a['href']))
        return urls

    def getPhonePrices(self, url):
        """Scrape one phone detail page and return per-retailer offers.

        Returns a dict keyed by whitelisted retailer name, each value being
        {'is_available': bool, 'price': str, 'url': str}.  Unavailable offers
        carry the sentinel 'Not Found' for both price and url.
        """
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        offers = {}
        for div in soup.findAll('div', {'class': 'pt_row'}):
            # The retailer link is wrapped in a redirect; the real target
            # follows the '?url=' query parameter.
            offer_url = div.find('td', {'width': '140px'}).find('a')['href'].split('?url=')[-1].strip()
            td_price = div.find('td', {'width': '135px'})
            price = None
            if td_price.string is None:
                # A nested <b> tag (which makes .string None) means a
                # concrete price is displayed.
                is_available = True
                price = td_price.find('b').string.strip()
            else:
                is_available = False
            source = self.getSourceName(offer_url)
            if source is not None:
                offers[source] = {
                        'is_available': is_available,
                        'price': price if is_available else 'Not Found',
                        'url': offer_url if is_available else 'Not Found'
                }
        return offers
77
 
78
if __name__ == '__main__':
    scraper = MySmartPrice()
    # Brand list is pinned here instead of calling the scraper's brand
    # discovery, so a layout change on the index page cannot silently drop
    # every brand.  (Single-argument print(...) is valid in Python 2 too.)
#    brand_urls = scraper.getBrands()
    brand_urls = [
                  '/mobile/pricelist/nokia-mobile-price-list-in-india.html',
                  '/mobile/pricelist/samsung-mobile-price-list-in-india.html',
                  '/mobile/pricelist/blackberry-mobile-price-list-in-india.html',
                  '/mobile/pricelist/lg-mobile-price-list-in-india.html',
                  '/mobile/pricelist/sony-ericsson-mobile-price-list-in-india.html',
                  '/mobile/pricelist/micromax-mobile-price-list-in-india.html',
                  '/mobile/pricelist/motorola-mobile-price-list-in-india.html',
                  '/mobile/pricelist/htc-mobile-price-list-in-india.html',
                  '/mobile/pricelist/apple-mobile-price-list-in-india.html',
                  '/mobile/pricelist/spice-mobile-price-list-in-india.html',
                  '/mobile/pricelist/karbonn-mobile-price-list-in-india.html',
                  '/mobile/pricelist/lava-mobile-price-list-in-india.html']
    phone_urls = []

    for brand_url in brand_urls:
        try:
            print(brand_url)
            phone_urls.extend(scraper.getPhoneURLsForBrand(brand_url))
        except Exception as e:
            # Best-effort crawl: one broken brand page must not abort the run.
            print(e)

    print(len(phone_urls))

    # NOTE(review): file_writer is constructed but never used below —
    # presumably kept for its constructor side effects; confirm against
    # FileUtils.Writer before removing.
    file_writer = Writer()

    for url in phone_urls:
        print(url)
        price_map = scraper.getPhonePrices(url)
        saholic_id = scraper.getSaholicEntityId(price_map)
        print(price_map)
        print(saholic_id)

        if saholic_id is not None:
            file_path = "/tmp/msp_dir/%s" % saholic_id
            # BUG FIX: the handle was previously opened and never closed
            # (leak, no flush guarantee on error); `with` closes it even if
            # json.dump raises.  The old `if file_to_write is None` branch
            # was dead code — open() raises on failure, it never returns None.
            with open(file_path, "w") as file_to_write:
                json.dump(price_map, file_to_write, indent = 4)