Subversion Repositories SmartDukaan

Rev

Rev 6022 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
5639 amar.kumar 1
from BeautifulSoup import BeautifulSoup
2
from BaseScraper import BaseScraper
3
from Utils import removePriceFormatting
4
 
5
import time
6
 
7
class TradusScraper(BaseScraper):
    """Scraper for tradus.com mobile and tablet search-result pages.

    Typical use, as far as this class shows: call setUrl(), then scrape()
    to fetch and parse the page, then getPhones() to extract the listings,
    and getNextUrl() to obtain the next search page to visit.
    """

    # Highest search-result page number getNextUrl will hand out per category.
    mobilePageCount = 32
    tabletPageCount = 21
    # Not referenced anywhere in this class; presumably read by the caller or
    # the base class -- TODO confirm.
    productCountPerScraping = 20
    # Page cursors. getNextUrl advances them on the class itself (not on the
    # instance), so every TradusScraper in the process shares them.
    mobileCurrentPage = 0
    tabletCurrentPage = 0

    def __init__(self):
        """Initialise the base scraper and clear the target URL and id."""
        BaseScraper.__init__(self)
        self.url = None
        self.id = None

    def setUrl(self, url):
        """Set the URL that the next scrape() call will read."""
        self.url = url

    def scrape(self):
        """Read self.url via BaseScraper.read and parse it into self.soup.

        Also resets self.phones to None; getPhones() repopulates it.
        """
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
        self.phones = None
        # setPageCount() is deliberately not called here (the call was
        # already disabled in the original code).

    def getPhones(self):
        """Extract product listings from the page parsed by scrape().

        Returns a list of dicts with the keys 'name', 'price', 'source',
        'product_url' and 'in_stock', and also stores it on self.phones.
        Listings that cannot be parsed are reported with print and skipped.
        """
        phones = []
        for div in self.soup.findAll('div', {'class': 'prod_main_div'}):
            # Reset per listing so the fallback below can never pick up the
            # name/URL that belonged to the previous listing.
            name = None
            product_url = None
            try:
                productUrlContainer = div.find('div', {'class': 'product_name search-product-block'})
                productLink = productUrlContainer.contents[1]
                name = productLink.string
                product_url = "http://www.tradus.com" + productLink['href']
                price = div.find('span', {'class': 'numDiv_left'}).string.strip()
                price = removePriceFormatting(price)
            except IndexError as iex:
                if name is None or product_url is None:
                    # The product link itself could not be located, so there
                    # is nothing valid to attach a fallback price to.
                    print(iex)
                    continue
                try:
                    # Alternate price markup; the first 18 characters of the
                    # text are dropped before the price is cleaned up.
                    price = div.find('div', {'class': 'mainresult-show-right-startrate'})('span')[0].contents[0].strip()[18:]
                    price = removePriceFormatting(price)
                except Exception as ex:
                    print(ex)
                    continue
            except Exception as e:
                print(e)
                continue

            if price is None:
                continue
            in_stock = 1
            try:
                phones.append({
                    'name': str(name),
                    'price': price,
                    'source': 'tradus',
                    'product_url': str(product_url),
                    'in_stock': in_stock
                })
            except Exception as e:
                # str() can fail on non-ASCII text under Python 2; report the
                # problem and move on to the next listing.
                print(e)
        self.phones = phones
        return phones

    def setPageCount(self):
        """Increment the per-instance currentPage counter.

        Nothing in this class initialises currentPage, so a missing value is
        treated as 0. This counter is separate from the class-level
        mobileCurrentPage / tabletCurrentPage cursors used by getNextUrl.
        """
        self.currentPage = getattr(self, 'currentPage', 0) + 1

    def getNextUrl(self):
        """Return the next search-result URL for the current category.

        The category is chosen by looking for "mobile" or "tablets" in
        self.url. Returns None when the category's page limit has been
        reached, when the URL matches neither category, or when no URL has
        been set. Sleeps for one second before doing anything else.
        """
        time.sleep(1)
        # self.url is None until setUrl() is called; treat that as "no match".
        currentUrl = self.url or ''
        if "mobile" in currentUrl:
            if TradusScraper.mobileCurrentPage >= TradusScraper.mobilePageCount:
                return None
            TradusScraper.mobileCurrentPage += 1
            return 'http://www.tradus.com/search/tradus_search/?query=mobile&filters=cat:7756&page=%s' % (TradusScraper.mobileCurrentPage)
        if "tablets" in currentUrl:
            if TradusScraper.tabletCurrentPage >= TradusScraper.tabletPageCount:
                return None
            TradusScraper.tabletCurrentPage += 1
            return 'http://www.tradus.com/search/tradus_search/?query=tablets&filters=cat:7756&cat:7762&page=%s' % (TradusScraper.tabletCurrentPage)
        return None

    def getDataFromProductPage(self, url):
        """Read a single product page and return its details as a dict.

        The dict has the keys 'product_url', 'source', 'price', 'in_stock'
        and 'name'. An AttributeError propagates if the name or price element
        is not found on the page (find() returns None in that case).
        """
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('span', {'itemprop': 'name'}).string.strip()
        price = soup.find('span', {'class': 'mrp3'}).contents[0].strip()
        price = removePriceFormatting(price)
        in_stock = 1

        data = {
            "product_url": str(url),
            "source": "tradus",
            "price": price,
            "in_stock": in_stock,
            "name": name
        }
        return data
115
 
6022 amar.kumar 116
def removePriceFormatting(price_string):
    """Strip currency markers and separators from a price string.

    Removes, in this order, 'Rs.', 'Rs', ',', ' ' and '&nbsp;' from the
    stripped input, then drops everything from the first '.' onwards, so
    'Rs. 1,299.00' becomes '1299'. The result is returned as a string.

    None is passed through unchanged so callers can test the result with
    ``is None`` instead of having to catch an AttributeError.
    """
    if price_string is None:
        return None
    cleaned = price_string.strip()
    # Order matters: 'Rs.' must go before 'Rs' so the dot is removed with it.
    for token in ('Rs.', 'Rs', ',', ' ', '&nbsp;'):
        cleaned = cleaned.replace(token, '')
    return cleaned.split('.')[0]
118
 
5639 amar.kumar 119
if __name__ == '__main__':
    # Manual smoke test: read one known product page and show what was parsed.
    s = TradusScraper()
    data = s.getDataFromProductPage('http://www.tradus.com/zing-q800-dual-sim-mobile-phone/p/MOB0000004506663')
    # Single-argument print(...) gives the same output under Python 2 and
    # also parses under Python 3.
    print(data)