Subversion Repositories SmartDukaan

Rev

Rev 6024 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
5639 amar.kumar 1
from BeautifulSoup import BeautifulSoup
2
from BaseScraper import BaseScraper
3
from Utils import removePriceFormatting
4
 
5
import time
6
 
7
class TradusScraper(BaseScraper):
    """Scraper for product listing pages and product pages on www.tradus.com.

    Listing flow as used by the methods below: ``setUrl`` selects a listing
    page, ``scrape`` downloads and parses it, ``getPhones`` extracts one
    record per product and ``getNextUrl`` yields the URL of the next page.
    ``getDataFromProductPage`` handles a single product URL on its own.
    """

    # Paging/batching settings. Nothing in this class reads them; presumably
    # the code that drives the scraper does -- verify against the callers.
    mobilePageCount = 32
    tabletPageCount = 21
    productCountPerScraping = 20
    mobileCurrentPage = 0
    tabletCurrentPage = 0

    def __init__(self):
        """Initialise the base scraper and clear the target URL and id."""
        BaseScraper.__init__(self)
        self.url = None
        self.id = None

    def setUrl(self, url):
        """Set the listing page URL that ``scrape`` will download."""
        self.url = url

    def scrape(self):
        """Download ``self.url`` and parse it into ``self.soup``.

        Also resets ``self.phones`` so results of a previous page are not
        mistaken for results of this one.
        """
        html = BaseScraper.read(self, self.url)
        self.soup = BeautifulSoup(html)
        self.phones = None

    def getPhones(self):
        """Extract one record per product from the page parsed by ``scrape``.

        Returns:
            A list of dicts with the keys ``name``, ``price``, ``source``,
            ``product_url`` and ``in_stock``. The same list is stored in
            ``self.phones``.

        A listing that cannot be parsed is reported with ``print`` and
        skipped, so one malformed entry does not abort the whole page.
        Listings whose price is empty after cleaning are skipped too.
        """
        phones = []
        for div in self.soup.findAll('div', {'class': 'prod_main_div'}):
            try:
                phone = self._extractListing(div)
            except Exception as e:
                # Best effort: report the broken listing and carry on.
                print(e)
                continue
            if phone is not None:
                phones.append(phone)
        self.phones = phones
        return phones

    def _extractListing(self, div):
        """Build the record for one listing ``div``; None if it has no price."""
        # Name and URL are read before the price so that a record can never
        # combine this listing's price with another listing's name/URL.
        link = div.find(
            'div', {'class': 'product_name search-product-block'}).contents[1]
        name = link.string
        product_url = "http://www.tradus.com" + link['href']

        price = removePriceFormatting(self._extractListingPrice(div))
        if not price:
            return None

        return {
            'name': str(name),
            'price': price,
            'source': 'tradus',
            'product_url': str(product_url),
            'in_stock': 1,
        }

    def _extractListingPrice(self, div):
        """Return the raw price text of a listing ``div``."""
        priceSpan = div.find('span', {'class': 'numDiv_left'})
        if priceSpan is not None and priceSpan.string is not None:
            return priceSpan.string.strip()

        # Alternate markup. The slice drops the first 18 characters of the
        # text -- presumably a fixed label in front of the amount; confirm
        # against a live page before changing it.
        rateDiv = div.find('div', {'class': 'mainresult-show-right-startrate'})
        return rateDiv('span')[0].contents[0].strip()[18:]

    def setPageCount(self):
        """Advance ``self.currentPage`` by one, starting from 0 if unset."""
        # This class never initialises ``currentPage``; default to 0 instead
        # of raising AttributeError. A value set elsewhere (e.g. by the base
        # class, if it does so) is still honoured.
        self.currentPage = getattr(self, 'currentPage', 0) + 1

    def getNextUrl(self):
        """Return the URL of the next listing page, or None if there is none.

        The next URL is ``self.url`` with the number following ``page=``
        incremented; anything after that number is kept unchanged. None is
        returned when the parsed page has no "more" link, when ``self.url``
        has no ``page=`` parameter, or when no digits follow ``page=``.
        """
        # Pause between page requests.
        time.sleep(1)

        if self.soup.find('a', {'class': 'filter-link more'}) is None:
            return None

        marker = 'page='
        markerPos = self.url.find(marker)
        if markerPos == -1:
            return None

        # Parse only the run of digits so a trailing query string such as
        # "&sort=x" does not break the integer conversion.
        start = markerPos + len(marker)
        end = start
        while end < len(self.url) and self.url[end].isdigit():
            end += 1
        if end == start:
            return None

        nextPage = int(self.url[start:end]) + 1
        return self.url[:start] + str(nextPage) + self.url[end:]

    def getDataFromProductPage(self, url):
        """Download a product page and return its details.

        Args:
            url: address of the product page.

        Returns:
            A dict with the keys ``product_url``, ``source``, ``price``,
            ``in_stock`` and ``name``.

        Raises:
            AttributeError: if the page lacks the expected name or price
                element (``find`` returns None).
        """
        html = BaseScraper.read(self, url)
        soup = BeautifulSoup(html)
        name = soup.find('span', {'itemprop': 'name'}).string.strip()
        price = soup.find('span', {'class': 'mrp3'}).contents[0].strip()
        price = removePriceFormatting(price)
        # Stock status is not read from the page; it is always reported as 1.
        in_stock = 1

        return {
            "product_url": str(url),
            "source": "tradus",
            "price": price,
            "in_stock": in_stock,
            "name": name,
        }
112
 
6022 amar.kumar 113
def removePriceFormatting(price_string):
    """Reduce a displayed price such as ``'Rs. 1,234.00'`` to ``'1234'``.

    Removes the ``Rs.``/``Rs`` currency marker, thousands separators and
    all whitespace, then drops everything from the first decimal point on.

    Args:
        price_string: price text as shown on the page.

    Returns:
        The remaining text before the first ``.``; an empty string when
        nothing is left after cleaning.
    """
    cleaned = price_string.strip()
    # 'Rs.' must go before 'Rs', otherwise its dot would survive and the
    # final split would cut the value off at that dot.
    for token in ('Rs.', 'Rs', ','):
        cleaned = cleaned.replace(token, '')
    # Remove every kind of whitespace, not only plain spaces: tabs and (for
    # unicode text) non-breaking spaces from &nbsp; occur in page markup.
    cleaned = ''.join(cleaned.split())
    return cleaned.split('.')[0]
115
 
5639 amar.kumar 116
if __name__ == '__main__':
    # Manual smoke test: fetch one known product page and show what the
    # scraper extracts from it.
    scraper = TradusScraper()
    details = scraper.getDataFromProductPage(
        'http://www.tradus.com/zing-q800-dual-sim-mobile-phone/p/MOB0000004506663')
    print(details)