Subversion Repositories SmartDukaan

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
5401 varun.gupt 1
'''
2
Created on 04-Jun-2012
3
 
4
@author: Varun Gupta
5
 
6
CSS selector support for BeautifulSoup.
7
 
8
soup = BeautifulSoup('<html>...')
9
select(soup, 'div')
10
- returns a list of div elements
11
 
12
select(soup, 'div#main ul a')
13
- returns a list of links inside a ul inside div#main
14
'''
15
 
16
import re
17
 
18
# Matches a token that is a bare tag name: one or more lowercase ASCII
# letters or digits and nothing else (so "h1" matches, "DIV" does not).
tag_re = re.compile('^[a-z0-9]+$')
19
 
20
# Recognises one attribute-selector token such as a[href], [rel~="x"] or
# img[src$=".png"]. Named groups: tag (optional), attribute, operator
# (one of = ~ | ^ $ * or empty) and value (possibly empty, without the
# surrounding double quotes).
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)'
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)
24
 
25
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
26
#   \---/  \---/\-------------/    \-------/
27
#     |      |         |               |
28
#     |      |         |           The value
29
#     |      |    ~,|,^,$,* or =
30
#     |   Attribute 
31
#    Tag (optional: the compiled pattern uses (?P<tag>\w+)?, so "[attr]" with no tag also matches)
32
 
33
def attribute_checker(operator, attribute, value=''):
    """
    Build a predicate that tests a single attribute of an element.

    operator  -- one of '=', '~', '^', '$', '*' or '|'. Any other value
                 (for example the empty string) selects a plain
                 "attribute is present" test.
    attribute -- name of the attribute to inspect.
    value     -- text to compare the attribute against; ignored by the
                 presence test.

    Returns a function that takes one element (any object with a
    dict-style get(name, default) method) and returns True when the
    element matches the operator/attribute/value combination.
    """
    # Unique marker that lets get() alone distinguish "attribute absent"
    # from "attribute present with any value, even None". This replaces
    # has_key(), which Python 3 mappings no longer provide.
    missing = object()

    return {
        # attribute is exactly value
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in el.get(attribute, '').split(),
        # attribute starts with value
        '^': lambda el: el.get(attribute, '').startswith(value),
        # attribute ends with value
        '$': lambda el: el.get(attribute, '').endswith(value),
        # attribute contains value
        '*': lambda el: value in el.get(attribute, ''),
        # attribute is either exactly value or starts with value-
        '|': lambda el: el.get(attribute, '') == value
            or el.get(attribute, '').startswith('%s-' % value),
    }.get(operator, lambda el: el.get(attribute, missing) is not missing)
52
 
53
 
54
def select(soup, selector):
    """
    Return the elements below soup that match a simple CSS selector.

    soup should be a BeautifulSoup instance (or any object offering the
    same find/findAll methods); selector is a CSS selector made of
    whitespace-separated tokens. Each token is applied to the descendants
    of the elements matched by the previous token. Recognised token forms:

        tag[attr]  tag[attr=value]  (also ~= ^= $= *= |=, tag optional)
        tag#id     #id
        tag.class  .class
        *
        tag

    Returns a list of matching elements. The list is empty when nothing
    matches or when a token is none of the forms above. An ID token yields
    at most one element. An empty selector returns [soup].
    """
    current_context = [soup]
    for token in selector.split():
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                # findAll(True) matches every tag when no tag name was given.
                found.extend(
                    [el for el in context.findAll(tag or True) if checker(el)]
                )
            current_context = found
            continue
        if '#' in token:
            # ID selector
            tag, element_id = token.split('#', 1)
            # Look in every context, not just the first one: the id may sit
            # under any of the elements matched so far, and the context
            # list may be empty if an earlier token matched nothing.
            match = None
            for context in current_context:
                match = context.find(tag or True, {'id': element_id})
                if match is not None:
                    break
            if match is None:
                return []  # No match
            current_context = [match]
            continue
        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)

            def has_class(attr, wanted=klass):
                # attr is the raw class attribute (None when absent).
                return attr and wanted in attr.split()

            found = []
            for context in current_context:
                found.extend(context.findAll(tag or True, {'class': has_class}))
            current_context = found
            continue
        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue
        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context
113
 
114
def monkeypatch(BeautifulSoupClass=None):
    """
    Attach select to a BeautifulSoup class as its findSelect attribute.

    When no class is passed, the BeautifulSoup class from the
    BeautifulSoup module is imported and patched.
    """
    target = BeautifulSoupClass
    if not target:
        from BeautifulSoup import BeautifulSoup as target
    setattr(target, 'findSelect', select)
122
 
123
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the findSelect attribute from a BeautifulSoup class.

    When no class is passed, the BeautifulSoup class from the
    BeautifulSoup module is imported and used.
    """
    target = BeautifulSoupClass
    if not target:
        from BeautifulSoup import BeautifulSoup as target
    del target.findSelect