Subversion Repositories SmartDukaan

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
13566 amit.gupta 1
"""
2
soupselect.py
3
 
4
CSS selector support for BeautifulSoup.
5
 
6
soup = BeautifulSoup('<html>...')
7
select(soup, 'div')
8
- returns a list of div elements
9
 
10
select(soup, 'div#main ul a')
11
- returns a list of links inside a ul inside div#main
12
 
13
"""
14
 
15
import re
16
 
17
# Plain tag-name token: lower-case ASCII letters and digits only. select()
# returns an empty list for any leftover token that does not match this.
tag_re = re.compile('^[a-z0-9]+$')
18
 
19
# Matches a single attribute-selector token such as a[href^="http"] or
# [data-id=5]. The attribute name may contain hyphens (data-*, aria-*,
# http-equiv); with a bare \w+ the name stopped at the first hyphen and the
# remainder was swallowed into the value group.
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)

# /^(\w+)?\[([\w-]+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/   \------/\-------------/    \-------/
#     |        |           |               |
#     |        |           |           The value
#     |        |      ~,|,^,$,* or =
#     |    Attribute
#    Tag (optional)
31
 
32
def attribute_checker(operator, attribute, value=''):
    """
    Build a predicate that tests one attribute of an element.

    operator is the attribute-selector operator: '=', '~', '^', '$', '*' or
    '|'. Any other operator (including the empty string) selects a plain
    "attribute is present" test. attribute is the attribute name and value
    is the text to compare against.

    Returns a one-argument function; it takes an element exposing a
    dict-style ``get(name, default)`` method and returns a true value when
    the element matches that combination.
    """
    # Private sentinel: lets the presence test tell "attribute absent" apart
    # from "attribute present but None or empty".
    missing = object()

    def is_present(el):
        # Same answer as el.has_key(attribute), but relies only on get(), so
        # it also works on mappings that have no has_key method.
        return el.get(attribute, missing) is not missing

    checkers = {
        # attribute is exactly value
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in el.get(attribute, '').split(),
        # attribute starts with value
        '^': lambda el: el.get(attribute, '').startswith(value),
        # attribute ends with value
        '$': lambda el: el.get(attribute, '').endswith(value),
        # attribute contains value
        '*': lambda el: value in el.get(attribute, ''),
        # attribute is either exactly value or starts with value-
        '|': lambda el: el.get(attribute, '') == value
            or el.get(attribute, '').startswith('%s-' % value),
    }
    return checkers.get(operator, is_present)
51
 
52
 
53
def select(soup, selector):
    """
    Return a list of the elements under soup that match a CSS selector.

    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.

    The selector is split on whitespace and each token narrows the result of
    the previous one. A token may be an attribute selector (a[href^="x"]),
    an ID selector (div#main), a class selector (p.intro), the star selector
    (*) or a plain lower-case tag name. A token of any other shape, or an ID
    that cannot be found, yields an empty list. An empty selector returns
    [soup].
    """
    current_context = [soup]
    for token in selector.split():
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            checker = attribute_checker(operator, attribute, value)
            # findAll(True) matches every tag when no tag name was given
            current_context = [
                el
                for context in current_context
                for el in context.findAll(tag or True)
                if checker(el)
            ]
            continue
        if '#' in token:
            # ID selector
            tag, element_id = token.split('#', 1)
            # Look through every context, not only the first one; this also
            # copes with an empty context left behind by an earlier token.
            match = None
            for context in current_context:
                match = context.find(tag or True, {'id': element_id})
                if match is not None:
                    break
            if match is None:
                return []  # No match
            current_context = [match]
            continue
        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            found = []
            for context in current_context:
                found.extend(
                    context.findAll(
                        tag or True,
                        {'class': lambda attr: attr and klass in attr.split()}
                    )
                )
            current_context = found
            continue
        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue
        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context
112
 
113
def monkeypatch(BeautifulSoupClass=None):
    """
    Install select() on a BeautifulSoup class under the name findSelect.

    If you don't explicitly state the class to patch, defaults to the most
    common import location for BeautifulSoup.
    """
    # Compare against None rather than testing truthiness: only an omitted
    # argument should trigger the default import.
    if BeautifulSoupClass is None:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    BeautifulSoupClass.findSelect = select
121
 
122
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the findSelect attribute from a BeautifulSoup class.

    If you don't explicitly state the class to restore, defaults to the most
    common import location for BeautifulSoup. Raises AttributeError when the
    class has no findSelect attribute of its own.
    """
    # Compare against None rather than testing truthiness: only an omitted
    # argument should trigger the default import.
    if BeautifulSoupClass is None:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    delattr(BeautifulSoupClass, 'findSelect')