# Repository-viewer residue kept from the original paste:
# Blame | Last modification | View Log | RSS feed
'''
Created on 04-Jun-2012

@author: Varun Gupta

CSS selector support for BeautifulSoup.

soup = BeautifulSoup('<html>...')

select(soup, 'div')
- returns a list of div elements

select(soup, 'div#main ul a')
- returns a list of links inside a ul inside div#main
'''
import re

# A plain tag token: lowercase letters and digits only.
tag_re = re.compile('^[a-z0-9]+$')

attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/  \---/\-------------/    \-------/
#     |      |         |               |
#     |      |         |           The value
#     |      |    ~,|,^,$,* or =
#     |   Attribute
#    Tag

# Unique marker used to tell "attribute absent" apart from any real value.
_MISSING = object()


def attribute_checker(operator, attribute, value=''):
    """Build a predicate for one CSS attribute selector.

    Takes an operator, attribute and optional value; returns a function
    that will return True for elements that match that combination.

    ``operator`` is one of ``=``, ``~``, ``^``, ``$``, ``*`` or ``|``; any
    other value (including the empty string) yields a predicate that only
    checks that the attribute is present.  Elements are accessed solely
    through ``el.get(name[, default])``.
    """
    def attr_of(el):
        # Missing attributes are treated as the empty string.
        return el.get(attribute, '')

    def has_attribute(el):
        # ``has_key`` is gone from Python 3 mappings and deprecated in
        # Beautiful Soup 4; a sentinel lookup through ``get`` is portable.
        return el.get(attribute, _MISSING) is not _MISSING

    checkers = {
        # attribute is exactly value
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in attr_of(el).split(),
        # attribute starts with value
        '^': lambda el: attr_of(el).startswith(value),
        # attribute ends with value
        '$': lambda el: attr_of(el).endswith(value),
        # attribute contains value
        '*': lambda el: value in attr_of(el),
        # attribute is either exactly value or starts with value-
        '|': lambda el: (attr_of(el) == value
                         or attr_of(el).startswith('%s-' % value)),
    }
    return checkers.get(operator, has_attribute)


def select(soup, selector):
    """Return the list of elements under ``soup`` matching ``selector``.

    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.

    The selector is split on whitespace and each token narrows the current
    set of context elements (descendant combinator).  Supported tokens are
    ``tag``, ``*``, ``tag#id`` / ``#id``, ``tag.class`` / ``.class`` and
    ``tag[attr]`` / ``tag[attr<op>value]``.  A token that is none of these
    makes the whole selection return ``[]``.
    """
    current_context = [soup]
    for token in selector.split():
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                # ``True`` asks findAll for every tag when none was named.
                found.extend(el for el in context.findAll(tag or True)
                             if checker(el))
            current_context = found
            continue

        if '#' in token:
            # ID selector
            tag, element_id = token.split('#', 1)
            # Search every context, not just the first one, and cope with
            # an empty context list left behind by a token that matched
            # nothing (indexing [0] used to raise IndexError there).
            for context in current_context:
                el = context.find(tag or True, {'id': element_id})
                if el is not None:
                    break
            else:
                return []  # No match
            current_context = [el]
            continue

        if '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            found = []
            for context in current_context:
                found.extend(context.findAll(
                    tag or True,
                    {'class': lambda attr: attr and klass in attr.split()}
                ))
            current_context = found
            continue

        if token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
            current_context = found
            continue

        # Here we should just have a regular tag
        if not tag_re.match(token):
            return []
        found = []
        for context in current_context:
            found.extend(context.findAll(token))
        current_context = found
    return current_context


def monkeypatch(BeautifulSoupClass=None):
    """Attach ``select`` to a soup class as the ``findSelect`` method.

    If you don't explicitly state the class to patch, defaults to the most
    common import location for BeautifulSoup.
    """
    if not BeautifulSoupClass:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    BeautifulSoupClass.findSelect = select


def unmonkeypatch(BeautifulSoupClass=None):
    """Remove the ``findSelect`` attribute added by ``monkeypatch``.

    Uses the same default class as ``monkeypatch`` when none is given.
    """
    if not BeautifulSoupClass:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    delattr(BeautifulSoupClass, 'findSelect')