
Commit 43411cb6 authored by SVN-Git Migration

Imported Upstream version 4.0.0~b3

Metadata-Version: 1.0
Name: beautifulsoup4
Version: 4.0.0b3
Summary: UNKNOWN
Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/
Author: Leonard Richardson
Author-email: leonardr@segfault.org
License: MIT
Download-URL: http://www.crummy.com/software/BeautifulSoup/download/4.x/
Description: Beautiful Soup sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.
Platform: UNKNOWN
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Topic :: Text Processing :: Markup :: HTML
Classifier: Topic :: Text Processing :: Markup :: XML
Classifier: Topic :: Text Processing :: Markup :: SGML
Classifier: Topic :: Software Development :: Libraries :: Python Modules
= Introduction =
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
<body>
<p>
Some
<b>
bad
<i>
HTML
</i>
</b>
</p>
</body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8"?>
<tag1>
Some
<tag2 />
bad
<tag3>
XML
</tag3>
</tag1>
= About Beautiful Soup 4 =
This is a nearly-complete rewrite that removes Beautiful Soup's custom
HTML parser in favor of a system that lets you write a little glue
code and plug in any HTML or XML parser you want.
Beautiful Soup 4.0 comes with glue code for four parsers:
* Python's standard HTMLParser (html.parser in Python 3)
* lxml's HTML and XML parsers
* html5lib's HTML parser
HTMLParser is the default, but I recommend you install one of the
other parsers, or you'll have problems handling real-world markup.
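For example, you can choose a parser by passing its feature name as the
second argument to the BeautifulSoup constructor. (A sketch; "lxml" and
"html5lib" will only work if you've installed those libraries.)
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML", "lxml")
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML", "html5lib")
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML", "html.parser")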
For complete documentation, see the Sphinx documentation in
docs/source. What follows is a summary of the changes from Beautiful
Soup 3.
== The module name has changed ==
Previously you imported the BeautifulSoup class from a module also
called BeautifulSoup. To save keystrokes and make it clear which
version of the API is in use, the module is now called 'bs4':
>>> from bs4 import BeautifulSoup
== It works with Python 3 ==
Beautiful Soup 3.1.0 worked with Python 3, but the parser it used was
so bad that it barely worked at all. Beautiful Soup 4 works with
Python 3, and since its parser is pluggable, you don't sacrifice
quality.
Special thanks to Thomas Kluyver and Ezio Melotti for getting Python 3
support to the finish line. Ezio Melotti is also to thank for greatly
improving the HTML parser that comes with Python 3.2.
== CDATA sections are normal text, if they're understood at all. ==
Currently, the lxml and html5lib HTML parsers ignore CDATA sections in
markup:
<p><![CDATA[foo]]></p> => <p></p>
A future version of html5lib will turn CDATA sections into text nodes,
but only within tags like <svg> and <math>:
<svg><![CDATA[foo]]></svg> => <svg>foo</svg>
The default XML parser (which uses lxml behind the scenes) turns CDATA
sections into ordinary text elements:
<p><![CDATA[foo]]></p> => <p>foo</p>
In theory it's possible to preserve the CDATA sections when using the
XML parser, but I don't see how to get it to work in practice.
== Miscellaneous other stuff ==
If the BeautifulSoup instance has .is_xml set to True, an appropriate
XML declaration will be emitted when the tree is transformed into a
string:
<?xml version="1.0" encoding="utf-8"?>
<markup>
...
</markup>
The ['lxml', 'xml'] tree builder sets .is_xml to True; the other tree
builders set it to False. If you want to parse XHTML with an HTML
parser, you can set it manually.
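For instance, here's a sketch of parsing XHTML with the default HTML
parser and then forcing XML output:
>>> soup = BeautifulSoup("<p>An XHTML fragment</p>")
>>> soup.is_xml = True   # an XML declaration will now be emitted on output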
= Running the unit tests =
Here's how to run the tests on Python 2.7:
$ cd bs4
$ python2.7 -m unittest discover -s bs4
Here's how to do it with Python 3.2:
$ ./convert-py3k
$ cd py3k/bs4
$ python3 -m unittest discover -s bs4
The script test-all-versions will run the tests twice, once on Python
2.7 and once on Python 3.
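# === bs4/builder/__init__.py (file boundary inferred from the commit contents) ===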
from collections import defaultdict
import re
import sys
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
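# Usage sketch (hypothetical builder classes, for illustration):
#
#   class FastBuilder(TreeBuilder):
#       features = [FAST, HTML]
#   class PermissiveBuilder(TreeBuilder):
#       features = [PERMISSIVE, HTML]
#
#   builder_registry.register(FastBuilder)
#   builder_registry.register(PermissiveBuilder)
#   builder_registry.lookup('html')          # -> PermissiveBuilder (newest)
#   builder_registry.lookup('fast', 'html')  # -> FastBuilder
#   builder_registry.lookup('strict')        # -> None (feature not offered)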
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
features = []
is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
pass
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])
    # Used by set_up_substitutions to detect the charset in a META tag.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
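    # Example: for content "text/html; charset=ISO-8859-1", group(1)
    # matches "; charset=" and group(3) matches "ISO-8859-1".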
def set_up_substitutions(self, tag):
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
if (http_equiv is not None
and content is not None
and http_equiv.lower() == 'content-type'):
# This is an interesting meta tag.
match = self.CHARSET_RE.search(content)
if match:
if (self.soup.declared_html_encoding is not None or
self.soup.original_encoding == self.soup.from_encoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
# document, or an encoding was specified
# explicitly and it worked. Rewrite the meta tag.
def rewrite(match):
return match.group(1) + "%SOUP-ENCODING%"
tag['content'] = self.CHARSET_RE.sub(rewrite, content)
return True
else:
# This is our first pass through the document.
# Go through it again with the encoding information.
new_charset = match.group(3)
if (new_charset is not None
and new_charset != self.soup.original_encoding):
self.soup.declared_html_encoding = new_charset
self.soup._feed(self.soup.declared_html_encoding)
                        # StopParsing is defined in the top-level bs4 module;
                        # imported lazily here to avoid a circular import.
                        from bs4 import StopParsing
                        raise StopParsing
pass
return False
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
# html5lib to take precedence over lxml, because it's more
# reliable. And we only want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
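# === bs4/builder/_html5lib.py (file boundary inferred from the commit contents) ===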
__all__ = [
'HTML5TreeBuilder',
]
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
import html5lib
from html5lib.constants import DataLossWarning, namespaces
import warnings
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
return markup, None, None
# These methods are defined by Beautiful Soup.
def feed(self, markup):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
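            # html5lib reports charEncoding as a (name, certainty) pair;
            # [0] keeps just the encoding name.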
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
if namespaceHTMLElements:
warnings.warn("namespaceHTMLElements not supported yet",
DataLossWarning)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
if namespace is not None:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
        # BeautifulSoup is defined in the top-level bs4 module; imported
        # lazily here to avoid a circular import at module load time.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element)
    def testSerializer(self, element):
        # Delegates to a module-level testSerializer helper (only used by
        # html5lib's test suite; not defined in this file).
        return testSerializer(element)
def getDocument(self):
return self.soup
def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node):
def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
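        # (NavigableStrings compare equal by text, so list.index() could
        # match the wrong occurrence of a repeated string; comparing id()s
        # pins down the exact node.)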
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
newStr = NavigableString(self.element.contents[-1]+node.element)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else:
self.element.insert(len(self.element.contents), node.element)
node.parent = self
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and attributes != {}:
for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
# The Tag constructor calls this method automatically,
# but html5lib creates a Tag object before setting up
# the attributes.
self.element.contains_substitutions = (
self.soup.builder.set_up_substitutions(
self.element))
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(NavigableString(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
newStr = NavigableString(self.element.contents[index-1]+node.element)
oldNode = self.element.contents[index-1]
del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract()
node.parent = None
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self):
node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
        if self.namespace is None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError
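# === bs4/builder/_htmlparser.py (file boundary inferred from the commit contents) ===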
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
try:
from html.parser import HTMLParser
CONSTRUCTOR_TAKES_STRICT = True
except ImportError:
from HTMLParser import HTMLParser
CONSTRUCTOR_TAKES_STRICT = False
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class HTMLParserTreeBuilder(HTMLParser, HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
kwargs['strict'] = True
        super(HTMLParserTreeBuilder, self).__init__(*args, **kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 3-tuple (markup, original encoding, encoding
declared within markup).
"""
if isinstance(markup, unicode):
return markup, None, None
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding)
def feed(self, markup):
super(HTMLParserTreeBuilder, self).feed(markup)
def handle_starttag(self, name, attrs):
self.soup.handle_starttag(name, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
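        # e.g. "&#x41;" reaches this method with name "x41" and becomes
        # u"A"; "&#65;" arrives as "65" and also yields u"A".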
if name.startswith('x'):
data = unichr(int(name.lstrip('x'), 16))
else:
data = unichr(int(name))
self.handle_data(data)
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)