from .dammit import EntitySubstitution class Formatter(EntitySubstitution): """Describes a strategy to use when outputting a parse tree to a string. Some parts of this strategy come from the distinction between HTML4, HTML5, and XML. Others are configurable by the user. Formatters are passed in as the `formatter` argument to methods like `PageElement.encode`. Most people won't need to think about formatters, and most people who need to think about them can pass in one of these predefined strings as `formatter` rather than making a new Formatter object: For HTML documents: * 'html' - HTML entity substitution for generic HTML documents. (default) * 'html5' - HTML entity substitution for HTML5 documents, as well as some optimizations in the way tags are rendered. * 'minimal' - Only make the substitutions necessary to guarantee valid HTML. * None - Do not perform any substitution. This will be faster but may result in invalid markup. For XML documents: * 'html' - Entity substitution for XHTML documents. * 'minimal' - Only make the substitutions necessary to guarantee valid XML. (default) * None - Do not perform any substitution. This will be faster but may result in invalid markup. """ # Registries of XML and HTML formatters. XML_FORMATTERS = {} HTML_FORMATTERS = {} HTML = 'html' XML = 'xml' HTML_DEFAULTS = dict( cdata_containing_tags=set(["script", "style"]), ) def _default(self, language, value, kwarg): if value is not None: return value if language == self.XML: return set() return self.HTML_DEFAULTS[kwarg] def __init__( self, language=None, entity_substitution=None, void_element_close_prefix='/', cdata_containing_tags=None, empty_attributes_are_booleans=False, indent=1, ): """Constructor. :param language: This should be Formatter.XML if you are formatting XML markup and Formatter.HTML if you are formatting HTML markup. :param entity_substitution: A function to call to replace special characters with XML/HTML entities. For examples, see bs4.dammit.EntitySubstitution.substitute_html and substitute_xml. :param void_element_close_prefix: By default, void elements are represented as <tag/> (XML rules) rather than <tag> (HTML rules). To get <tag>, pass in the empty string. :param cdata_containing_tags: The list of tags that are defined as containing CDATA in this dialect. For example, in HTML, <script> and <style> tags are defined as containing CDATA, and their contents should not be formatted. :param blank_attributes_are_booleans: Render attributes whose value is the empty string as HTML-style boolean attributes. (Attributes whose value is None are always rendered this way.) :param indent: If indent is a non-negative integer or string, then the contents of elements will be indented appropriately when pretty-printing. An indent level of 0, negative, or "" will only insert newlines. Using a positive integer indent indents that many spaces per level. If indent is a string (such as "\t"), that string is used to indent each level. The default behavior to indent one space per level. """ self.language = language self.entity_substitution = entity_substitution self.void_element_close_prefix = void_element_close_prefix self.cdata_containing_tags = self._default( language, cdata_containing_tags, 'cdata_containing_tags' ) self.empty_attributes_are_booleans=empty_attributes_are_booleans if indent is None: indent = 0 if isinstance(indent, int): if indent < 0: indent = 0 indent = ' ' * indent elif isinstance(indent, str): indent = indent else: indent = ' ' self.indent = indent def substitute(self, ns): """Process a string that needs to undergo entity substitution. This may be a string encountered in an attribute value or as text. :param ns: A string. :return: A string with certain characters replaced by named or numeric entities. """ if not self.entity_substitution: return ns from .element import NavigableString if (isinstance(ns, NavigableString) and ns.parent is not None and ns.parent.name in self.cdata_containing_tags): # Do nothing. return ns # Substitute. return self.entity_substitution(ns) def attribute_value(self, value): """Process the value of an attribute. :param ns: A string. :return: A string with certain characters replaced by named or numeric entities. """ return self.substitute(value) def attributes(self, tag): """Reorder a tag's attributes however you want. By default, attributes are sorted alphabetically. This makes behavior consistent between Python 2 and Python 3, and preserves backwards compatibility with older versions of Beautiful Soup. If `empty_boolean_attributes` is True, then attributes whose values are set to the empty string will be treated as boolean attributes. """ if tag.attrs is None: return [] return sorted( (k, (None if self.empty_attributes_are_booleans and v == '' else v)) for k, v in list(tag.attrs.items()) ) class HTMLFormatter(Formatter): """A generic Formatter for HTML.""" REGISTRY = {} def __init__(self, *args, **kwargs): super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) class XMLFormatter(Formatter): """A generic Formatter for XML.""" REGISTRY = {} def __init__(self, *args, **kwargs): super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) # Set up aliases for the default formatters. HTMLFormatter.REGISTRY['html'] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_html ) HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_html, void_element_close_prefix=None, empty_attributes_are_booleans=True, ) HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( entity_substitution=EntitySubstitution.substitute_xml ) HTMLFormatter.REGISTRY[None] = HTMLFormatter( entity_substitution=None ) XMLFormatter.REGISTRY["html"] = XMLFormatter( entity_substitution=EntitySubstitution.substitute_html ) XMLFormatter.REGISTRY["minimal"] = XMLFormatter( entity_substitution=EntitySubstitution.substitute_xml ) XMLFormatter.REGISTRY[None] = Formatter( Formatter(Formatter.XML, entity_substitution=None) )