SickGear/lib/html5lib/treewalkers/__init__.py

"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""

from __future__ import absolute_import, division, unicode_literals

__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]

from .. import constants
from ..utils import default_etree

treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Look up the TreeWalker class for one of the built-in tree types

    Args:
        treeType (str): name of the tree type wanted. It is lower-cased
            before use, so matching is case-insensitive. Recognised names:

            - "dom": The xml.dom.minidom DOM implementation
            - "etree": A generic walker for tree implementations exposing an
                       elementtree-like interface (known to work with
                       ElementTree, cElementTree and lxml.etree).
            - "lxml": Optimized walker for lxml.etree
            - "genshi": a Genshi stream

        implementation: A module implementing the tree type e.g.
            xml.etree.ElementTree or cElementTree. Only consulted for the
            "etree" tree type; ``default_etree`` is used when it is None.

        **kwargs: passed through to ``etree.getETreeModule``; only used for
            the "etree" tree type.

    Returns:
        The matching TreeWalker class, or None when ``treeType`` is not one
        of the recognised names.
    """

    treeType = treeType.lower()

    # Anything resolved on an earlier call is served straight from the cache.
    if treeType in treeWalkerCache:
        return treeWalkerCache[treeType]

    if treeType == "etree":
        from . import etree
        etreeModule = default_etree if implementation is None else implementation
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(etreeModule, **kwargs).TreeWalker

    # The submodules are imported lazily so that only the requested backend
    # is ever loaded.
    if treeType == "dom":
        from . import dom
        walkerClass = dom.TreeWalker
    elif treeType == "genshi":
        from . import genshistream
        walkerClass = genshistream.TreeWalker
    elif treeType == "lxml":
        from . import lxmletree
        walkerClass = lxmletree.TreeWalker
    else:
        # Unknown names are not an error here: the caller just gets None.
        return None

    treeWalkerCache[treeType] = walkerClass
    return walkerClass


def concatenateCharacterTokens(tokens):
    """Collapse runs of adjacent character tokens into single tokens

    Consecutive "Characters" and "SpaceCharacters" tokens are replaced by
    one "Characters" token whose data is the concatenation of theirs. Every
    other token is passed through unchanged, in its original position.

    Args:
        tokens: an iterable of token dicts, each with a "type" key;
            character tokens must also carry a "data" string.

    Yields:
        dict: the merged "Characters" tokens and the untouched other tokens.
    """
    buffered = []
    for tok in tokens:
        if tok["type"] not in ("Characters", "SpaceCharacters"):
            # A non-character token ends the current run: flush it first so
            # the output keeps document order.
            if buffered:
                yield {"type": "Characters", "data": "".join(buffered)}
                buffered = []
            yield tok
            continue
        buffered.append(tok["data"])

    # Flush a run that reaches the end of the stream.
    if buffered:
        yield {"type": "Characters", "data": "".join(buffered)}

def _attributeSortKey(item):
    """Return a sort key for one ``((namespace, localname), value)`` item

    Attribute namespaces may be None (no namespace) or a string. Python 3
    refuses to order None against a string, so the namespace is split into
    an "is namespaced" flag plus a string. Un-namespaced attributes sort
    first, which is also the order Python 2 gave for the raw tuples.
    """
    (namespace, localname), _ = item
    return (namespace is not None, namespace or "", localname)


def pprint(walker):
    """Pretty printer for tree walkers

    Renders a token stream as an indented text dump, one line per start tag,
    attribute, comment, doctype or text run. Adjacent character tokens are
    merged first via ``concatenateCharacterTokens``.

    Args:
        walker: an iterable of token dicts as produced by a tree walker.

    Returns:
        str: the rendered lines joined with newlines.

    Raises:
        ValueError: if a token has an unrecognised "type".
    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # tag name; elements in the HTML namespace are shown unprefixed
            tagNamespace = token["namespace"]
            if tagNamespace and tagNamespace != constants.namespaces["html"]:
                if tagNamespace in constants.prefixes:
                    ns = constants.prefixes[tagNamespace]
                else:
                    ns = tagNamespace
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering); the explicit key
            # keeps mixed None/str namespaces sortable on Python 3
            attrs = token["data"]
            for (attrNamespace, localname), value in sorted(attrs.items(),
                                                            key=_attributeSortKey):
                if attrNamespace:
                    if attrNamespace in constants.prefixes:
                        ns = constants.prefixes[attrNamespace]
                    else:
                        ns = attrNamespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: no EndTag will follow, so undo the indent now
            if tokenType == "EmptyTag":
                indent -= 2

        elif tokenType == "EndTag":
            indent -= 2

        elif tokenType == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif tokenType == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] or ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif tokenType == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif tokenType == "SpaceCharacters":
            # Defensive check only: the merge step above rewrites every
            # SpaceCharacters token into a Characters token.
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % tokenType)

    return "\n".join(output)
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`"""A collection of modules for iterating through different kinds of`
			`tree, generating tokens identical to those produced by the tokenizer`
			`module.`

			`To create a tree walker for a new type of tree, you need to do`
			`implement a tree walker object (called TreeWalker by convention) that`
			`implements a 'serialize' method taking a tree as sole argument and`
			`returning an iterator generating tokens.`
			`"""`

Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00			`from __future__ import absolute_import, division, unicode_literals`

Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00			`__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]`
Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00
Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00			`from .. import constants`
Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00			`from ..utils import default_etree`

Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`treeWalkerCache = {}`

Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`def getTreeWalker(treeType, implementation=None, **kwargs):`
			`"""Get a TreeWalker class for various types of tree with built-in support`

Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00			`Args:`
			`treeType (str): the name of the tree type required (case-insensitive).`
			`Supported values are:`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00
Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00			`- "dom": The xml.dom.minidom DOM implementation`
			`- "etree": A generic walker for tree implementations exposing an`
			`elementtree-like interface (known to work with`
			`ElementTree, cElementTree and lxml.etree).`
			`- "lxml": Optimized walker for lxml.etree`
			`- "genshi": a Genshi stream`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00
Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00			`Implementation: A module implementing the tree type e.g.`
			`xml.etree.ElementTree or cElementTree (Currently applies to the`
			`"etree" tree type only).`
			`"""`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00
			`treeType = treeType.lower()`
			`if treeType not in treeWalkerCache:`
Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00			`if treeType == "dom":`
			`from . import dom`
			`treeWalkerCache[treeType] = dom.TreeWalker`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`elif treeType == "genshi":`
Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00			`from . import genshistream`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`treeWalkerCache[treeType] = genshistream.TreeWalker`
			`elif treeType == "lxml":`
Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00			`from . import lxmletree`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`treeWalkerCache[treeType] = lxmletree.TreeWalker`
			`elif treeType == "etree":`
Further improved memory handling of bs4 for torrent providers. 2014-07-21 23:01:46 +00:00			`from . import etree`
			`if implementation is None:`
			`implementation = default_etree`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather it's series data from allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also this edition is based off the code we used in our XEM editon so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Please before using this with your existing database (sickbeard.db) please make a backup copy of it and delete any other database files such as cache.db and failed.db if present, we HIGHLY recommend starting out with no database files at all to make this a fresh start but the choice is at your own risk! Enjoy! 2014-03-10 05:18:05 +00:00			`# XXX: NEVER cache here, caching is done in the etree submodule`
			`return etree.getETreeModule(implementation, **kwargs).TreeWalker`
			`return treeWalkerCache.get(treeType)`
Update html5lib 0.999 to 0.99999999/1.0b9 (46dae3d). 2016-01-12 01:17:02 +00:00

			`def concatenateCharacterTokens(tokens):`
			`pendingCharacters = []`
			`for token in tokens:`
			`type = token["type"]`
			`if type in ("Characters", "SpaceCharacters"):`
			`pendingCharacters.append(token["data"])`
			`else:`
			`if pendingCharacters:`
			`yield {"type": "Characters", "data": "".join(pendingCharacters)}`
			`pendingCharacters = []`
			`yield token`
			`if pendingCharacters:`
			`yield {"type": "Characters", "data": "".join(pendingCharacters)}`


			`def pprint(walker):`
			`"""Pretty printer for tree walkers"""`
			`output = []`
			`indent = 0`
			`for token in concatenateCharacterTokens(walker):`
			`type = token["type"]`
			`if type in ("StartTag", "EmptyTag"):`
			`# tag name`
			`if token["namespace"] and token["namespace"] != constants.namespaces["html"]:`
			`if token["namespace"] in constants.prefixes:`
			`ns = constants.prefixes[token["namespace"]]`
			`else:`
			`ns = token["namespace"]`
			`name = "%s %s" % (ns, token["name"])`
			`else:`
			`name = token["name"]`
			`output.append("%s<%s>" % (" " * indent, name))`
			`indent += 2`
			`# attributes (sorted for consistent ordering)`
			`attrs = token["data"]`
			`for (namespace, localname), value in sorted(attrs.items()):`
			`if namespace:`
			`if namespace in constants.prefixes:`
			`ns = constants.prefixes[namespace]`
			`else:`
			`ns = namespace`
			`name = "%s %s" % (ns, localname)`
			`else:`
			`name = localname`
			`output.append("%s%s=\"%s\"" % (" " * indent, name, value))`
			`# self-closing`
			`if type == "EmptyTag":`
			`indent -= 2`

			`elif type == "EndTag":`
			`indent -= 2`

			`elif type == "Comment":`
			`output.append("%s<!-- %s -->" % (" " * indent, token["data"]))`

			`elif type == "Doctype":`
			`if token["name"]:`
			`if token["publicId"]:`
			`output.append("""%s<!DOCTYPE %s "%s" "%s">""" %`
			`(" " * indent,`
			`token["name"],`
			`token["publicId"],`
			`token["systemId"] if token["systemId"] else ""))`
			`elif token["systemId"]:`
			`output.append("""%s<!DOCTYPE %s "" "%s">""" %`
			`(" " * indent,`
			`token["name"],`
			`token["systemId"]))`
			`else:`
			`output.append("%s<!DOCTYPE %s>" % (" " * indent,`
			`token["name"]))`
			`else:`
			`output.append("%s<!DOCTYPE >" % (" " * indent,))`

			`elif type == "Characters":`
			`output.append("%s\"%s\"" % (" " * indent, token["data"]))`

			`elif type == "SpaceCharacters":`
			`assert False, "concatenateCharacterTokens should have got rid of all Space tokens"`

			`else:`
			`raise ValueError("Unknown token type, %s" % type)`

			`return "\n".join(output)`