Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72).

JackDandy committed 2017-01-27 14:24:24 +00:00
parent d60ef4dc44
commit 69a589a54c
34 changed files with 1684 additions and 1283 deletions

View file

@ -13,6 +13,7 @@
* Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776) * Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776)
* Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30) * Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30)
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb) * Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
[develop changelog] [develop changelog]

View file

@ -22,4 +22,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"] "getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this # this has to be at the top level, see how setup.py parses this
__version__ = "0.99999999-dev" __version__ = "0.9999999999-dev"

View file

@ -175,9 +175,9 @@ def escapeRegexp(string):
return string return string
# output from the above # output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') nonXmlNameBMPRegexp = 
re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things # Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
@ -186,7 +186,7 @@ nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
class InfosetFilter(object): class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}") replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars=None, def __init__(self,
dropXmlnsLocalName=False, dropXmlnsLocalName=False,
dropXmlnsAttrNs=False, dropXmlnsAttrNs=False,
preventDoubleDashComments=False, preventDoubleDashComments=False,
@ -217,7 +217,7 @@ class InfosetFilter(object):
else: else:
return self.toXmlName(name) return self.toXmlName(name)
def coerceElement(self, name, namespace=None): def coerceElement(self, name):
return self.toXmlName(name) return self.toXmlName(name)
def coerceComment(self, data): def coerceComment(self, data):
@ -232,7 +232,7 @@ class InfosetFilter(object):
def coerceCharacters(self, data): def coerceCharacters(self, data):
if self.replaceFormFeedCharacters: if self.replaceFormFeedCharacters:
for i in range(data.count("\x0C")): for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning) warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ") data = data.replace("\x0C", " ")
# Other non-xml characters # Other non-xml characters

View file

@ -1,14 +1,16 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type from six import text_type, binary_type
from six.moves import http_client, urllib from six.moves import http_client, urllib
import codecs import codecs
import re import re
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException from .constants import ReparseException
from . import utils from . import _utils
from io import StringIO from io import StringIO
@ -17,12 +19,6 @@ try:
except ImportError: except ImportError:
BytesIO = StringIO BytesIO = StringIO
try:
from io import BufferedIOBase
except ImportError:
class BufferedIOBase(object):
pass
# Non-unicode versions of constants for use in the pre-parser # Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters]) spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@ -30,15 +26,17 @@ asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if utils.supports_lone_surrogates: if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with # Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal # eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone # unicode literal on platforms not supporting such lone
# surrogates. # surrogates.
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
eval('"\\uD800-\\uDFFF"')) invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else: else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
@ -130,7 +128,7 @@ class BufferedStream(object):
return b"".join(rv) return b"".join(rv)
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection. # Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007 # http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or if (isinstance(source, http_client.HTTPResponse) or
@ -144,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type) isUnicode = isinstance(source, text_type)
if isUnicode: if isUnicode:
if encoding is not None: encodings = [x for x in kwargs if x.endswith("_encoding")]
raise TypeError("Cannot explicitly set an encoding with a unicode string") if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source) return HTMLUnicodeInputStream(source, **kwargs)
else: else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet) return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object): class HTMLUnicodeInputStream(object):
@ -175,27 +174,21 @@ class HTMLUnicodeInputStream(object):
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
parseMeta - Look for a <meta> element containing encoding information
""" """
if not utils.supports_lone_surrogates: if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such # Such platforms will have already checked for such
# surrogate errors, so no need to do this checking. # surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1: elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4 self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else: else:
self.reportCharacterErrors = self.characterErrorsUCS2 self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
# List of where new lines occur # List of where new lines occur
self.newLines = [0] self.newLines = [0]
self.charEncoding = ("utf-8", "certain") self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source) self.dataStream = self.openStream(source)
self.reset() self.reset()
@ -288,10 +281,7 @@ class HTMLUnicodeInputStream(object):
if self.reportCharacterErrors: if self.reportCharacterErrors:
self.reportCharacterErrors(data) self.reportCharacterErrors(data)
# Replace invalid characters # Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
data = data.replace("\r\n", "\n") data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n") data = data.replace("\r", "\n")
@ -301,7 +291,7 @@ class HTMLUnicodeInputStream(object):
return True return True
def characterErrorsUCS4(self, data): def characterErrorsUCS4(self, data):
for i in range(len(invalid_unicode_re.findall(data))): for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint") self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data): def characterErrorsUCS2(self, data):
@ -314,9 +304,9 @@ class HTMLUnicodeInputStream(object):
codepoint = ord(match.group()) codepoint = ord(match.group())
pos = match.start() pos = match.start()
# Pretty sure there should be endianness issues here # Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos + 2]): if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair! # We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2]) char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints: if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint") self.errors.append("invalid-codepoint")
skip = True skip = True
@ -399,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
""" """
def __init__(self, source, encoding=None, parseMeta=True, chardet=True): def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream. """Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -412,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
parseMeta - Look for a <meta> element containing encoding information
""" """
# Raw Stream - for unicode objects this will encode to utf-8 and set # Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate # self.charEncoding as appropriate
@ -421,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
HTMLUnicodeInputStream.__init__(self, self.rawStream) HTMLUnicodeInputStream.__init__(self, self.rawStream)
self.charEncoding = (codecName(encoding), "certain")
# Encoding Information # Encoding Information
# Number of bytes to use when looking for a meta element with # Number of bytes to use when looking for a meta element with
# encoding information # encoding information
self.numBytesMeta = 512 self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet # Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100 self.numBytesChardet = 100
# Encoding to use if no other information can be found # Things from args
self.defaultEncoding = "windows-1252" self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Detect encoding iff no explicit "transport level" encoding is supplied # Determine encoding
if (self.charEncoding[0] is None): self.charEncoding = self.determineEncoding(useChardet)
self.charEncoding = self.detectEncoding(parseMeta, chardet) assert self.charEncoding[0] is not None
# Call superclass # Call superclass
self.reset() self.reset()
def reset(self): def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
'replace')
HTMLUnicodeInputStream.reset(self) HTMLUnicodeInputStream.reset(self)
def openStream(self, source): def openStream(self, source):
@ -458,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
try: try:
stream.seek(stream.tell()) stream.seek(stream.tell())
except: except: # pylint:disable=bare-except
stream = BufferedStream(stream) stream = BufferedStream(stream)
return stream return stream
def detectEncoding(self, parseMeta=True, chardet=True): def determineEncoding(self, chardet=True):
# First look for a BOM # BOMs take precedence over everything
# This will also read past the BOM if present # This will also read past the BOM if present
encoding = self.detectBOM() charEncoding = self.detectBOM(), "certain"
confidence = "certain" if charEncoding[0] is not None:
# If there is no BOM need to look for meta elements with encoding return charEncoding
# information
if encoding is None and parseMeta: # If we've been overriden, we've been overriden
encoding = self.detectEncodingMeta() charEncoding = lookupEncoding(self.override_encoding), "certain"
confidence = "tentative" if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available # Guess with chardet, if available
if encoding is None and chardet: if chardet:
confidence = "tentative"
try: try:
try: from chardet.universaldetector import UniversalDetector
from charade.universaldetector import UniversalDetector except ImportError:
except ImportError: pass
from chardet.universaldetector import UniversalDetector else:
buffers = [] buffers = []
detector = UniversalDetector() detector = UniversalDetector()
while not detector.done: while not detector.done:
@ -491,36 +503,33 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
buffers.append(buffer) buffers.append(buffer)
detector.feed(buffer) detector.feed(buffer)
detector.close() detector.close()
encoding = detector.result['encoding'] encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0) self.rawStream.seek(0)
except ImportError: if encoding is not None:
pass return encoding, "tentative"
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = self.defaultEncoding
# Substitute for equivalent encodings: # Try the default encoding
encodingSub = {"iso-8859-1": "windows-1252"} charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
if encoding.lower() in encodingSub: # Fallback to html5lib's default if even that hasn't worked
encoding = encodingSub[encoding.lower()] return lookupEncoding("windows-1252"), "tentative"
return encoding, confidence
def changeEncoding(self, newEncoding): def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain" assert self.charEncoding[1] != "certain"
newEncoding = codecName(newEncoding) newEncoding = lookupEncoding(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None: if newEncoding is None:
return return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]: elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain") self.charEncoding = (self.charEncoding[0], "certain")
else: else:
self.rawStream.seek(0) self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain") self.charEncoding = (newEncoding, "certain")
self.reset()
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding)) raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self): def detectBOM(self):
@ -529,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
encoding otherwise return None""" encoding otherwise return None"""
bomDict = { bomDict = {
codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be' codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
} }
# Go to beginning of file and read in 4 bytes # Go to beginning of file and read in 4 bytes
@ -550,9 +559,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
self.rawStream.seek(encoding and seek or 0) if encoding:
self.rawStream.seek(seek)
return encoding return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self): def detectEncodingMeta(self):
"""Report the encoding declared by the meta element """Report the encoding declared by the meta element
@ -563,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
self.rawStream.seek(0) self.rawStream.seek(0)
encoding = parser.getEncoding() encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"): if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = "utf-8" encoding = lookupEncoding("utf-8")
return encoding return encoding
@ -578,6 +590,7 @@ class EncodingBytes(bytes):
return bytes.__new__(self, value.lower()) return bytes.__new__(self, value.lower())
def __init__(self, value): def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1 self._position = -1
def __iter__(self): def __iter__(self):
@ -688,7 +701,7 @@ class EncodingParser(object):
(b"<!", self.handleOther), (b"<!", self.handleOther),
(b"<?", self.handleOther), (b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag)) (b"<", self.handlePossibleStartTag))
for byte in self.data: for _ in self.data:
keepParsing = True keepParsing = True
for key, method in methodDispatch: for key, method in methodDispatch:
if self.data.matchBytes(key): if self.data.matchBytes(key):
@ -727,7 +740,7 @@ class EncodingParser(object):
return False return False
elif attr[0] == b"charset": elif attr[0] == b"charset":
tentativeEncoding = attr[1] tentativeEncoding = attr[1]
codec = codecName(tentativeEncoding) codec = lookupEncoding(tentativeEncoding)
if codec is not None: if codec is not None:
self.encoding = codec self.encoding = codec
return False return False
@ -735,7 +748,7 @@ class EncodingParser(object):
contentParser = ContentAttrParser(EncodingBytes(attr[1])) contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse() tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None: if tentativeEncoding is not None:
codec = codecName(tentativeEncoding) codec = lookupEncoding(tentativeEncoding)
if codec is not None: if codec is not None:
if hasPragma: if hasPragma:
self.encoding = codec self.encoding = codec
@ -892,16 +905,19 @@ class ContentAttrParser(object):
return None return None
def codecName(encoding): def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the """Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding.""" string doesn't correspond to a valid encoding."""
if isinstance(encoding, bytes): if isinstance(encoding, binary_type):
try: try:
encoding = encoding.decode("ascii") encoding = encoding.decode("ascii")
except UnicodeDecodeError: except UnicodeDecodeError:
return None return None
if encoding:
canonicalName = ascii_punctuation_re.sub("", encoding).lower() if encoding is not None:
return encodings.get(canonicalName, None) try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else: else:
return None return None
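For orientation on the hunks above: encoding selection now runs entirely off keyword arguments and webencodings lookups (BOM, then override_encoding, transport_encoding, the <meta> prescan, same_origin_parent_encoding, likely_encoding, chardet, default_encoding), replacing the old encoding/parseMeta/chardet parameters. A minimal sketch, assuming the vendored module is importable as html5lib._inputstream (the SickGear call sites are not part of this diff):

    from html5lib._inputstream import HTMLInputStream, lookupEncoding

    # Unicode input: passing any *_encoding keyword now raises TypeError.
    uni_stream = HTMLInputStream("<p>already decoded</p>")

    # Byte input: transport_encoding outranks the <meta> prescan and chardet.
    byte_stream = HTMLInputStream(b"<meta charset=utf-8>caf\xe9",
                                  transport_encoding="iso-8859-1",
                                  useChardet=False)
    encoding, confidence = byte_stream.charEncoding
    print(encoding.name, confidence)        # windows-1252 certain

    # lookupEncoding() resolves WHATWG labels via webencodings.
    assert lookupEncoding("latin1").name == "windows-1252"
    assert lookupEncoding("bogus-encoding") is None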

View file

@ -1,9 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
try: from six import unichr as chr
chr = unichr # flake8: noqa
except NameError:
pass
from collections import deque from collections import deque
@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters from .constants import replacementCharacters
from .inputstream import HTMLInputStream from ._inputstream import HTMLInputStream
from .trie import Trie from ._trie import Trie
entitiesTrie = Trie(entities) entitiesTrie = Trie(entities)
@ -34,16 +31,11 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object. Points to HTMLInputStream object.
""" """
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, def __init__(self, stream, parser=None, **kwargs):
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser self.parser = parser
# Perform case conversions?
self.lowercaseElementName = lowercaseElementName
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state # Setup the initial tokenizer state
self.escapeFlag = False self.escapeFlag = False
self.lastFourChars = [] self.lastFourChars = []
@ -147,8 +139,8 @@ class HTMLTokenizer(object):
output = "&" output = "&"
charStack = [self.stream.char()] charStack = [self.stream.char()]
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
or (allowedChar is not None and allowedChar == charStack[0])): (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0]) self.stream.unget(charStack[0])
elif charStack[0] == "#": elif charStack[0] == "#":
@ -235,8 +227,7 @@ class HTMLTokenizer(object):
token = self.currentToken token = self.currentToken
# Add token to the queue to be yielded # Add token to the queue to be yielded
if (token["type"] in tagTokenTypes): if (token["type"] in tagTokenTypes):
if self.lowercaseElementName: token["name"] = token["name"].translate(asciiUpper2Lower)
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["EndTag"]: if token["type"] == tokenTypes["EndTag"]:
if token["data"]: if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"], self.tokenQueue.append({"type": tokenTypes["ParseError"],
@ -921,10 +912,9 @@ class HTMLTokenizer(object):
# Attributes are not dropped at this stage. That happens when the # Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended # start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time. # to attributes, but we do want to report the parse error in time.
if self.lowercaseAttrName: self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]:
for name, value in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name: if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"duplicate-attribute"}) "duplicate-attribute"})
@ -1716,11 +1706,11 @@ class HTMLTokenizer(object):
else: else:
data.append(char) data.append(char)
data = "".join(data) data = "".join(data) # pylint:disable=redefined-variable-type
# Deal with null here rather than in the parser # Deal with null here rather than in the parser
nullCount = data.count("\u0000") nullCount = data.count("\u0000")
if nullCount > 0: if nullCount > 0:
for i in range(nullCount): for _ in range(nullCount):
self.tokenQueue.append({"type": tokenTypes["ParseError"], self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data": "invalid-codepoint"}) "data": "invalid-codepoint"})
data = data.replace("\u0000", "\uFFFD") data = data.replace("\u0000", "\uFFFD")
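In short, the tokenizer above no longer accepts encoding/parseMeta/useChardet/lowercaseElementName/lowercaseAttrName itself: tag and attribute names are always lower-cased, and any remaining keyword arguments are passed straight through to HTMLInputStream. A minimal sketch, assuming the vendored package imports as html5lib:

    from html5lib._tokenizer import HTMLTokenizer

    # Stream keywords (useChardet, the new *_encoding options, ...) travel
    # via **kwargs to HTMLInputStream; names always come out lower-cased.
    for token in HTMLTokenizer(b"<DIV CLASS=x>hi</DIV>", useChardet=False):
        if "name" in token:
            print(token["name"])        # prints "div" twice, never "DIV"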

View file

@ -4,9 +4,11 @@ from .py import Trie as PyTrie
Trie = PyTrie Trie = PyTrie
# pylint:disable=wrong-import-position
try: try:
from .datrie import Trie as DATrie from .datrie import Trie as DATrie
except ImportError: except ImportError:
pass pass
else: else:
Trie = DATrie Trie = DATrie
# pylint:enable=wrong-import-position

View file

@ -7,7 +7,8 @@ class Trie(Mapping):
"""Abstract base class for tries""" """Abstract base class for tries"""
def keys(self, prefix=None): def keys(self, prefix=None):
keys = super().keys() # pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None: if prefix is None:
return set(keys) return set(keys)

View file

@ -1,5 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
import sys
from types import ModuleType from types import ModuleType
from six import text_type from six import text_type
@ -12,9 +13,11 @@ except ImportError:
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory", "surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"] "supports_lone_surrogates", "PY27"]
PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform # caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as # using UTF-16 as its encoding of unicode strings, such as
@ -22,12 +25,12 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
# surrogates, and there is no mechanism to further escape such # surrogates, and there is no mechanism to further escape such
# escapes. # escapes.
try: try:
_x = eval('"\\uD800"') _x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type): if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039 # We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') _x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type) assert isinstance(_x, text_type)
except: except: # pylint:disable=bare-except
supports_lone_surrogates = False supports_lone_surrogates = False
else: else:
supports_lone_surrogates = True supports_lone_surrogates = True
@ -52,12 +55,13 @@ class MethodDispatcher(dict):
# anything here. # anything here.
_dictEntries = [] _dictEntries = []
for name, value in items: for name, value in items:
if type(name) in (list, tuple, frozenset, set): if isinstance(name, (list, tuple, frozenset, set)):
for item in name: for item in name:
_dictEntries.append((item, value)) _dictEntries.append((item, value))
else: else:
_dictEntries.append((name, value)) _dictEntries.append((name, value))
dict.__init__(self, _dictEntries) dict.__init__(self, _dictEntries)
assert len(self) == len(_dictEntries)
self.default = None self.default = None
def __getitem__(self, key): def __getitem__(self, key):
@ -109,3 +113,15 @@ def moduleFactoryFactory(factory):
return mod return mod
return moduleFactory return moduleFactory
def memoize(func):
cache = {}
def wrapped(*args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
if key not in cache:
cache[key] = func(*args, **kwargs)
return cache[key]
return wrapped
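The memoize helper added above is a plain dict cache keyed on (args, tuple(kwargs.items())), so arguments must be hashable and keyword order is significant. A minimal usage sketch (scale() is a hypothetical function; only memoize comes from the hunk above):

    from html5lib._utils import memoize   # vendored path may differ

    @memoize
    def scale(x, factor=1):
        print("computed")                 # printed only on a cache miss
        return x * factor

    scale(3)             # miss: prints "computed", returns 3
    scale(3)             # hit: returns the cached 3
    scale(3, factor=2)   # new key, computed again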

View file

@ -283,6 +283,12 @@ E = {
"Element %(name)s not allowed in a non-html context", "Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html": "unexpected-end-tag-before-html":
"Unexpected end tag (%(name)s) before html.", "Unexpected end tag (%(name)s) before html.",
"unexpected-inhead-noscript-tag":
"Element %(name)s not allowed in a inhead-noscript context",
"eof-in-head-noscript":
"Unexpected end of file. Expected inhead-noscript content",
"char-in-head-noscript":
"Unexpected non-space character. Expected inhead-noscript content",
"XXX-undefined-error": "XXX-undefined-error":
"Undefined error (this sucks and should be fixed)", "Undefined error (this sucks and should be fixed)",
} }
@ -431,6 +437,73 @@ mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mtext") (namespaces["mathml"], "mtext")
]) ])
adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
adjustForeignAttributes = { adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
@ -2813,7 +2886,6 @@ replacementCharacters = {
0x0d: "\u000D", 0x0d: "\u000D",
0x80: "\u20AC", 0x80: "\u20AC",
0x81: "\u0081", 0x81: "\u0081",
0x81: "\u0081",
0x82: "\u201A", 0x82: "\u201A",
0x83: "\u0192", 0x83: "\u0192",
0x84: "\u201E", 0x84: "\u201E",
@ -2846,235 +2918,6 @@ replacementCharacters = {
0x9F: "\u0178", 0x9F: "\u0178",
} }
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = { tokenTypes = {
"Doctype": 0, "Doctype": 0,
"Characters": 1, "Characters": 1,

View file

@ -1,6 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from . import base
try: try:
from collections import OrderedDict from collections import OrderedDict
@ -8,9 +8,9 @@ except ImportError:
from ordereddict import OrderedDict from ordereddict import OrderedDict
class Filter(_base.Filter): class Filter(base.Filter):
def __iter__(self): def __iter__(self):
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"): if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict() attrs = OrderedDict()
for name, value in sorted(token["data"].items(), for name, value in sorted(token["data"].items(),

View file

@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from . import base
class Filter(_base.Filter): class Filter(base.Filter):
def __init__(self, source, encoding): def __init__(self, source, encoding):
_base.Filter.__init__(self, source) base.Filter.__init__(self, source)
self.encoding = encoding self.encoding = encoding
def __iter__(self): def __iter__(self):
@ -13,7 +13,7 @@ class Filter(_base.Filter):
meta_found = (self.encoding is None) meta_found = (self.encoding is None)
pending = [] pending = []
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type == "StartTag": if type == "StartTag":
if token["name"].lower() == "head": if token["name"].lower() == "head":

View file

@ -1,90 +1,81 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from six import text_type
from ..constants import cdataElements, rcdataElements, voidElements
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
class LintError(Exception): class Filter(base.Filter):
pass def __init__(self, source, require_matching_tags=True):
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags
class Filter(_base.Filter):
def __iter__(self): def __iter__(self):
open_elements = [] open_elements = []
contentModelFlag = "PCDATA" for token in base.Filter.__iter__(self):
for token in _base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type in ("StartTag", "EmptyTag"): if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": assert namespace is None or isinstance(namespace, text_type)
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) assert namespace != ""
if not isinstance(name, str): assert isinstance(name, text_type)
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) assert name != ""
if not name: assert isinstance(token["data"], dict)
raise LintError("Empty tag name") if (not namespace or namespace == namespaces["html"]) and name in voidElements:
if type == "StartTag" and name in voidElements: assert type == "EmptyTag"
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) else:
elif type == "EmptyTag" and name not in voidElements: assert type == "StartTag"
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag" and self.require_matching_tags:
if type == "StartTag": open_elements.append((namespace, name))
open_elements.append(name) for (namespace, name), value in token["data"].items():
for name, value in token["data"]: assert namespace is None or isinstance(namespace, text_type)
if not isinstance(name, str): assert namespace != ""
raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) assert isinstance(name, text_type)
if not name: assert name != ""
raise LintError("Empty attribute name") assert isinstance(value, text_type)
if not isinstance(value, str):
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
elif type == "EndTag": elif type == "EndTag":
namespace = token["namespace"]
name = token["name"] name = token["name"]
if not isinstance(name, str): assert namespace is None or isinstance(namespace, text_type)
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) assert namespace != ""
if not name: assert isinstance(name, text_type)
raise LintError("Empty tag name") assert name != ""
if name in voidElements: if (not namespace or namespace == namespaces["html"]) and name in voidElements:
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
start_name = open_elements.pop() elif self.require_matching_tags:
if start_name != name: start = open_elements.pop()
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) assert start == (namespace, name)
contentModelFlag = "PCDATA"
elif type == "Comment": elif type == "Comment":
if contentModelFlag != "PCDATA": data = token["data"]
raise LintError("Comment not in PCDATA content model flag") assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"): elif type in ("Characters", "SpaceCharacters"):
data = token["data"] data = token["data"]
if not isinstance(data, str): assert isinstance(data, text_type)
raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) assert data != ""
if not data:
raise LintError("%(type)s token with empty data" % {"type": type})
if type == "SpaceCharacters": if type == "SpaceCharacters":
data = data.strip(spaceCharacters) assert data.strip(spaceCharacters) == ""
if data:
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
elif type == "Doctype": elif type == "Doctype":
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": assert name is None or isinstance(name, text_type)
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) assert token["publicId"] is None or isinstance(name, text_type)
if not isinstance(name, str): assert token["systemId"] is None or isinstance(name, text_type)
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"): elif type == "Entity":
pass assert isinstance(token["name"], text_type)
elif type == "SerializerError":
assert isinstance(token["data"], text_type)
else: else:
raise LintError("Unknown token type: %(type)s" % {"type": type}) assert False, "Unknown token type: %(type)s" % {"type": type}
yield token yield token
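The rewritten lint filter above replaces the old LintError exceptions with plain assertions over namespaced tokens, and start/end tag matching is now opt-out via require_matching_tags. A minimal sketch of wiring it into a tree-walker pipeline, assuming the vendored package imports as html5lib:

    import html5lib
    from html5lib.filters.lint import Filter as LintFilter

    tree = html5lib.parse("<p>hello <b>there")
    walker = html5lib.getTreeWalker("etree")

    # Iterating re-yields every token; an AssertionError is raised if a
    # token breaks an invariant (non-text names, void-element end tags,
    # mismatched start/end tags, ...).
    for token in LintFilter(walker(tree), require_matching_tags=True):
        pass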

View file

@ -1,9 +1,9 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from . import base
class Filter(_base.Filter): class Filter(base.Filter):
def slider(self): def slider(self):
previous1 = previous2 = None previous1 = previous2 = None
for token in self.source: for token in self.source:
@ -11,7 +11,8 @@ class Filter(_base.Filter):
yield previous2, previous1, token yield previous2, previous1, token
previous2 = previous1 previous2 = previous1
previous1 = token previous1 = token
yield previous2, previous1, None if previous1 is not None:
yield previous2, previous1, None
def __iter__(self): def __iter__(self):
for previous, token, next in self.slider(): for previous, token, next in self.slider():

View file

@ -1,12 +1,865 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base import re
from ..sanitizer import HTMLSanitizerMixin from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
class Filter(_base.Filter, HTMLSanitizerMixin): allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
(None, 'accept-charset'),
(None, 'accesskey'),
(None, 'action'),
(None, 'align'),
(None, 'alt'),
(None, 'autocomplete'),
(None, 'autofocus'),
(None, 'axis'),
(None, 'background'),
(None, 'balance'),
(None, 'bgcolor'),
(None, 'bgproperties'),
(None, 'border'),
(None, 'bordercolor'),
(None, 'bordercolordark'),
(None, 'bordercolorlight'),
(None, 'bottompadding'),
(None, 'cellpadding'),
(None, 'cellspacing'),
(None, 'ch'),
(None, 'challenge'),
(None, 'char'),
(None, 'charoff'),
(None, 'choff'),
(None, 'charset'),
(None, 'checked'),
(None, 'cite'),
(None, 'class'),
(None, 'clear'),
(None, 'color'),
(None, 'cols'),
(None, 'colspan'),
(None, 'compact'),
(None, 'contenteditable'),
(None, 'controls'),
(None, 'coords'),
(None, 'data'),
(None, 'datafld'),
(None, 'datapagesize'),
(None, 'datasrc'),
(None, 'datetime'),
(None, 'default'),
(None, 'delay'),
(None, 'dir'),
(None, 'disabled'),
(None, 'draggable'),
(None, 'dynsrc'),
(None, 'enctype'),
(None, 'end'),
(None, 'face'),
(None, 'for'),
(None, 'form'),
(None, 'frame'),
(None, 'galleryimg'),
(None, 'gutter'),
(None, 'headers'),
(None, 'height'),
(None, 'hidefocus'),
(None, 'hidden'),
(None, 'high'),
(None, 'href'),
(None, 'hreflang'),
(None, 'hspace'),
(None, 'icon'),
(None, 'id'),
(None, 'inputmode'),
(None, 'ismap'),
(None, 'keytype'),
(None, 'label'),
(None, 'leftspacing'),
(None, 'lang'),
(None, 'list'),
(None, 'longdesc'),
(None, 'loop'),
(None, 'loopcount'),
(None, 'loopend'),
(None, 'loopstart'),
(None, 'low'),
(None, 'lowsrc'),
(None, 'max'),
(None, 'maxlength'),
(None, 'media'),
(None, 'method'),
(None, 'min'),
(None, 'multiple'),
(None, 'name'),
(None, 'nohref'),
(None, 'noshade'),
(None, 'nowrap'),
(None, 'open'),
(None, 'optimum'),
(None, 'pattern'),
(None, 'ping'),
(None, 'point-size'),
(None, 'poster'),
(None, 'pqg'),
(None, 'preload'),
(None, 'prompt'),
(None, 'radiogroup'),
(None, 'readonly'),
(None, 'rel'),
(None, 'repeat-max'),
(None, 'repeat-min'),
(None, 'replace'),
(None, 'required'),
(None, 'rev'),
(None, 'rightspacing'),
(None, 'rows'),
(None, 'rowspan'),
(None, 'rules'),
(None, 'scope'),
(None, 'selected'),
(None, 'shape'),
(None, 'size'),
(None, 'span'),
(None, 'src'),
(None, 'start'),
(None, 'step'),
(None, 'style'),
(None, 'summary'),
(None, 'suppress'),
(None, 'tabindex'),
(None, 'target'),
(None, 'template'),
(None, 'title'),
(None, 'toppadding'),
(None, 'type'),
(None, 'unselectable'),
(None, 'usemap'),
(None, 'urn'),
(None, 'valign'),
(None, 'value'),
(None, 'variable'),
(None, 'volume'),
(None, 'vspace'),
(None, 'vrml'),
(None, 'width'),
(None, 'wrap'),
(namespaces['xml'], 'lang'),
# MathML attributes
(None, 'actiontype'),
(None, 'align'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnlines'),
(None, 'columnspacing'),
(None, 'columnspan'),
(None, 'depth'),
(None, 'display'),
(None, 'displaystyle'),
(None, 'equalcolumns'),
(None, 'equalrows'),
(None, 'fence'),
(None, 'fontstyle'),
(None, 'fontweight'),
(None, 'frame'),
(None, 'height'),
(None, 'linethickness'),
(None, 'lspace'),
(None, 'mathbackground'),
(None, 'mathcolor'),
(None, 'mathvariant'),
(None, 'mathvariant'),
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowlines'),
(None, 'rowspacing'),
(None, 'rowspan'),
(None, 'rspace'),
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'type'),
# SVG attributes
(None, 'accent-height'),
(None, 'accumulate'),
(None, 'additive'),
(None, 'alphabetic'),
(None, 'arabic-form'),
(None, 'ascent'),
(None, 'attributeName'),
(None, 'attributeType'),
(None, 'baseProfile'),
(None, 'bbox'),
(None, 'begin'),
(None, 'by'),
(None, 'calcMode'),
(None, 'cap-height'),
(None, 'class'),
(None, 'clip-path'),
(None, 'color'),
(None, 'color-rendering'),
(None, 'content'),
(None, 'cx'),
(None, 'cy'),
(None, 'd'),
(None, 'dx'),
(None, 'dy'),
(None, 'descent'),
(None, 'display'),
(None, 'dur'),
(None, 'end'),
(None, 'fill'),
(None, 'fill-opacity'),
(None, 'fill-rule'),
(None, 'font-family'),
(None, 'font-size'),
(None, 'font-stretch'),
(None, 'font-style'),
(None, 'font-variant'),
(None, 'font-weight'),
(None, 'from'),
(None, 'fx'),
(None, 'fy'),
(None, 'g1'),
(None, 'g2'),
(None, 'glyph-name'),
(None, 'gradientUnits'),
(None, 'hanging'),
(None, 'height'),
(None, 'horiz-adv-x'),
(None, 'horiz-origin-x'),
(None, 'id'),
(None, 'ideographic'),
(None, 'k'),
(None, 'keyPoints'),
(None, 'keySplines'),
(None, 'keyTimes'),
(None, 'lang'),
(None, 'marker-end'),
(None, 'marker-mid'),
(None, 'marker-start'),
(None, 'markerHeight'),
(None, 'markerUnits'),
(None, 'markerWidth'),
(None, 'mathematical'),
(None, 'max'),
(None, 'min'),
(None, 'name'),
(None, 'offset'),
(None, 'opacity'),
(None, 'orient'),
(None, 'origin'),
(None, 'overline-position'),
(None, 'overline-thickness'),
(None, 'panose-1'),
(None, 'path'),
(None, 'pathLength'),
(None, 'points'),
(None, 'preserveAspectRatio'),
(None, 'r'),
(None, 'refX'),
(None, 'refY'),
(None, 'repeatCount'),
(None, 'repeatDur'),
(None, 'requiredExtensions'),
(None, 'requiredFeatures'),
(None, 'restart'),
(None, 'rotate'),
(None, 'rx'),
(None, 'ry'),
(None, 'slope'),
(None, 'stemh'),
(None, 'stemv'),
(None, 'stop-color'),
(None, 'stop-opacity'),
(None, 'strikethrough-position'),
(None, 'strikethrough-thickness'),
(None, 'stroke'),
(None, 'stroke-dasharray'),
(None, 'stroke-dashoffset'),
(None, 'stroke-linecap'),
(None, 'stroke-linejoin'),
(None, 'stroke-miterlimit'),
(None, 'stroke-opacity'),
(None, 'stroke-width'),
(None, 'systemLanguage'),
(None, 'target'),
(None, 'text-anchor'),
(None, 'to'),
(None, 'transform'),
(None, 'type'),
(None, 'u1'),
(None, 'u2'),
(None, 'underline-position'),
(None, 'underline-thickness'),
(None, 'unicode'),
(None, 'unicode-range'),
(None, 'units-per-em'),
(None, 'values'),
(None, 'version'),
(None, 'viewBox'),
(None, 'visibility'),
(None, 'width'),
(None, 'widths'),
(None, 'x'),
(None, 'x-height'),
(None, 'x1'),
(None, 'x2'),
(namespaces['xlink'], 'actuate'),
(namespaces['xlink'], 'arcrole'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'role'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'title'),
(namespaces['xlink'], 'type'),
(namespaces['xml'], 'base'),
(namespaces['xml'], 'lang'),
(namespaces['xml'], 'space'),
(None, 'y'),
(None, 'y1'),
(None, 'y2'),
(None, 'zoomAndPan'),
))
attr_val_is_uri = frozenset((
(None, 'href'),
(None, 'src'),
(None, 'cite'),
(None, 'action'),
(None, 'longdesc'),
(None, 'poster'),
(None, 'background'),
(None, 'datasrc'),
(None, 'dynsrc'),
(None, 'lowsrc'),
(None, 'ping'),
(namespaces['xlink'], 'href'),
(namespaces['xml'], 'base'),
))
svg_attr_val_allows_ref = frozenset((
(None, 'clip-path'),
(None, 'color-profile'),
(None, 'cursor'),
(None, 'fill'),
(None, 'filter'),
(None, 'marker'),
(None, 'marker-start'),
(None, 'marker-mid'),
(None, 'marker-end'),
(None, 'mask'),
(None, 'stroke'),
))
svg_allow_local_href = frozenset((
(None, 'altGlyph'),
(None, 'animate'),
(None, 'animateColor'),
(None, 'animateMotion'),
(None, 'animateTransform'),
(None, 'cursor'),
(None, 'feImage'),
(None, 'filter'),
(None, 'linearGradient'),
(None, 'pattern'),
(None, 'radialGradient'),
(None, 'textpath'),
(None, 'tref'),
(None, 'set'),
(None, 'use')
))
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
))
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'!important',
'italic',
'left',
'lime',
'maroon',
'medium',
'none',
'navy',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'solid',
'silver',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
))
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-width',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
))
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
'https',
'irc',
'mailto',
'news',
'gopher',
'nntp',
'telnet',
'webcal',
'xmpp',
'callto',
'feed',
'urn',
'aim',
'rsync',
'tag',
'ssh',
'sftp',
'rtsp',
'afs',
'data',
))
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
'image/webp',
'image/bmp',
'text/plain',
))
data_content_type = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
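A quick illustration (not part of the commit; it assumes the module path html5lib/filters/sanitizer.py that this diff adds) of what the data_content_type pattern accepts when applied to the path of a parsed data: URI:
# Illustrative only: allowed_token() matches this pattern against
# urlparse("data:...").path before consulting allowed_content_types.
from html5lib.filters.sanitizer import data_content_type
m = data_content_type.match("image/png;base64,iVBORw0KGgo=")
assert m and m.group("content_type") == "image/png"
# text/html matches the pattern, but is dropped later because it is not
# listed in allowed_content_types.
m = data_content_type.match("text/html;charset=utf-8,<p>hi</p>")
assert m and m.group("content_type") == "text/html"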
class Filter(base.Filter):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
def __init__(self,
source,
allowed_elements=allowed_elements,
allowed_attributes=allowed_attributes,
allowed_css_properties=allowed_css_properties,
allowed_css_keywords=allowed_css_keywords,
allowed_svg_properties=allowed_svg_properties,
allowed_protocols=allowed_protocols,
allowed_content_types=allowed_content_types,
attr_val_is_uri=attr_val_is_uri,
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
svg_allow_local_href=svg_allow_local_href):
super(Filter, self).__init__(source)
self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties
self.allowed_css_keywords = allowed_css_keywords
self.allowed_svg_properties = allowed_svg_properties
self.allowed_protocols = allowed_protocols
self.allowed_content_types = allowed_content_types
self.attr_val_is_uri = attr_val_is_uri
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
self.svg_allow_local_href = svg_allow_local_href
def __iter__(self): def __iter__(self):
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
token = self.sanitize_token(token) token = self.sanitize_token(token)
if token: if token:
yield token yield token
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
# ALLOWED_CSS_KEYWORDS, are allowed through. Attributes in ATTR_VAL_IS_URI
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in ("StartTag", "EndTag", "EmptyTag"):
name = token["name"]
namespace = token["namespace"]
if ((namespace, name) in self.allowed_elements or
(namespace is None and
(namespaces["html"], name) in self.allowed_elements)):
return self.allowed_token(token)
else:
return self.disallowed_token(token)
elif token_type == "Comment":
pass
else:
return token
def allowed_token(self, token):
if "data" in token:
attrs = token["data"]
attr_names = set(attrs.keys())
# Remove forbidden attributes
for to_remove in (attr_names - self.allowed_attributes):
del token["data"][to_remove]
attr_names.remove(to_remove)
# Remove attributes with disallowed URL values
for attr in (attr_names & self.attr_val_is_uri):
assert attr in attrs
# I don't have a clue where this regexp comes from or why it matches those
# characters, nor why we call unescape. I just know it's always been here.
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
# this will do is remove *more* than it otherwise would.
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = data_content_type.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
(namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespaces['xlink'], 'href')]
if (None, 'style') in attrs:
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
token["data"] = attrs
return token
def disallowed_token(self, token):
token_type = token["type"]
if token_type == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
assert token_type in ("StartTag", "EmptyTag")
attrs = []
for (ns, name), v in token["data"].items():
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.allowed_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
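For orientation, a minimal usage sketch of the rewritten sanitizer filter (illustrative only, not part of the commit; it assumes the getTreeWalker and HTMLSerializer APIs that html5lib already exports):
# Parse, walk the tree as a token stream, sanitize, then re-serialize.
import html5lib
from html5lib.filters.sanitizer import Filter as SanitizerFilter
from html5lib.serializer import HTMLSerializer
def clean(fragment):
    dom = html5lib.parseFragment(fragment, treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")
    return HTMLSerializer().render(SanitizerFilter(walker(dom)))
# <script> is not in allowed_elements, so it is emitted as escaped text;
# the javascript: href is removed because its scheme is not in allowed_protocols.
print(clean('<a href="javascript:alert(1)">hi</a><script>x()</script>'))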


@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
import re import re
from . import _base from . import base
from ..constants import rcdataElements, spaceCharacters from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(_base.Filter): class Filter(base.Filter):
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self): def __iter__(self):
preserve = 0 preserve = 0
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type == "StartTag" \ if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements): and (preserve or token["name"] in self.spacePreserveElements):


@ -1,39 +1,44 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass from six import with_metaclass, viewkeys, PY3
import types import types
from . import inputstream try:
from . import tokenizer from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from . import _inputstream
from . import _tokenizer
from . import treebuilders from . import treebuilders
from .treebuilders._base import Marker from .treebuilders.base import Marker
from . import utils from . import _utils
from . import constants from .constants import (
from .constants import spaceCharacters, asciiUpper2Lower spaceCharacters, asciiUpper2Lower,
from .constants import specialElements specialElements, headingElements, cdataElements, rcdataElements,
from .constants import headingElements tokenTypes, tagTokenTypes,
from .constants import cdataElements, rcdataElements namespaces,
from .constants import tokenTypes, ReparseException, namespaces htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements adjustForeignAttributes as adjustForeignAttributesMap,
from .constants import adjustForeignAttributes as adjustForeignAttributesMap adjustMathMLAttributes, adjustSVGAttributes,
from .constants import E E,
ReparseException
)
def parse(doc, treebuilder="etree", encoding=None, def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
namespaceHTMLElements=True):
"""Parse a string or file-like object into a tree""" """Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder) tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding) return p.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", encoding=None, def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
namespaceHTMLElements=True):
tb = treebuilders.getTreeBuilder(treebuilder) tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, encoding=encoding) return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function): def method_decorator_metaclass(function):
@ -52,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly """HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML""" malformed) HTML"""
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer, def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
strict=False, namespaceHTMLElements=True, debug=False):
""" """
strict - raise an exception when a parse error is encountered strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType) html5lib.treebuilders.getTreeBuilder(treeType)
tokenizer - a class that provides a stream of tokens to the treebuilder.
This may be replaced for e.g. a sanitizer which converts some tags to
text
""" """
# Raise an exception on the first error encountered # Raise an exception on the first error encountered
@ -72,29 +72,24 @@ class HTMLParser(object):
if tree is None: if tree is None:
tree = treebuilders.getTreeBuilder("etree") tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements) self.tree = tree(namespaceHTMLElements)
self.tokenizer_class = tokenizer
self.errors = [] self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()]) getPhases(debug).items()])
def _parse(self, stream, innerHTML=False, container="div", def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
encoding=None, parseMeta=True, useChardet=True, **kwargs):
self.innerHTMLMode = innerHTML self.innerHTMLMode = innerHTML
self.container = container self.container = container
self.tokenizer = self.tokenizer_class(stream, encoding=encoding, self.scripting = scripting
parseMeta=parseMeta, self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
useChardet=useChardet,
parser=self, **kwargs)
self.reset() self.reset()
while True: try:
try: self.mainLoop()
self.mainLoop() except ReparseException:
break self.reset()
except ReparseException: self.mainLoop()
self.reset()
def reset(self): def reset(self):
self.tree.reset() self.tree.reset()
@ -121,7 +116,7 @@ class HTMLParser(object):
self.phase.insertHtmlElement() self.phase.insertHtmlElement()
self.resetInsertionMode() self.resetInsertionMode()
else: else:
self.innerHTML = False self.innerHTML = False # pylint:disable=redefined-variable-type
self.phase = self.phases["initial"] self.phase = self.phases["initial"]
self.lastPhase = None self.lastPhase = None
@ -139,7 +134,7 @@ class HTMLParser(object):
""" """
if not hasattr(self, 'tokenizer'): if not hasattr(self, 'tokenizer'):
return None return None
return self.tokenizer.stream.charEncoding[0] return self.tokenizer.stream.charEncoding[0].name
def isHTMLIntegrationPoint(self, element): def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and if (element.name == "annotation-xml" and
@ -164,8 +159,10 @@ class HTMLParser(object):
ParseErrorToken = tokenTypes["ParseError"] ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens(): for token in self.normalizedTokens():
prev_token = None
new_token = token new_token = token
while new_token is not None: while new_token is not None:
prev_token = new_token
currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeNamespace = currentNode.namespace if currentNode else None
currentNodeName = currentNode.name if currentNode else None currentNodeName = currentNode.name if currentNode else None
@ -184,6 +181,7 @@ class HTMLParser(object):
type in (CharactersToken, SpaceCharactersToken))) or type in (CharactersToken, SpaceCharactersToken))) or
(currentNodeNamespace == namespaces["mathml"] and (currentNodeNamespace == namespaces["mathml"] and
currentNodeName == "annotation-xml" and currentNodeName == "annotation-xml" and
type == StartTagToken and
token["name"] == "svg") or token["name"] == "svg") or
(self.isHTMLIntegrationPoint(currentNode) and (self.isHTMLIntegrationPoint(currentNode) and
type in (StartTagToken, CharactersToken, SpaceCharactersToken))): type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
@ -204,10 +202,10 @@ class HTMLParser(object):
elif type == DoctypeToken: elif type == DoctypeToken:
new_token = phase.processDoctype(new_token) new_token = phase.processDoctype(new_token)
if (type == StartTagToken and token["selfClosing"] if (type == StartTagToken and prev_token["selfClosing"] and
and not token["selfClosingAcknowledged"]): not prev_token["selfClosingAcknowledged"]):
self.parseError("non-void-element-with-trailing-solidus", self.parseError("non-void-element-with-trailing-solidus",
{"name": token["name"]}) {"name": prev_token["name"]})
# When the loop finishes it's EOF # When the loop finishes it's EOF
reprocess = True reprocess = True
@ -222,7 +220,7 @@ class HTMLParser(object):
for token in self.tokenizer: for token in self.tokenizer:
yield self.normalizeToken(token) yield self.normalizeToken(token)
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree """Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed stream - a filelike object or string containing the HTML to be parsed
@ -231,13 +229,13 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used, the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
scripting - treat noscript elements as if javascript was turned on
""" """
self._parse(stream, innerHTML=False, encoding=encoding, self._parse(stream, False, None, *args, **kwargs)
parseMeta=parseMeta, useChardet=useChardet)
return self.tree.getDocument() return self.tree.getDocument()
def parseFragment(self, stream, container="div", encoding=None, def parseFragment(self, stream, *args, **kwargs):
parseMeta=False, useChardet=True):
"""Parse a HTML fragment into a well-formed tree fragment """Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property container - name of the element we're setting the innerHTML property
@ -249,12 +247,16 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used, the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
scripting - treat noscript elements as if javascript was turned on
""" """
self._parse(stream, True, container=container, encoding=encoding) self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment() return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars={}): def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory. # XXX The idea is to make errorcode mandatory.
if datavars is None:
datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict: if self.strict:
raise ParseError(E[errorcode] % datavars) raise ParseError(E[errorcode] % datavars)
@ -263,98 +265,25 @@ class HTMLParser(object):
""" HTML5 specific normalizations to the token stream """ """ HTML5 specific normalizations to the token stream """
if token["type"] == tokenTypes["StartTag"]: if token["type"] == tokenTypes["StartTag"]:
token["data"] = dict(token["data"][::-1]) raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
return token return token
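To make the "first wins" handling above concrete, a standalone sketch (illustrative only; attribute names are simplified to plain strings here):
from collections import OrderedDict
raw = [("href", "first"), ("class", "x"), ("href", "second")]
data = OrderedDict(raw)        # last occurrence wins at this point: href -> "second"
if len(raw) > len(data):
    data.update(raw[::-1])     # re-applying in reverse restores the first value
assert data["href"] == "first" and list(data) == ["href", "class"]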
def adjustMathMLAttributes(self, token): def adjustMathMLAttributes(self, token):
replacements = {"definitionurl": "definitionURL"} adjust_attributes(token, adjustMathMLAttributes)
for k, v in replacements.items():
if k in token["data"]:
token["data"][v] = token["data"][k]
del token["data"][k]
def adjustSVGAttributes(self, token): def adjustSVGAttributes(self, token):
replacements = { adjust_attributes(token, adjustSVGAttributes)
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
for originalName in list(token["data"].keys()):
if originalName in replacements:
svgName = replacements[originalName]
token["data"][svgName] = token["data"][originalName]
del token["data"][originalName]
def adjustForeignAttributes(self, token): def adjustForeignAttributes(self, token):
replacements = adjustForeignAttributesMap adjust_attributes(token, adjustForeignAttributesMap)
for originalName in token["data"].keys():
if originalName in replacements:
foreignName = replacements[originalName]
token["data"][foreignName] = token["data"][originalName]
del token["data"][originalName]
def reparseTokenNormal(self, token): def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
self.parser.phase() self.parser.phase()
def resetInsertionMode(self): def resetInsertionMode(self):
@ -419,11 +348,12 @@ class HTMLParser(object):
self.phase = self.phases["text"] self.phase = self.phases["text"]
@_utils.memoize
def getPhases(debug): def getPhases(debug):
def log(function): def log(function):
"""Logger that records which phase processes each token""" """Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in type_names = dict((value, key) for key, value in
constants.tokenTypes.items()) tokenTypes.items())
def wrapped(self, *args, **kwargs): def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0: if function.__name__.startswith("process") and len(args) > 0:
@ -432,7 +362,7 @@ def getPhases(debug):
info = {"type": type_names[token['type']]} info = {"type": type_names[token['type']]}
except: except:
raise raise
if token['type'] in constants.tagTokenTypes: if token['type'] in tagTokenTypes:
info["name"] = token['name'] info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__, self.parser.log.append((self.parser.tokenizer.state.__name__,
@ -451,6 +381,7 @@ def getPhases(debug):
else: else:
return type return type
# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))): class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing """Base class for helper object that implements each phase of processing
""" """
@ -517,77 +448,76 @@ def getPhases(debug):
if publicId != "": if publicId != "":
publicId = publicId.translate(asciiUpper2Lower) publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html" if (not correct or token["name"] != "html" or
or publicId.startswith( publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//", ("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//", "-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//", "-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//", "-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//", "-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//", "-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//", "-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//", "-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//", "-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//", "-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//", "-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//", "-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//", "-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//", "-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//", "-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//", "-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//", "-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//", "-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//", "-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//", "-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//", "-//ietf//dtd html strict//",
"-//ietf//dtd html//", "-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//", "-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//", "-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//", "-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//", "-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//", "-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//", "-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//", "-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//", "-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//", "-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//", "-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//", "-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//", "-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//", "-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//", "-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//", "-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//", "-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//", "-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//", "-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//", "-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//", "-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//", "-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//", "-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//", "-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//", "-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//", "-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//", "-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//", "-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//", "-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) "-//webtechs//dtd mozilla html//")) or
or publicId in publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en",
"-/w3c/dtd html 4.0 transitional/en", "html") or
"html") publicId.startswith(
or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//",
("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and
"-//w3c//dtd html 4.01 transitional//")) and systemId is None or
systemId is None systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks" self.parser.compatMode = "quirks"
elif (publicId.startswith( elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//", ("-//w3c//dtd xhtml 1.0 frameset//",
"-//w3c//dtd xhtml 1.0 transitional//")) "-//w3c//dtd xhtml 1.0 transitional//")) or
or publicId.startswith( publicId.startswith(
("-//w3c//dtd html 4.01 frameset//", ("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and "-//w3c//dtd html 4.01 transitional//")) and
systemId is not None): systemId is not None):
@ -660,13 +590,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("head", self.startTagHead) ("head", self.startTagHead)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead) (("head", "body", "html", "br"), self.endTagImplyHead)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -706,10 +636,11 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("title", self.startTagTitle), ("title", self.startTagTitle),
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), (("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
("script", self.startTagScript), ("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"), (("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand), self.startTagBaseLinkCommand),
@ -718,7 +649,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self. endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead), ("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr) (("br", "html", "body"), self.endTagHtmlBodyBr)
]) ])
@ -760,18 +691,25 @@ def getPhases(debug):
# the abstract Unicode string, and just use the # the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars # ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works. # to be encoded and as a ASCII-superset works.
data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = inputstream.ContentAttrParser(data) parser = _inputstream.ContentAttrParser(data)
codec = parser.parse() codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec) self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token): def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA") self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoScriptNoFramesStyle(self, token): def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case # Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT") self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
def startTagScript(self, token): def startTagScript(self, token):
self.tree.insertElement(token) self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
@ -797,15 +735,75 @@ def getPhases(debug):
def anythingElse(self): def anythingElse(self):
self.endTagHead(impliedTagToken("head")) self.endTagHead(impliedTagToken("head"))
# XXX If we implement a parser for which scripting is disabled we need to class InHeadNoscriptPhase(Phase):
# implement this phase. def __init__(self, parser, tree):
# Phase.__init__(self, parser, tree)
# class InHeadNoScriptPhase(Phase):
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
class AfterHeadPhase(Phase): class AfterHeadPhase(Phase):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("body", self.startTagBody), ("body", self.startTagBody),
("frameset", self.startTagFrameset), ("frameset", self.startTagFrameset),
@ -815,8 +813,8 @@ def getPhases(debug):
("head", self.startTagHead) ("head", self.startTagHead)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)]) self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
@ -874,10 +872,10 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
# Keep a ref to this for special handling of whitespace in <pre> # Set this to the default handler
self.processSpaceCharactersNonPre = self.processSpaceCharacters self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta", (("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"), "script", "style", "title"),
@ -885,7 +883,7 @@ def getPhases(debug):
("body", self.startTagBody), ("body", self.startTagBody),
("frameset", self.startTagFrameset), ("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details", (("address", "article", "aside", "blockquote", "center", "details",
"details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"), "section", "summary", "ul"),
self.startTagCloseP), self.startTagCloseP),
@ -911,7 +909,8 @@ def getPhases(debug):
("isindex", self.startTagIsIndex), ("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea), ("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame), ("iframe", self.startTagIFrame),
(("noembed", "noframes", "noscript"), self.startTagRawtext), ("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect), ("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt), (("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt), (("option", "optgroup"), self.startTagOpt),
@ -923,7 +922,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody), ("body", self.endTagBody),
("html", self.endTagHtml), ("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center", (("address", "article", "aside", "blockquote", "button", "center",
@ -942,17 +941,9 @@ def getPhases(debug):
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2): def isMatchingFormattingElement(self, node1, node2):
if node1.name != node2.name or node1.namespace != node2.namespace: return (node1.name == node2.name and
return False node1.namespace == node2.namespace and
elif len(node1.attributes) != len(node2.attributes): node1.attributes == node2.attributes)
return False
else:
attributes1 = sorted(node1.attributes.items())
attributes2 = sorted(node2.attributes.items())
for attr1, attr2 in zip(attributes1, attributes2):
if attr1 != attr2:
return False
return True
# helper # helper
def addFormattingElement(self, token): def addFormattingElement(self, token):
@ -988,8 +979,8 @@ def getPhases(debug):
data = token["data"] data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and if (data.startswith("\n") and
self.tree.openElements[-1].name in ("pre", "listing", "textarea") self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
and not self.tree.openElements[-1].hasContent()): not self.tree.openElements[-1].hasContent()):
data = data[1:] data = data[1:]
if data: if data:
self.tree.reconstructActiveFormattingElements() self.tree.reconstructActiveFormattingElements()
@ -1007,7 +998,7 @@ def getPhases(debug):
for char in token["data"]])): for char in token["data"]])):
self.parser.framesetOK = False self.parser.framesetOK = False
def processSpaceCharacters(self, token): def processSpaceCharactersNonPre(self, token):
self.tree.reconstructActiveFormattingElements() self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"]) self.tree.insertText(token["data"])
@ -1016,8 +1007,8 @@ def getPhases(debug):
def startTagBody(self, token): def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"}) self.parser.parseError("unexpected-start-tag", {"name": "body"})
if (len(self.tree.openElements) == 1 if (len(self.tree.openElements) == 1 or
or self.tree.openElements[1].name != "body"): self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML assert self.parser.innerHTML
else: else:
self.parser.framesetOK = False self.parser.framesetOK = False
@ -1232,6 +1223,12 @@ def getPhases(debug):
self.parser.framesetOK = False self.parser.framesetOK = False
self.startTagRawtext(token) self.startTagRawtext(token)
def startTagNoscript(self, token):
if self.parser.scripting:
self.startTagRawtext(token)
else:
self.startTagOther(token)
def startTagRawtext(self, token): def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)""" """iframe, noembed noframes, noscript(if scripting enabled)"""
self.parser.parseRCDataRawtext(token, "RAWTEXT") self.parser.parseRCDataRawtext(token, "RAWTEXT")
@ -1595,9 +1592,9 @@ def getPhases(debug):
class TextPhase(Phase): class TextPhase(Phase):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([]) self.startTagHandler = _utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("script", self.endTagScript)]) ("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -1629,7 +1626,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table # http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("caption", self.startTagCaption), ("caption", self.startTagCaption),
("colgroup", self.startTagColgroup), ("colgroup", self.startTagColgroup),
@ -1643,7 +1640,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("table", self.endTagTable), ("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td", (("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore) "tfoot", "th", "thead", "tr"), self.endTagIgnore)
@ -1820,14 +1817,14 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement) "thead", "tr"), self.startTagTableElement)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("caption", self.endTagCaption), ("caption", self.endTagCaption),
("table", self.endTagTable), ("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
@ -1892,13 +1889,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("col", self.startTagCol) ("col", self.startTagCol)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("colgroup", self.endTagColgroup), ("colgroup", self.endTagColgroup),
("col", self.endTagCol) ("col", self.endTagCol)
]) ])
@ -1926,6 +1923,7 @@ def getPhases(debug):
def startTagCol(self, token): def startTagCol(self, token):
self.tree.insertElement(token) self.tree.insertElement(token)
self.tree.openElements.pop() self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagOther(self, token): def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup() ignoreEndTag = self.ignoreEndTagColgroup()
@ -1955,7 +1953,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("tr", self.startTagTr), ("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell), (("td", "th"), self.startTagTableCell),
@ -1964,7 +1962,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable), ("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th", (("body", "caption", "col", "colgroup", "html", "td", "th",
@ -2053,7 +2051,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row # http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell), (("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead", (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
@ -2061,7 +2059,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("tr", self.endTagTr), ("tr", self.endTagTr),
("table", self.endTagTable), ("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
@ -2142,14 +2140,14 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther) "thead", "tr"), self.startTagTableOther)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("td", "th"), self.endTagTableCell), (("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
@ -2218,7 +2216,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("option", self.startTagOption), ("option", self.startTagOption),
("optgroup", self.startTagOptgroup), ("optgroup", self.startTagOptgroup),
@ -2228,7 +2226,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("option", self.endTagOption), ("option", self.endTagOption),
("optgroup", self.endTagOptgroup), ("optgroup", self.endTagOptgroup),
("select", self.endTagSelect) ("select", self.endTagSelect)
@ -2318,13 +2316,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.startTagTable) self.startTagTable)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.endTagTable) self.endTagTable)
]) ])
@ -2445,7 +2443,7 @@ def getPhases(debug):
def processEndTag(self, token): def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1 nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1] node = self.tree.openElements[-1]
if node.name != token["name"]: if node.name.translate(asciiUpper2Lower) != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while True: while True:
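The case-insensitive comparison above stops the foreign-content phase from logging a spurious error when a lower-cased end tag closes a camelCase SVG or MathML element. A quick check using only the public API:

    import html5lib

    parser = html5lib.HTMLParser()
    parser.parse("<svg><foreignObject></foreignObject></svg>")
    codes = [code for _, code, _ in parser.errors]
    # with this change, "unexpected-end-tag" is no longer reported for foreignObject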
@ -2472,12 +2470,12 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml) ("html", self.startTagHtml)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
@ -2520,7 +2518,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("frameset", self.startTagFrameset), ("frameset", self.startTagFrameset),
("frame", self.startTagFrame), ("frame", self.startTagFrame),
@ -2528,7 +2526,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("frameset", self.endTagFrameset) ("frameset", self.endTagFrameset)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -2577,13 +2575,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("noframes", self.startTagNoframes) ("noframes", self.startTagNoframes)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("html", self.endTagHtml) ("html", self.endTagHtml)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -2613,7 +2611,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml) ("html", self.startTagHtml)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
@ -2651,7 +2649,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("noframes", self.startTagNoFrames) ("noframes", self.startTagNoFrames)
]) ])
@ -2682,13 +2680,14 @@ def getPhases(debug):
def processEndTag(self, token): def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag", self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]}) {"name": token["name"]})
# pylint:enable=unused-argument
return { return {
"initial": InitialPhase, "initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase, "beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase, "beforeHead": BeforeHeadPhase,
"inHead": InHeadPhase, "inHead": InHeadPhase,
# XXX "inHeadNoscript": InHeadNoScriptPhase, "inHeadNoscript": InHeadNoscriptPhase,
"afterHead": AfterHeadPhase, "afterHead": AfterHeadPhase,
"inBody": InBodyPhase, "inBody": InBodyPhase,
"text": TextPhase, "text": TextPhase,
@ -2711,6 +2710,16 @@ def getPhases(debug):
} }
def adjust_attributes(token, replacements):
if PY3 or _utils.PY27:
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
else:
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items())
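The new adjust_attributes helper rebuilds a token's attribute dict, renaming any key found in the replacement table while keeping order. A small illustration with a hand-rolled table; the real tables live in constants (for example adjustSVGAttributes):

    from collections import OrderedDict

    token = {"data": OrderedDict([("viewbox", "0 0 10 10"), ("fill", "red")])}
    replacements = {"viewbox": "viewBox"}   # illustrative subset only

    # same logic as adjust_attributes, shown on the Python 2 frozenset branch
    if frozenset(token["data"]) & frozenset(replacements):
        token["data"] = OrderedDict((replacements.get(k, k), v)
                                    for k, v in token["data"].items())

    # token["data"] is now OrderedDict([("viewBox", "0 0 10 10"), ("fill", "red")])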
def impliedTagToken(name, type="EndTag", attributes=None, def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False): selfClosing=False):
if attributes is None: if attributes is None:
View file
@ -1,300 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
content_type_rgx = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs', 'data']
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in list(tokenTypes.keys()):
token_type = tokenTypes[token_type]
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
return self.allowed_token(token, token_type)
else:
return self.disallowed_token(token, token_type)
elif token_type == tokenTypes["Comment"]:
pass
else:
return token
def allowed_token(self, token, token_type):
if "data" in token:
attrs = dict([(name, val) for name, val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if attr not in attrs:
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = content_type_rgx.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if 'style' in attrs:
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name, val] for name, val in list(attrs.items())]
return token
def disallowed_token(self, token, token_type):
if token_type == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
token["data"] = "<%s%s>" % (token["name"], attrs)
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
if token["type"] in list(tokenTypes.keys()):
token["type"] = "Characters"
else:
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
# Change case matching defaults as we only output lowercase html anyway
# This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
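The tokenizer-level HTMLSanitizer above is removed by this update; sanitization now happens as a tree-walker filter. A minimal sketch of the replacement usage, assuming the filters.sanitizer module and the serializer's sanitize option shown further down:

    import html5lib

    frag = html5lib.parseFragment('<p onmouseover="evil()">hi<script>bad()</script></p>')
    clean = html5lib.serialize(frag, sanitize=True)
    # the onmouseover attribute is dropped and the script element is escaped
    # rather than emitted as markup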
View file
@ -1,79 +1,87 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type from six import text_type
try: import re
from functools import reduce
except ImportError:
pass
from ..constants import voidElements, booleanAttributes, spaceCharacters from codecs import register_error, xmlcharrefreplace_errors
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
spaceCharacters = "".join(spaceCharacters) _quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
encode_entity_map = {} _encode_entity_map = {}
is_ucs4 = len("\U0010FFFF") == 1 _is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()): for k, v in list(entities.items()):
# skip multi-character entities # skip multi-character entities
if ((is_ucs4 and len(v) > 1) or if ((_is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)): (not _is_ucs4 and len(v) > 2)):
continue continue
if v != "&": if v != "&":
if len(v) == 2: if len(v) == 2:
v = utils.surrogatePairToCodepoint(v) v = _utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else: else:
return xmlcharrefreplace_errors(exc) v = ord(v)
if v not in _encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
_encode_entity_map[v] = k
register_error(unicode_encode_errors, htmlentityreplace_errors)
del register_error def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = _encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors)
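Once the serializer module has been imported, the handler is registered under the fixed name "htmlentityreplace" and can be used like any other codec error handler. Roughly:

    import html5lib.serializer  # registers the error handler as an import side effect

    "caf\u00e9 \u4e2d".encode("ascii", "htmlentityreplace")
    # -> b'caf&eacute; &#x4e2d;'  (named entity where one exists, numeric reference otherwise)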
def serialize(input, tree="etree", encoding=None, **serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts)
return s.render(walker(input), encoding)
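The new module-level serialize() pairs a tree walker with an HTMLSerializer in one call. A short usage sketch with the default etree tree:

    import html5lib

    frag = html5lib.parseFragment("<p>Hello<br>world")
    html5lib.serialize(frag, omit_optional_tags=False)
    # -> '<p>Hello<br>world</p>' (approximately; output depends on serializer options)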
class HTMLSerializer(object): class HTMLSerializer(object):
# attribute quoting options # attribute quoting options
quote_attr_values = False quote_attr_values = "legacy" # be secure by default
quote_char = '"' quote_char = '"'
use_best_quote_char = True use_best_quote_char = True
@ -109,9 +117,9 @@ class HTMLSerializer(object):
inject_meta_charset=True|False inject_meta_charset=True|False
Whether it insert a meta element to define the character set of the Whether it insert a meta element to define the character set of the
document. document.
quote_attr_values=True|False quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting Whether to quote attribute values that don't require quoting
per HTML5 parsing rules. per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'" quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote, use double quote unless attribute value contains a double quote,
@ -147,6 +155,9 @@ class HTMLSerializer(object):
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
""" """
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs: if 'quote_char' in kwargs:
self.use_best_quote_char = False self.use_best_quote_char = False
for attr in self.options: for attr in self.options:
@ -157,7 +168,7 @@ class HTMLSerializer(object):
def encode(self, string): def encode(self, string):
assert(isinstance(string, text_type)) assert(isinstance(string, text_type))
if self.encoding: if self.encoding:
return string.encode(self.encoding, unicode_encode_errors) return string.encode(self.encoding, "htmlentityreplace")
else: else:
return string return string
@ -169,28 +180,30 @@ class HTMLSerializer(object):
return string return string
def serialize(self, treewalker, encoding=None): def serialize(self, treewalker, encoding=None):
# pylint:disable=too-many-nested-blocks
self.encoding = encoding self.encoding = encoding
in_cdata = False in_cdata = False
self.errors = [] self.errors = []
if encoding and self.inject_meta_charset: if encoding and self.inject_meta_charset:
from ..filters.inject_meta_charset import Filter from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding) treewalker = Filter(treewalker, encoding)
# Alphabetical attributes is here under the assumption that none of
# the later filters add or change order of attributes; it needs to be
# before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter # WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter # for maximum efficiently of this latter filter
if self.strip_whitespace: if self.strip_whitespace:
from ..filters.whitespace import Filter from .filters.whitespace import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
if self.sanitize: if self.sanitize:
from ..filters.sanitizer import Filter from .filters.sanitizer import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
if self.omit_optional_tags: if self.omit_optional_tags:
from ..filters.optionaltags import Filter from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
# Alphabetical attributes must be last, as other filters
# could add attributes and alter the order
if self.alphabetical_attributes:
from ..filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
for token in treewalker: for token in treewalker:
@ -229,7 +242,7 @@ class HTMLSerializer(object):
in_cdata = True in_cdata = True
elif in_cdata: elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element") self.serializeError("Unexpected child element of a CDATA element")
for (attr_namespace, attr_name), attr_value in token["data"].items(): for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here # TODO: Add namespace support here
k = attr_name k = attr_name
v = attr_value v = attr_value
@ -237,14 +250,18 @@ class HTMLSerializer(object):
yield self.encodeStrict(k) yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \ if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) (k not in booleanAttributes.get(name, tuple()) and
and k not in booleanAttributes.get("", tuple())): k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=") yield self.encodeStrict("=")
if self.quote_attr_values or not v: if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else: else:
quote_attr = reduce(lambda x, y: x or (y in v), raise ValueError("quote_attr_values must be one of: "
spaceCharacters + ">\"'=", False) "'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;") v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;") v = v.replace("<", "&lt;")
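The effect of the three quote_attr_values modes can be seen with a value such as a/b, which the legacy character set quotes but the spec set does not. A hedged illustration; exact output also depends on other serializer options:

    import html5lib

    frag = html5lib.parseFragment('<img title=a/b>')
    html5lib.serialize(frag, quote_attr_values="spec")    # -> '<img title=a/b>'
    html5lib.serialize(frag, quote_attr_values="legacy")  # -> '<img title="a/b">'
    html5lib.serialize(frag, quote_attr_values="always")  # -> '<img title="a/b">'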
@ -312,6 +329,6 @@ class HTMLSerializer(object):
raise SerializeError raise SerializeError
def SerializeError(Exception): class SerializeError(Exception):
"""Error in serialized tree""" """Error in serialized tree"""
pass pass
View file
@ -1,16 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from .. import treewalkers
from .htmlserializer import HTMLSerializer
def serialize(input, tree="etree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
else:
raise ValueError("type must be html")
return s.render(walker(input), encoding)
View file
@ -5,7 +5,7 @@ from . import sax
__all__ = ["sax"] __all__ = ["sax"]
try: try:
from . import genshi # flake8: noqa from . import genshi # noqa
except ImportError: except ImportError:
pass pass
else: else:
View file
@ -28,7 +28,7 @@ to the format used in the unittests
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree from .._utils import default_etree
treeBuilderCache = {} treeBuilderCache = {}
View file
@ -126,6 +126,7 @@ class TreeBuilder(object):
commentClass - the class to use for comments commentClass - the class to use for comments
doctypeClass - the class to use for doctypes doctypeClass - the class to use for doctypes
""" """
# pylint:disable=not-callable
# Document class # Document class
documentClass = None documentClass = None
@ -166,12 +167,17 @@ class TreeBuilder(object):
# If we pass a node in we match that. if we pass a string # If we pass a node in we match that. if we pass a string
# match any node with that name # match any node with that name
exactNode = hasattr(target, "nameTuple") exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant] listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements): for node in reversed(self.openElements):
if (node.name == target and not exactNode or if exactNode and node == target:
node == target and exactNode): return True
elif not exactNode and node.nameTuple == target:
return True return True
elif (invert ^ (node.nameTuple in listElements)): elif (invert ^ (node.nameTuple in listElements)):
return False return False
@ -353,8 +359,8 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None): def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name name = self.openElements[-1].name
# XXX td, th and tr are not actually needed # XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
and name != exclude): name != exclude):
self.openElements.pop() self.openElements.pop()
# XXX This is not entirely what the specification says. We should # XXX This is not entirely what the specification says. We should
# investigate it more closely. # investigate it more closely.
View file
@ -1,54 +1,62 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from collections import MutableMapping
from xml.dom import minidom, Node from xml.dom import minidom, Node
import weakref import weakref
from . import _base from . import base
from .. import constants from .. import constants
from ..constants import namespaces from ..constants import namespaces
from ..utils import moduleFactoryFactory from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation): def getDomBuilder(DomImplementation):
Dom = DomImplementation Dom = DomImplementation
class AttrList(object): class AttrList(MutableMapping):
def __init__(self, element): def __init__(self, element):
self.element = element self.element = element
def __iter__(self): def __iter__(self):
return list(self.element.attributes.items()).__iter__() return iter(self.element.attributes.keys())
def __setitem__(self, name, value): def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self):
return [(item[0], item[1]) for item in
list(self.element.attributes.items())]
def keys(self):
return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple): if isinstance(name, tuple):
raise NotImplementedError raise NotImplementedError
else: else:
return self.element.hasAttribute(name) attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
class NodeBuilder(_base.Node): def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element): def __init__(self, element):
_base.Node.__init__(self, element.nodeName) base.Node.__init__(self, element.nodeName)
self.element = element self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI") namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
and self.element.namespaceURI or None) self.element.namespaceURI or None)
def appendChild(self, node): def appendChild(self, node):
node.parent = self node.parent = self
@ -109,7 +117,7 @@ def getDomBuilder(DomImplementation):
nameTuple = property(getNameTuple) nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self): def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None) self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self) return weakref.proxy(self)
@ -149,15 +157,16 @@ def getDomBuilder(DomImplementation):
return self.dom return self.dom
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self).element return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None): def insertText(self, data, parent=None):
data = data data = data
if parent != self: if parent != self:
_base.TreeBuilder.insertText(self, data, parent) base.TreeBuilder.insertText(self, data, parent)
else: else:
# HACK: allow text nodes as children of the document node # HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'): if hasattr(self.dom, '_child_node_types'):
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types: if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE) self.dom._child_node_types.append(Node.TEXT_NODE)
View file
@ -1,13 +1,15 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type from six import text_type
import re import re
from . import _base from . import base
from .. import ihatexml from .. import _ihatexml
from .. import constants from .. import constants
from ..constants import namespaces from ..constants import namespaces
from ..utils import moduleFactoryFactory from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)") tag_regexp = re.compile("{([^}]*)}(.*)")
@ -16,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node): class Element(base.Node):
def __init__(self, name, namespace=None): def __init__(self, name, namespace=None):
self._name = name self._name = name
self._namespace = namespace self._namespace = namespace
@ -98,6 +100,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = self node.parent = self
def removeChild(self, node): def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element) self._element.remove(node._element)
node.parent = None node.parent = None
@ -139,7 +142,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if self._element.text is not None: if self._element.text is not None:
newParent._element.text += self._element.text newParent._element.text += self._element.text
self._element.text = "" self._element.text = ""
_base.Node.reparentChildren(self, newParent) base.Node.reparentChildren(self, newParent)
class Comment(Element): class Comment(Element):
def __init__(self, data): def __init__(self, data):
@ -253,10 +256,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "\n".join(rv) return "\n".join(rv)
def tostring(element): def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
filter = ihatexml.InfosetFilter() filter = _ihatexml.InfosetFilter()
def serializeElement(element): def serializeElement(element):
if isinstance(element, ElementTree.ElementTree): if isinstance(element, ElementTree.ElementTree):
@ -307,7 +310,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "".join(rv) return "".join(rv)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType
elementClass = Element elementClass = Element
@ -329,7 +332,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self.document._element.find("html") return self.document._element.find("html")
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element return base.TreeBuilder.getFragment(self)._element
return locals() return locals()
View file
@ -10,16 +10,17 @@ When any of these things occur, we emit a DataLossWarning
""" """
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings import warnings
import re import re
import sys import sys
from . import _base from . import base
from ..constants import DataLossWarning from ..constants import DataLossWarning
from .. import constants from .. import constants
from . import etree as etree_builders from . import etree as etree_builders
from .. import ihatexml from .. import _ihatexml
import lxml.etree as etree import lxml.etree as etree
@ -53,8 +54,7 @@ class Document(object):
def testSerializer(element): def testSerializer(element):
rv = [] rv = []
finalText = None infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
@ -128,16 +128,12 @@ def testSerializer(element):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0) serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
return "\n".join(rv) return "\n".join(rv)
def tostring(element): def tostring(element):
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
finalText = None
def serializeElement(element): def serializeElement(element):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
@ -173,13 +169,10 @@ def tostring(element):
serializeElement(element) serializeElement(element)
if finalText is not None:
rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv) return "".join(rv)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(base.TreeBuilder):
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType
elementClass = None elementClass = None
@ -189,13 +182,15 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False): def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree) builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict): class Attributes(dict):
def __init__(self, element, value={}): def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element self._element = element
dict.__init__(self, value) dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items(): for key, value in self.items():
if isinstance(key, tuple): if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@ -259,10 +254,10 @@ class TreeBuilder(_base.TreeBuilder):
self.elementClass = Element self.elementClass = Element
self.commentClass = Comment self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment # self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements) base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self): def reset(self):
_base.TreeBuilder.reset(self) base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial self.insertComment = self.insertCommentInitial
self.initial_comments = [] self.initial_comments = []
self.doctype = None self.doctype = None
@ -303,12 +298,14 @@ class TreeBuilder(_base.TreeBuilder):
self.doctype = doctype self.doctype = doctype
def insertCommentInitial(self, data, parent=None): def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data) self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None): def insertCommentMain(self, data, parent=None):
if (parent == self.document and if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type): self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent) super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token): def insertRoot(self, token):
View file
@ -10,10 +10,10 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
from .. import constants from .. import constants
from ..utils import default_etree from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
treeWalkerCache = {} treeWalkerCache = {}
@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
from . import dom from . import dom
treeWalkerCache[treeType] = dom.TreeWalker treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi": elif treeType == "genshi":
from . import genshistream from . import genshi
treeWalkerCache[treeType] = genshistream.TreeWalker treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml": elif treeType == "lxml":
from . import lxmletree from . import etree_lxml
treeWalkerCache[treeType] = lxmletree.TreeWalker treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree": elif treeType == "etree":
from . import etree from . import etree
if implementation is None: if implementation is None:
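Only the backing module names change here (genshistream becomes genshi, lxmletree becomes etree_lxml); the public tree-type names passed to getTreeWalker stay the same:

    import html5lib

    walker = html5lib.getTreeWalker("etree")
    doc = html5lib.parse("<p>hi")
    tokens = list(walker(doc))   # dict tokens: StartTag, Characters, EndTag, ...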
View file
@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"] "TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE TEXT = Node.TEXT_NODE
@ -14,28 +14,9 @@ COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>" UNKNOWN = "<#UNKNOWN#>"
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
def to_text(s, blank_if_none=True):
"""Wrapper around six.text_type to convert None to empty string"""
if s is None:
if blank_if_none:
return ""
else:
return None
elif isinstance(s, text_type):
return s
else:
return text_type(s)
def is_text_or_none(string):
"""Wrapper around isinstance(string_types) or is None"""
return string is None or isinstance(string, string_types)
class TreeWalker(object): class TreeWalker(object):
     def __init__(self, tree):
         self.tree = tree
@@ -47,47 +28,25 @@ class TreeWalker(object):
         return {"type": "SerializeError", "data": msg}

     def emptyTag(self, namespace, name, attrs, hasChildren=False):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(name)
-        assert all((namespace is None or isinstance(namespace, string_types)) and
-                   isinstance(name, string_types) and
-                   isinstance(value, string_types)
-                   for (namespace, name), value in attrs.items())
-        yield {"type": "EmptyTag", "name": to_text(name, False),
-               "namespace": to_text(namespace),
+        yield {"type": "EmptyTag", "name": name,
+               "namespace": namespace,
                "data": attrs}
         if hasChildren:
             yield self.error("Void element has children")

     def startTag(self, namespace, name, attrs):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(name)
-        assert all((namespace is None or isinstance(namespace, string_types)) and
-                   isinstance(name, string_types) and
-                   isinstance(value, string_types)
-                   for (namespace, name), value in attrs.items())
         return {"type": "StartTag",
-                "name": text_type(name),
-                "namespace": to_text(namespace),
-                "data": dict(((to_text(namespace, False), to_text(name)),
-                              to_text(value, False))
-                             for (namespace, name), value in attrs.items())}
+                "name": name,
+                "namespace": namespace,
+                "data": attrs}

     def endTag(self, namespace, name):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(namespace)
         return {"type": "EndTag",
-                "name": to_text(name, False),
-                "namespace": to_text(namespace),
-                "data": {}}
+                "name": name,
+                "namespace": namespace}

     def text(self, data):
-        assert isinstance(data, string_types), type(data)
-        data = to_text(data)
+        data = data
         middle = data.lstrip(spaceCharacters)
         left = data[:len(data) - len(middle)]
         if left:
@@ -101,25 +60,16 @@ class TreeWalker(object):
         yield {"type": "SpaceCharacters", "data": right}

     def comment(self, data):
-        assert isinstance(data, string_types), type(data)
-        return {"type": "Comment", "data": text_type(data)}
+        return {"type": "Comment", "data": data}

-    def doctype(self, name, publicId=None, systemId=None, correct=True):
-        assert is_text_or_none(name), type(name)
-        assert is_text_or_none(publicId), type(publicId)
-        assert is_text_or_none(systemId), type(systemId)
+    def doctype(self, name, publicId=None, systemId=None):
         return {"type": "Doctype",
-                "name": to_text(name),
-                "publicId": to_text(publicId),
-                "systemId": to_text(systemId),
-                "correct": to_text(correct)}
+                "name": name,
+                "publicId": publicId,
+                "systemId": systemId}

     def entity(self, name):
-        assert isinstance(name, string_types), type(name)
-        return {"type": "Entity", "name": text_type(name)}
+        return {"type": "Entity", "name": name}

     def unknown(self, nodeType):
         return self.error("Unknown node type: " + nodeType)
@@ -154,7 +104,7 @@ class NonRecursiveTreeWalker(TreeWalker):

             elif type == ELEMENT:
                 namespace, name, attributes, hasChildren = details
-                if name in voidElements:
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     for token in self.emptyTag(namespace, name, attributes,
                                                hasChildren):
                         yield token
@@ -187,7 +137,7 @@ class NonRecursiveTreeWalker(TreeWalker):
                     type, details = details[0], details[1:]
                     if type == ELEMENT:
                         namespace, name, attributes, hasChildren = details
-                        if name not in voidElements:
+                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                             yield self.endTag(namespace, name)
                     if self.tree is currentNode:
                         currentNode = None
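
The hunks above drop the per-call assertions and to_text()/text_type() coercions, so walkers now emit the caller's names, namespaces and attribute dicts unchanged, and an element is only reported as an EmptyTag when it is a void element in the HTML namespace. A minimal sketch of the resulting token stream through html5lib's public API (the sample markup and the commented output shapes are illustrative, not taken from this commit):

# Illustrative sketch (not part of this commit): iterating a walker to see the
# plain token dicts the base TreeWalker methods above now emit.
import html5lib

frag = html5lib.parseFragment("<p>hi<br></p>")   # default "etree" tree builder
TreeWalker = html5lib.getTreeWalker("etree")     # getTreeWalker returns the walker class

for token in TreeWalker(frag):
    print(token)
# Expected shape (values abbreviated):
#   {'type': 'StartTag', 'name': 'p', 'namespace': 'http://www.w3.org/1999/xhtml', 'data': ...}
#   {'type': 'Characters', 'data': 'hi'}
#   {'type': 'EmptyTag', 'name': 'br', ...}   # void only because it sits in the HTML namespace
#   {'type': 'EndTag', 'name': 'p', ...}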


@@ -2,16 +2,16 @@ from __future__ import absolute_import, division, unicode_literals

 from xml.dom import Node

-from . import _base
+from . import base


-class TreeWalker(_base.NonRecursiveTreeWalker):
+class TreeWalker(base.NonRecursiveTreeWalker):
     def getNodeDetails(self, node):
         if node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            return _base.DOCTYPE, node.name, node.publicId, node.systemId
+            return base.DOCTYPE, node.name, node.publicId, node.systemId

         elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
-            return _base.TEXT, node.nodeValue
+            return base.TEXT, node.nodeValue

         elif node.nodeType == Node.ELEMENT_NODE:
             attrs = {}
@@ -21,17 +21,17 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
                     attrs[(attr.namespaceURI, attr.localName)] = attr.value
                 else:
                     attrs[(None, attr.name)] = attr.value
-            return (_base.ELEMENT, node.namespaceURI, node.nodeName,
+            return (base.ELEMENT, node.namespaceURI, node.nodeName,
                     attrs, node.hasChildNodes())

         elif node.nodeType == Node.COMMENT_NODE:
-            return _base.COMMENT, node.nodeValue
+            return base.COMMENT, node.nodeValue

         elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
-            return (_base.DOCUMENT,)
+            return (base.DOCUMENT,)

         else:
-            return _base.UNKNOWN, node.nodeType
+            return base.UNKNOWN, node.nodeType

     def getFirstChild(self, node):
         return node.firstChild
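
The DOM walker hunks only track the _base to base module rename; behaviour is unchanged. For orientation, a small sketch of feeding it a minidom tree built with the "dom" tree builder (the sample markup is an assumption, not from the commit):

# Illustrative sketch (not part of this commit): the same token stream from a
# minidom document built with the "dom" tree builder.
import html5lib

doc = html5lib.parse("<!DOCTYPE html><title>x</title>", treebuilder="dom")
TreeWalker = html5lib.getTreeWalker("dom")
tokens = list(TreeWalker(doc))
print(tokens[0]["type"], tokens[1]["type"])   # expected roughly: Doctype StartTag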


@@ -12,8 +12,8 @@ import re

 from six import string_types

-from . import _base
-from ..utils import moduleFactoryFactory
+from . import base
+from .._utils import moduleFactoryFactory

 tag_regexp = re.compile("{([^}]*)}(.*)")
@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
     ElementTree = ElementTreeImplementation
     ElementTreeCommentType = ElementTree.Comment("asd").tag

-    class TreeWalker(_base.NonRecursiveTreeWalker):
+    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
         """Given the particular ElementTree representation, this implementation,
         to avoid using recursion, returns "nodes" as tuples with the following
         content:
@@ -38,9 +38,9 @@ def getETreeBuilder(ElementTreeImplementation):
         """
         def getNodeDetails(self, node):
             if isinstance(node, tuple):  # It might be the root Element
-                elt, key, parents, flag = node
+                elt, _, _, flag = node
                 if flag in ("text", "tail"):
-                    return _base.TEXT, getattr(elt, flag)
+                    return base.TEXT, getattr(elt, flag)
                 else:
                     node = elt
@@ -48,14 +48,14 @@ def getETreeBuilder(ElementTreeImplementation):
                 node = node.getroot()

             if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
-                return (_base.DOCUMENT,)
+                return (base.DOCUMENT,)
             elif node.tag == "<!DOCTYPE>":
-                return (_base.DOCTYPE, node.text,
+                return (base.DOCTYPE, node.text,
                         node.get("publicId"), node.get("systemId"))
             elif node.tag == ElementTreeCommentType:
-                return _base.COMMENT, node.text
+                return base.COMMENT, node.text
             else:
                 assert isinstance(node.tag, string_types), type(node.tag)
@@ -73,7 +73,7 @@ def getETreeBuilder(ElementTreeImplementation):
                         attrs[(match.group(1), match.group(2))] = value
                     else:
                         attrs[(None, name)] = value
-                return (_base.ELEMENT, namespace, tag,
+                return (base.ELEMENT, namespace, tag,
                         attrs, len(node) or node.text)

         def getFirstChild(self, node):
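
In the ElementTree walker the changes are likewise mechanical: the base/_utils renames, a pylint pragma, and unused unpacked names replaced with _. A hedged sketch of obtaining that walker for an explicit ElementTree implementation (the implementation keyword of getTreeWalker is assumed here; it is not shown in this diff):

# Illustrative sketch (not part of this commit): getETreeBuilder() is wrapped by
# moduleFactoryFactory, so a walker can be requested for a specific
# ElementTree implementation.
import xml.etree.ElementTree as ElementTree
import html5lib

frag = html5lib.parseFragment("<span id='a'>text</span>")
TreeWalker = html5lib.getTreeWalker("etree", implementation=ElementTree)
for token in TreeWalker(frag):
    print(token["type"], token.get("name", ""))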


@@ -4,9 +4,9 @@ from six import text_type

 from lxml import etree
 from ..treebuilders.etree import tag_regexp

-from . import _base
-from .. import ihatexml
+from . import base
+from .. import _ihatexml


 def ensure_str(s):
@@ -15,20 +15,27 @@ def ensure_str(s):
     elif isinstance(s, text_type):
         return s
     else:
-        return s.decode("utf-8", "strict")
+        return s.decode("ascii", "strict")


 class Root(object):
     def __init__(self, et):
         self.elementtree = et
         self.children = []
-        if et.docinfo.internalDTD:
-            self.children.append(Doctype(self,
-                                         ensure_str(et.docinfo.root_name),
-                                         ensure_str(et.docinfo.public_id),
-                                         ensure_str(et.docinfo.system_url)))
-        root = et.getroot()
-        node = root
+
+        try:
+            if et.docinfo.internalDTD:
+                self.children.append(Doctype(self,
+                                             ensure_str(et.docinfo.root_name),
+                                             ensure_str(et.docinfo.public_id),
+                                             ensure_str(et.docinfo.system_url)))
+        except AttributeError:
+            pass
+
+        try:
+            node = et.getroot()
+        except AttributeError:
+            node = et

         while node.getprevious() is not None:
             node = node.getprevious()
@@ -115,35 +122,38 @@ class FragmentWrapper(object):
         return len(self.obj)


-class TreeWalker(_base.NonRecursiveTreeWalker):
+class TreeWalker(base.NonRecursiveTreeWalker):
     def __init__(self, tree):
-        if hasattr(tree, "getroot"):
-            tree = Root(tree)
-        elif isinstance(tree, list):
+        # pylint:disable=redefined-variable-type
+        if isinstance(tree, list):
+            self.fragmentChildren = set(tree)
             tree = FragmentRoot(tree)
-        _base.NonRecursiveTreeWalker.__init__(self, tree)
-        self.filter = ihatexml.InfosetFilter()
+        else:
+            self.fragmentChildren = set()
+            tree = Root(tree)
+        base.NonRecursiveTreeWalker.__init__(self, tree)
+        self.filter = _ihatexml.InfosetFilter()

     def getNodeDetails(self, node):
         if isinstance(node, tuple):  # Text node
             node, key = node
             assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
-            return _base.TEXT, ensure_str(getattr(node, key))
+            return base.TEXT, ensure_str(getattr(node, key))

         elif isinstance(node, Root):
-            return (_base.DOCUMENT,)
+            return (base.DOCUMENT,)

         elif isinstance(node, Doctype):
-            return _base.DOCTYPE, node.name, node.public_id, node.system_id
+            return base.DOCTYPE, node.name, node.public_id, node.system_id

         elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
-            return _base.TEXT, node.obj
+            return base.TEXT, ensure_str(node.obj)

         elif node.tag == etree.Comment:
-            return _base.COMMENT, ensure_str(node.text)
+            return base.COMMENT, ensure_str(node.text)

         elif node.tag == etree.Entity:
-            return _base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;
+            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

         else:
             # This is assumed to be an ordinary element
@@ -162,7 +172,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
                     attrs[(match.group(1), match.group(2))] = value
                 else:
                     attrs[(None, name)] = value
-            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
+            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                     attrs, len(node) > 0 or node.text)

     def getFirstChild(self, node):
@@ -197,5 +207,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
             if key == "text":
                 return node
             # else: fallback to "normal" processing
+        elif node in self.fragmentChildren:
+            return None

         return node.getparent()
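
The lxml walker now guards the docinfo/getroot() lookups, remembers the fragment's top-level nodes in fragmentChildren so getParentNode() cannot escape the fragment, and coerces wrapped fragment text through ensure_str(). A rough sketch of exercising that fragment path (the markup and the commented expectation are illustrative assumptions):

# Illustrative sketch (not part of this commit): with the lxml tree builder a
# fragment is a list of top-level nodes; leading text arrives as a plain string
# handled by FragmentWrapper, and the walker stops at fragmentChildren.
import html5lib

frag = html5lib.parseFragment("a<b>c</b>", treebuilder="lxml")
TreeWalker = html5lib.getTreeWalker("lxml")
for token in TreeWalker(frag):
    print(token)
# expected roughly: Characters 'a', StartTag 'b', Characters 'c', EndTag 'b'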


@@ -4,12 +4,12 @@ from genshi.core import QName
 from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
 from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT

-from . import _base
+from . import base

 from ..constants import voidElements, namespaces


-class TreeWalker(_base.TreeWalker):
+class TreeWalker(base.TreeWalker):
     def __iter__(self):
         # Buffer the events so we can pass in the following one
         previous = None
@@ -25,7 +25,7 @@ class TreeWalker(_base.TreeWalker):
             yield token

     def tokens(self, event, next):
-        kind, data, pos = event
+        kind, data, _ = event
         if kind == START:
             tag, attribs = data
             name = tag.localname
@@ -39,8 +39,8 @@ class TreeWalker(_base.TreeWalker):
             if namespace == namespaces["html"] and name in voidElements:
                 for token in self.emptyTag(namespace, name, converted_attribs,
-                                           not next or next[0] != END
-                                           or next[1] != tag):
+                                           not next or next[0] != END or
+                                           next[1] != tag):
                     yield token
             else:
                 yield self.startTag(namespace, name, converted_attribs)
@@ -48,7 +48,7 @@ class TreeWalker(_base.TreeWalker):
         elif kind == END:
             name = data.localname
             namespace = data.namespace
-            if name not in voidElements:
+            if namespace != namespaces["html"] or name not in voidElements:
                 yield self.endTag(namespace, name)

         elif kind == COMMENT: