Merge pull request #872 from JackDandy/feature/UpdateHtml5lib

Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72).
Authored by JackDandy on 2017-02-01 03:46:23 +00:00; committed by GitHub
commit 25566998ca
34 changed files with 1684 additions and 1283 deletions

View file

@ -13,6 +13,7 @@
* Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776)
* Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30)
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
[develop changelog]

View file

@ -22,4 +22,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this
__version__ = "0.99999999-dev"
__version__ = "0.9999999999-dev"

View file

@ -175,9 +175,9 @@ def escapeRegexp(string):
return string
# output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
@ -186,7 +186,7 @@ nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars=None,
def __init__(self,
dropXmlnsLocalName=False,
dropXmlnsAttrNs=False,
preventDoubleDashComments=False,
@ -217,7 +217,7 @@ class InfosetFilter(object):
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
def coerceElement(self, name):
return self.toXmlName(name)
def coerceComment(self, data):
@ -232,7 +232,7 @@ class InfosetFilter(object):
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
for i in range(data.count("\x0C")):
for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ")
# Other non-xml characters
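
For context, InfosetFilter is the helper that coerces HTML names that are illegal in XML, and the signature changes above only drop unused parameters. A minimal sketch of its (unchanged) behaviour, assuming the module keeps the _ihatexml name used by this release; the exact escape text is illustrative:

    from html5lib._ihatexml import InfosetFilter

    f = InfosetFilter()
    # A space is not a valid XML name character, so it is replaced with a
    # 'U' + 5-hex-digit escape and a DataLossWarning is emitted.
    print(f.coerceElement("foo bar"))   # e.g. 'fooU00020bar'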

View file

@ -1,14 +1,16 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six import text_type, binary_type
from six.moves import http_client, urllib
import codecs
import re
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException
from . import utils
from .constants import ReparseException
from . import _utils
from io import StringIO
@ -17,12 +19,6 @@ try:
except ImportError:
BytesIO = StringIO
try:
from io import BufferedIOBase
except ImportError:
class BufferedIOBase(object):
pass
# Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@ -30,15 +26,17 @@ asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if utils.supports_lone_surrogates:
if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal
# eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
eval('"\\uD800-\\uDFFF"'))
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
@ -130,7 +128,7 @@ class BufferedStream(object):
return b"".join(rv)
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
@ -144,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)
if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object):
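
In practice the new **kwargs signature means the encoding hints travel from the public entry points straight down to the input stream, and any *_encoding keyword is rejected for already-decoded text. A hedged sketch of the calling convention (keyword names taken from the hunks above; the forwarding assumes the matching html5parser change in this update):

    import html5lib

    # Bytes input: a transport-level charset (e.g. from an HTTP header) can be
    # forwarded and slots into the precedence chain right after the BOM check.
    doc = html5lib.parse(b"<p>\xc3\xa9</p>", transport_encoding="utf-8")

    # Unicode input: any *_encoding keyword now raises TypeError, mirroring the
    # check in HTMLInputStream above.
    try:
        html5lib.parse("<p>already decoded</p>", override_encoding="utf-8")
    except TypeError as err:
        print(err)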
@ -175,27 +174,21 @@ class HTMLUnicodeInputStream(object):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
if not utils.supports_lone_surrogates:
if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
# List of where new lines occur
self.newLines = [0]
self.charEncoding = ("utf-8", "certain")
self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source)
self.reset()
@ -288,10 +281,7 @@ class HTMLUnicodeInputStream(object):
if self.reportCharacterErrors:
self.reportCharacterErrors(data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
@ -301,7 +291,7 @@ class HTMLUnicodeInputStream(object):
return True
def characterErrorsUCS4(self, data):
for i in range(len(invalid_unicode_re.findall(data))):
for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
@ -314,9 +304,9 @@ class HTMLUnicodeInputStream(object):
codepoint = ord(match.group())
pos = match.start()
# Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos + 2]):
if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
@ -399,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
"""
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -412,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
@ -421,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
HTMLUnicodeInputStream.__init__(self, self.rawStream)
self.charEncoding = (codecName(encoding), "certain")
# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 512
self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
# Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None
# Call superclass
self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
HTMLUnicodeInputStream.reset(self)
def openStream(self, source):
@ -458,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
try:
stream.seek(stream.tell())
except:
except: # pylint:disable=bare-except
stream = BufferedStream(stream)
return stream
def detectEncoding(self, parseMeta=True, chardet=True):
# First look for a BOM
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
# If there is no BOM need to look for meta elements with encoding
# information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding
# If we've been overriden, we've been overriden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available
if encoding is None and chardet:
confidence = "tentative"
if chardet:
try:
try:
from charade.universaldetector import UniversalDetector
except ImportError:
from chardet.universaldetector import UniversalDetector
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
@ -491,36 +503,33 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = self.defaultEncoding
if encoding is not None:
return encoding, "tentative"
# Substitute for equivalent encodings:
encodingSub = {"iso-8859-1": "windows-1252"}
# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding, confidence
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"
def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
newEncoding = lookupEncoding(newEncoding)
if newEncoding is None:
return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
self.reset()
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self):
@ -529,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
}
# Go to beginning of file and read in 4 bytes
@ -550,9 +559,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
if encoding:
self.rawStream.seek(seek)
return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
@ -563,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = lookupEncoding("utf-8")
return encoding
@ -578,6 +590,7 @@ class EncodingBytes(bytes):
return bytes.__new__(self, value.lower())
def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1
def __iter__(self):
@ -688,7 +701,7 @@ class EncodingParser(object):
(b"<!", self.handleOther),
(b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag))
for byte in self.data:
for _ in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key):
@ -727,7 +740,7 @@ class EncodingParser(object):
return False
elif attr[0] == b"charset":
tentativeEncoding = attr[1]
codec = codecName(tentativeEncoding)
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
@ -735,7 +748,7 @@ class EncodingParser(object):
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None:
codec = codecName(tentativeEncoding)
codec = lookupEncoding(tentativeEncoding)
if codec is not None:
if hasPragma:
self.encoding = codec
@ -892,16 +905,19 @@ class ContentAttrParser(object):
return None
def codecName(encoding):
def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if isinstance(encoding, bytes):
if isinstance(encoding, binary_type):
try:
encoding = encoding.decode("ascii")
except UnicodeDecodeError:
return None
if encoding:
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
if encoding is not None:
try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else:
return None
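
The hand-rolled encodings table (removed from constants.py further down) is replaced by WHATWG label lookup through the webencodings package, and determineEncoding() now walks the spec's precedence order: BOM, override, transport, <meta>, same-origin parent, likely encoding, chardet, default. A small sketch of what lookupEncoding() now resolves, assuming webencodings is importable:

    import webencodings

    print(webencodings.lookup("latin1").name)    # 'windows-1252' per the WHATWG label table
    print(webencodings.lookup(" UTF-8 ").name)   # 'utf-8'; labels are trimmed and case-folded
    print(webencodings.lookup("bogus-label"))    # None, which lookupEncoding() passes through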

View file

@ -1,9 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
try:
chr = unichr # flake8: noqa
except NameError:
pass
from six import unichr as chr
from collections import deque
@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters
from .inputstream import HTMLInputStream
from ._inputstream import HTMLInputStream
from .trie import Trie
from ._trie import Trie
entitiesTrie = Trie(entities)
@ -34,16 +31,11 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object.
"""
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
def __init__(self, stream, parser=None, **kwargs):
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser
# Perform case conversions?
self.lowercaseElementName = lowercaseElementName
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state
self.escapeFlag = False
self.lastFourChars = []
@ -147,8 +139,8 @@ class HTMLTokenizer(object):
output = "&"
charStack = [self.stream.char()]
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
or (allowedChar is not None and allowedChar == charStack[0])):
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
(allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == "#":
@ -235,8 +227,7 @@ class HTMLTokenizer(object):
token = self.currentToken
# Add token to the queue to be yielded
if (token["type"] in tagTokenTypes):
if self.lowercaseElementName:
token["name"] = token["name"].translate(asciiUpper2Lower)
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["EndTag"]:
if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"],
@ -921,10 +912,9 @@ class HTMLTokenizer(object):
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
if self.lowercaseAttrName:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, value in self.currentToken["data"][:-1]:
self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, _ in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"duplicate-attribute"})
@ -1716,11 +1706,11 @@ class HTMLTokenizer(object):
else:
data.append(char)
data = "".join(data)
data = "".join(data) # pylint:disable=redefined-variable-type
# Deal with null here rather than in the parser
nullCount = data.count("\u0000")
if nullCount > 0:
for i in range(nullCount):
for _ in range(nullCount):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data": "invalid-codepoint"})
data = data.replace("\u0000", "\uFFFD")
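
With the lowercaseElementName/lowercaseAttrName switches removed, the tokenizer always ASCII-lowercases tag and attribute names, and its remaining keyword arguments are just the encoding hints forwarded to HTMLInputStream. A standalone sketch (the _tokenizer module path is assumed for this release):

    from html5lib._tokenizer import HTMLTokenizer

    for token in HTMLTokenizer('<DIV Class="x">Hi</DIV>'):
        # Tag tokens now always carry lowercased names such as 'div';
        # non-tag tokens (e.g. Characters) simply have no "name" key.
        print(token["type"], token.get("name"))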

View file

@ -4,9 +4,11 @@ from .py import Trie as PyTrie
Trie = PyTrie
# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
# pylint:enable=wrong-import-position
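
The entity-matching Trie now prefers the C-accelerated datrie package when it happens to be installed and silently falls back to the pure-Python implementation otherwise; either way the interface the tokenizer relies on is the same. A small sketch (module path assumed for this release):

    from html5lib._trie import Trie

    t = Trie({"amp": "&", "ampersand": "&"})
    # longest_prefix() is what the tokenizer's entity matching builds on.
    print(t.longest_prefix("ampersand;"))   # 'ampersand'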

View file

@ -7,7 +7,8 @@ class Trie(Mapping):
"""Abstract base class for tries"""
def keys(self, prefix=None):
keys = super().keys()
# pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None:
return set(keys)

View file

@ -1,5 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
import sys
from types import ModuleType
from six import text_type
@ -12,9 +13,11 @@ except ImportError:
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]
"supports_lone_surrogates", "PY27"]
PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as
@ -22,12 +25,12 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
# surrogates, and there is no mechanism to further escape such
# escapes.
try:
_x = eval('"\\uD800"')
_x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"')
_x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type)
except:
except: # pylint:disable=bare-except
supports_lone_surrogates = False
else:
supports_lone_surrogates = True
@ -52,12 +55,13 @@ class MethodDispatcher(dict):
# anything here.
_dictEntries = []
for name, value in items:
if type(name) in (list, tuple, frozenset, set):
if isinstance(name, (list, tuple, frozenset, set)):
for item in name:
_dictEntries.append((item, value))
else:
_dictEntries.append((name, value))
dict.__init__(self, _dictEntries)
assert len(self) == len(_dictEntries)
self.default = None
def __getitem__(self, key):
@ -109,3 +113,15 @@ def moduleFactoryFactory(factory):
return mod
return moduleFactory
def memoize(func):
cache = {}
def wrapped(*args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
if key not in cache:
cache[key] = func(*args, **kwargs)
return cache[key]
return wrapped
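
The new memoize helper is a plain dict-backed cache keyed on the call's positional and keyword arguments, so those must be hashable. A usage sketch, assuming the module keeps the _utils name:

    from html5lib._utils import memoize

    @memoize
    def normalise(label):
        print("computing", label)
        return label.strip().lower()

    normalise(" UTF-8 ")   # computed once and cached
    normalise(" UTF-8 ")   # served from the cache; no second "computing" line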

View file

@ -283,6 +283,12 @@ E = {
"Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html":
"Unexpected end tag (%(name)s) before html.",
"unexpected-inhead-noscript-tag":
"Element %(name)s not allowed in a inhead-noscript context",
"eof-in-head-noscript":
"Unexpected end of file. Expected inhead-noscript content",
"char-in-head-noscript":
"Unexpected non-space character. Expected inhead-noscript content",
"XXX-undefined-error":
"Undefined error (this sucks and should be fixed)",
}
@ -431,6 +437,73 @@ mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mtext")
])
adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
@ -2813,7 +2886,6 @@ replacementCharacters = {
0x0d: "\u000D",
0x80: "\u20AC",
0x81: "\u0081",
0x81: "\u0081",
0x82: "\u201A",
0x83: "\u0192",
0x84: "\u201E",
@ -2846,235 +2918,6 @@ replacementCharacters = {
0x9F: "\u0178",
}
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype": 0,
"Characters": 1,

View file

@ -1,6 +1,6 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
try:
from collections import OrderedDict
@ -8,9 +8,9 @@ except ImportError:
from ordereddict import OrderedDict
class Filter(_base.Filter):
class Filter(base.Filter):
def __iter__(self):
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict()
for name, value in sorted(token["data"].items(),

View file

@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
class Filter(_base.Filter):
class Filter(base.Filter):
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
@ -13,7 +13,7 @@ class Filter(_base.Filter):
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":

View file

@ -1,90 +1,81 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from ..constants import cdataElements, rcdataElements, voidElements
from six import text_type
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters)
class LintError(Exception):
pass
class Filter(base.Filter):
def __init__(self, source, require_matching_tags=True):
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
if not isinstance(name, str):
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
raise LintError("Empty tag name")
if type == "StartTag" and name in voidElements:
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
elif type == "EmptyTag" and name not in voidElements:
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, str):
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
if not name:
raise LintError("Empty attribute name")
if not isinstance(value, str):
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(token["data"], dict)
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert type == "EmptyTag"
else:
assert type == "StartTag"
if type == "StartTag" and self.require_matching_tags:
open_elements.append((namespace, name))
for (namespace, name), value in token["data"].items():
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
assert isinstance(value, text_type)
elif type == "EndTag":
namespace = token["namespace"]
name = token["name"]
if not isinstance(name, str):
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
if not name:
raise LintError("Empty tag name")
if name in voidElements:
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
start_name = open_elements.pop()
if start_name != name:
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
contentModelFlag = "PCDATA"
assert namespace is None or isinstance(namespace, text_type)
assert namespace != ""
assert isinstance(name, text_type)
assert name != ""
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
elif self.require_matching_tags:
start = open_elements.pop()
assert start == (namespace, name)
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError("Comment not in PCDATA content model flag")
data = token["data"]
assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, str):
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
if not data:
raise LintError("%(type)s token with empty data" % {"type": type})
assert isinstance(data, text_type)
assert data != ""
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
assert data.strip(spaceCharacters) == ""
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
if not isinstance(name, str):
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ?
assert name is None or isinstance(name, text_type)
assert token["publicId"] is None or isinstance(name, text_type)
assert token["systemId"] is None or isinstance(name, text_type)
elif type in ("ParseError", "SerializeError"):
pass
elif type == "Entity":
assert isinstance(token["name"], text_type)
elif type == "SerializerError":
assert isinstance(token["data"], text_type)
else:
raise LintError("Unknown token type: %(type)s" % {"type": type})
assert False, "Unknown token type: %(type)s" % {"type": type}
yield token
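
The rewritten lint filter swaps LintError for plain assertions over the (namespace, name) token structure and gains a require_matching_tags switch. A sketch of chaining it into a serializer pipeline using only public entry points; the assertions fire only if the token stream is malformed:

    import html5lib
    from html5lib.filters.lint import Filter as LintFilter
    from html5lib.serializer import HTMLSerializer

    tree = html5lib.parse("<p class=a>hello</p>")
    walker = html5lib.getTreeWalker("etree")
    tokens = LintFilter(walker(tree), require_matching_tags=True)
    print(HTMLSerializer().render(tokens))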

View file

@ -1,9 +1,9 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from . import base
class Filter(_base.Filter):
class Filter(base.Filter):
def slider(self):
previous1 = previous2 = None
for token in self.source:
@ -11,7 +11,8 @@ class Filter(_base.Filter):
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
if previous1 is not None:
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():

View file

@ -1,12 +1,865 @@
from __future__ import absolute_import, division, unicode_literals
from . import _base
from ..sanitizer import HTMLSanitizerMixin
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
class Filter(_base.Filter, HTMLSanitizerMixin):
allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
(None, 'accept-charset'),
(None, 'accesskey'),
(None, 'action'),
(None, 'align'),
(None, 'alt'),
(None, 'autocomplete'),
(None, 'autofocus'),
(None, 'axis'),
(None, 'background'),
(None, 'balance'),
(None, 'bgcolor'),
(None, 'bgproperties'),
(None, 'border'),
(None, 'bordercolor'),
(None, 'bordercolordark'),
(None, 'bordercolorlight'),
(None, 'bottompadding'),
(None, 'cellpadding'),
(None, 'cellspacing'),
(None, 'ch'),
(None, 'challenge'),
(None, 'char'),
(None, 'charoff'),
(None, 'choff'),
(None, 'charset'),
(None, 'checked'),
(None, 'cite'),
(None, 'class'),
(None, 'clear'),
(None, 'color'),
(None, 'cols'),
(None, 'colspan'),
(None, 'compact'),
(None, 'contenteditable'),
(None, 'controls'),
(None, 'coords'),
(None, 'data'),
(None, 'datafld'),
(None, 'datapagesize'),
(None, 'datasrc'),
(None, 'datetime'),
(None, 'default'),
(None, 'delay'),
(None, 'dir'),
(None, 'disabled'),
(None, 'draggable'),
(None, 'dynsrc'),
(None, 'enctype'),
(None, 'end'),
(None, 'face'),
(None, 'for'),
(None, 'form'),
(None, 'frame'),
(None, 'galleryimg'),
(None, 'gutter'),
(None, 'headers'),
(None, 'height'),
(None, 'hidefocus'),
(None, 'hidden'),
(None, 'high'),
(None, 'href'),
(None, 'hreflang'),
(None, 'hspace'),
(None, 'icon'),
(None, 'id'),
(None, 'inputmode'),
(None, 'ismap'),
(None, 'keytype'),
(None, 'label'),
(None, 'leftspacing'),
(None, 'lang'),
(None, 'list'),
(None, 'longdesc'),
(None, 'loop'),
(None, 'loopcount'),
(None, 'loopend'),
(None, 'loopstart'),
(None, 'low'),
(None, 'lowsrc'),
(None, 'max'),
(None, 'maxlength'),
(None, 'media'),
(None, 'method'),
(None, 'min'),
(None, 'multiple'),
(None, 'name'),
(None, 'nohref'),
(None, 'noshade'),
(None, 'nowrap'),
(None, 'open'),
(None, 'optimum'),
(None, 'pattern'),
(None, 'ping'),
(None, 'point-size'),
(None, 'poster'),
(None, 'pqg'),
(None, 'preload'),
(None, 'prompt'),
(None, 'radiogroup'),
(None, 'readonly'),
(None, 'rel'),
(None, 'repeat-max'),
(None, 'repeat-min'),
(None, 'replace'),
(None, 'required'),
(None, 'rev'),
(None, 'rightspacing'),
(None, 'rows'),
(None, 'rowspan'),
(None, 'rules'),
(None, 'scope'),
(None, 'selected'),
(None, 'shape'),
(None, 'size'),
(None, 'span'),
(None, 'src'),
(None, 'start'),
(None, 'step'),
(None, 'style'),
(None, 'summary'),
(None, 'suppress'),
(None, 'tabindex'),
(None, 'target'),
(None, 'template'),
(None, 'title'),
(None, 'toppadding'),
(None, 'type'),
(None, 'unselectable'),
(None, 'usemap'),
(None, 'urn'),
(None, 'valign'),
(None, 'value'),
(None, 'variable'),
(None, 'volume'),
(None, 'vspace'),
(None, 'vrml'),
(None, 'width'),
(None, 'wrap'),
(namespaces['xml'], 'lang'),
# MathML attributes
(None, 'actiontype'),
(None, 'align'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnlines'),
(None, 'columnspacing'),
(None, 'columnspan'),
(None, 'depth'),
(None, 'display'),
(None, 'displaystyle'),
(None, 'equalcolumns'),
(None, 'equalrows'),
(None, 'fence'),
(None, 'fontstyle'),
(None, 'fontweight'),
(None, 'frame'),
(None, 'height'),
(None, 'linethickness'),
(None, 'lspace'),
(None, 'mathbackground'),
(None, 'mathcolor'),
(None, 'mathvariant'),
(None, 'mathvariant'),
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowlines'),
(None, 'rowspacing'),
(None, 'rowspan'),
(None, 'rspace'),
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'type'),
# SVG attributes
(None, 'accent-height'),
(None, 'accumulate'),
(None, 'additive'),
(None, 'alphabetic'),
(None, 'arabic-form'),
(None, 'ascent'),
(None, 'attributeName'),
(None, 'attributeType'),
(None, 'baseProfile'),
(None, 'bbox'),
(None, 'begin'),
(None, 'by'),
(None, 'calcMode'),
(None, 'cap-height'),
(None, 'class'),
(None, 'clip-path'),
(None, 'color'),
(None, 'color-rendering'),
(None, 'content'),
(None, 'cx'),
(None, 'cy'),
(None, 'd'),
(None, 'dx'),
(None, 'dy'),
(None, 'descent'),
(None, 'display'),
(None, 'dur'),
(None, 'end'),
(None, 'fill'),
(None, 'fill-opacity'),
(None, 'fill-rule'),
(None, 'font-family'),
(None, 'font-size'),
(None, 'font-stretch'),
(None, 'font-style'),
(None, 'font-variant'),
(None, 'font-weight'),
(None, 'from'),
(None, 'fx'),
(None, 'fy'),
(None, 'g1'),
(None, 'g2'),
(None, 'glyph-name'),
(None, 'gradientUnits'),
(None, 'hanging'),
(None, 'height'),
(None, 'horiz-adv-x'),
(None, 'horiz-origin-x'),
(None, 'id'),
(None, 'ideographic'),
(None, 'k'),
(None, 'keyPoints'),
(None, 'keySplines'),
(None, 'keyTimes'),
(None, 'lang'),
(None, 'marker-end'),
(None, 'marker-mid'),
(None, 'marker-start'),
(None, 'markerHeight'),
(None, 'markerUnits'),
(None, 'markerWidth'),
(None, 'mathematical'),
(None, 'max'),
(None, 'min'),
(None, 'name'),
(None, 'offset'),
(None, 'opacity'),
(None, 'orient'),
(None, 'origin'),
(None, 'overline-position'),
(None, 'overline-thickness'),
(None, 'panose-1'),
(None, 'path'),
(None, 'pathLength'),
(None, 'points'),
(None, 'preserveAspectRatio'),
(None, 'r'),
(None, 'refX'),
(None, 'refY'),
(None, 'repeatCount'),
(None, 'repeatDur'),
(None, 'requiredExtensions'),
(None, 'requiredFeatures'),
(None, 'restart'),
(None, 'rotate'),
(None, 'rx'),
(None, 'ry'),
(None, 'slope'),
(None, 'stemh'),
(None, 'stemv'),
(None, 'stop-color'),
(None, 'stop-opacity'),
(None, 'strikethrough-position'),
(None, 'strikethrough-thickness'),
(None, 'stroke'),
(None, 'stroke-dasharray'),
(None, 'stroke-dashoffset'),
(None, 'stroke-linecap'),
(None, 'stroke-linejoin'),
(None, 'stroke-miterlimit'),
(None, 'stroke-opacity'),
(None, 'stroke-width'),
(None, 'systemLanguage'),
(None, 'target'),
(None, 'text-anchor'),
(None, 'to'),
(None, 'transform'),
(None, 'type'),
(None, 'u1'),
(None, 'u2'),
(None, 'underline-position'),
(None, 'underline-thickness'),
(None, 'unicode'),
(None, 'unicode-range'),
(None, 'units-per-em'),
(None, 'values'),
(None, 'version'),
(None, 'viewBox'),
(None, 'visibility'),
(None, 'width'),
(None, 'widths'),
(None, 'x'),
(None, 'x-height'),
(None, 'x1'),
(None, 'x2'),
(namespaces['xlink'], 'actuate'),
(namespaces['xlink'], 'arcrole'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'role'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'title'),
(namespaces['xlink'], 'type'),
(namespaces['xml'], 'base'),
(namespaces['xml'], 'lang'),
(namespaces['xml'], 'space'),
(None, 'y'),
(None, 'y1'),
(None, 'y2'),
(None, 'zoomAndPan'),
))
attr_val_is_uri = frozenset((
(None, 'href'),
(None, 'src'),
(None, 'cite'),
(None, 'action'),
(None, 'longdesc'),
(None, 'poster'),
(None, 'background'),
(None, 'datasrc'),
(None, 'dynsrc'),
(None, 'lowsrc'),
(None, 'ping'),
(namespaces['xlink'], 'href'),
(namespaces['xml'], 'base'),
))
svg_attr_val_allows_ref = frozenset((
(None, 'clip-path'),
(None, 'color-profile'),
(None, 'cursor'),
(None, 'fill'),
(None, 'filter'),
(None, 'marker'),
(None, 'marker-start'),
(None, 'marker-mid'),
(None, 'marker-end'),
(None, 'mask'),
(None, 'stroke'),
))
svg_allow_local_href = frozenset((
(None, 'altGlyph'),
(None, 'animate'),
(None, 'animateColor'),
(None, 'animateMotion'),
(None, 'animateTransform'),
(None, 'cursor'),
(None, 'feImage'),
(None, 'filter'),
(None, 'linearGradient'),
(None, 'pattern'),
(None, 'radialGradient'),
(None, 'textpath'),
(None, 'tref'),
(None, 'set'),
(None, 'use')
))
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
))
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'!important',
'italic',
'left',
'lime',
'maroon',
'medium',
'none',
'navy',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'solid',
'silver',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
))
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-width',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
))
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
'https',
'irc',
'mailto',
'news',
'gopher',
'nntp',
'telnet',
'webcal',
'xmpp',
'callto',
'feed',
'urn',
'aim',
'rsync',
'tag',
'ssh',
'sftp',
'rtsp',
'afs',
'data',
))
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
'image/webp',
'image/bmp',
'text/plain',
))
data_content_type = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
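A quick, hedged illustration of what the regexp above accepts when allowed_token() later matches it against the path of a data: URI (the import path html5lib.filters.sanitizer and the sample URIs are assumptions, not part of the diff):

    from html5lib.filters.sanitizer import data_content_type

    # Path portion of e.g. data:image/png;base64,iVBORw0KGgo=
    m = data_content_type.match("image/png;base64,iVBORw0KGgo=")
    assert m and m.group("content_type") == "image/png"

    # A charset parameter is also tolerated; whether the type is actually
    # allowed is checked later against allowed_content_types.
    assert data_content_type.match("text/plain;charset=utf-8,hello")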
class Filter(base.Filter):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
def __init__(self,
source,
allowed_elements=allowed_elements,
allowed_attributes=allowed_attributes,
allowed_css_properties=allowed_css_properties,
allowed_css_keywords=allowed_css_keywords,
allowed_svg_properties=allowed_svg_properties,
allowed_protocols=allowed_protocols,
allowed_content_types=allowed_content_types,
attr_val_is_uri=attr_val_is_uri,
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
svg_allow_local_href=svg_allow_local_href):
super(Filter, self).__init__(source)
self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties
self.allowed_css_keywords = allowed_css_keywords
self.allowed_svg_properties = allowed_svg_properties
self.allowed_protocols = allowed_protocols
self.allowed_content_types = allowed_content_types
self.attr_val_is_uri = attr_val_is_uri
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
self.svg_allow_local_href = svg_allow_local_href
def __iter__(self):
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in ("StartTag", "EndTag", "EmptyTag"):
name = token["name"]
namespace = token["namespace"]
if ((namespace, name) in self.allowed_elements or
(namespace is None and
(namespaces["html"], name) in self.allowed_elements)):
return self.allowed_token(token)
else:
return self.disallowed_token(token)
elif token_type == "Comment":
pass
else:
return token
def allowed_token(self, token):
if "data" in token:
attrs = token["data"]
attr_names = set(attrs.keys())
# Remove forbidden attributes
for to_remove in (attr_names - self.allowed_attributes):
del token["data"][to_remove]
attr_names.remove(to_remove)
# Remove attributes with disallowed URL values
for attr in (attr_names & self.attr_val_is_uri):
assert attr in attrs
# I don't have a clue where this regexp comes from or why it matches those
# characters, nor why we call unescape. I just know it's always been here.
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
# this will do is remove *more* than it otherwise would.
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = data_content_type.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
(namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespaces['xlink'], 'href')]
if (None, 'style') in attrs:
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
token["data"] = attrs
return token
def disallowed_token(self, token):
token_type = token["type"]
if token_type == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
assert token_type in ("StartTag", "EmptyTag")
attrs = []
for (ns, name), v in token["data"].items():
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.allowed_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
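Putting the pieces above together, the Filter is designed to sit between a tree walker and the serializer. A minimal usage sketch, assuming the filter module is importable as html5lib.filters.sanitizer and reusing the markup from the sanitize_html example in the comments above:

    import html5lib
    from html5lib.filters import sanitizer
    from html5lib.serializer import HTMLSerializer

    dirty = '<a href="javascript: sucker();">Click here for $100</a>'
    dom = html5lib.parse(dirty, treebuilder="dom")

    # Walk the parsed tree, sanitize the token stream, then re-serialize.
    walker = html5lib.getTreeWalker("dom")
    clean = HTMLSerializer().render(sanitizer.Filter(walker(dom)))
    # The javascript: href should be dropped because its scheme is not in
    # allowed_protocols, leaving a bare <a> element as described above.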

View file

@@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
import re
from . import _base
from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(_base.Filter):
class Filter(base.Filter):
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in _base.Filter.__iter__(self):
for token in base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):

View file

@@ -1,39 +1,44 @@
from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass
from six import with_metaclass, viewkeys, PY3
import types
from . import inputstream
from . import tokenizer
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from . import _inputstream
from . import _tokenizer
from . import treebuilders
from .treebuilders._base import Marker
from .treebuilders.base import Marker
from . import utils
from . import constants
from .constants import spaceCharacters, asciiUpper2Lower
from .constants import specialElements
from .constants import headingElements
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
from .constants import E
from . import _utils
from .constants import (
spaceCharacters, asciiUpper2Lower,
specialElements, headingElements, cdataElements, rcdataElements,
tokenTypes, tagTokenTypes,
namespaces,
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
adjustForeignAttributes as adjustForeignAttributesMap,
adjustMathMLAttributes, adjustSVGAttributes,
E,
ReparseException
)
def parse(doc, treebuilder="etree", encoding=None,
namespaceHTMLElements=True):
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding)
return p.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
namespaceHTMLElements=True):
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, encoding=encoding)
return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function):
@@ -52,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
strict=False, namespaceHTMLElements=True, debug=False):
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
tokenizer - a class that provides a stream of tokens to the treebuilder.
This may be replaced for e.g. a sanitizer which converts some tags to
text
"""
# Raise an exception on the first error encountered
@@ -72,29 +72,24 @@ class HTMLParser(object):
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.tokenizer_class = tokenizer
self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])
def _parse(self, stream, innerHTML=False, container="div",
encoding=None, parseMeta=True, useChardet=True, **kwargs):
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
self.innerHTMLMode = innerHTML
self.container = container
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
parseMeta=parseMeta,
useChardet=useChardet,
parser=self, **kwargs)
self.scripting = scripting
self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()
while True:
try:
self.mainLoop()
break
except ReparseException:
self.reset()
try:
self.mainLoop()
except ReparseException:
self.reset()
self.mainLoop()
def reset(self):
self.tree.reset()
@@ -121,7 +116,7 @@ class HTMLParser(object):
self.phase.insertHtmlElement()
self.resetInsertionMode()
else:
self.innerHTML = False
self.innerHTML = False # pylint:disable=redefined-variable-type
self.phase = self.phases["initial"]
self.lastPhase = None
@@ -139,7 +134,7 @@ class HTMLParser(object):
"""
if not hasattr(self, 'tokenizer'):
return None
return self.tokenizer.stream.charEncoding[0]
return self.tokenizer.stream.charEncoding[0].name
def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and
@@ -164,8 +159,10 @@ class HTMLParser(object):
ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens():
prev_token = None
new_token = token
while new_token is not None:
prev_token = new_token
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None
currentNodeName = currentNode.name if currentNode else None
@@ -184,6 +181,7 @@ class HTMLParser(object):
type in (CharactersToken, SpaceCharactersToken))) or
(currentNodeNamespace == namespaces["mathml"] and
currentNodeName == "annotation-xml" and
type == StartTagToken and
token["name"] == "svg") or
(self.isHTMLIntegrationPoint(currentNode) and
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
@@ -204,10 +202,10 @@ class HTMLParser(object):
elif type == DoctypeToken:
new_token = phase.processDoctype(new_token)
if (type == StartTagToken and token["selfClosing"]
and not token["selfClosingAcknowledged"]):
if (type == StartTagToken and prev_token["selfClosing"] and
not prev_token["selfClosingAcknowledged"]):
self.parseError("non-void-element-with-trailing-solidus",
{"name": token["name"]})
{"name": prev_token["name"]})
# When the loop finishes it's EOF
reprocess = True
@@ -222,7 +220,7 @@ class HTMLParser(object):
for token in self.tokenizer:
yield self.normalizeToken(token)
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
@@ -231,13 +229,13 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, innerHTML=False, encoding=encoding,
parseMeta=parseMeta, useChardet=useChardet)
self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()
def parseFragment(self, stream, container="div", encoding=None,
parseMeta=False, useChardet=True):
def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
@@ -249,12 +247,16 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, True, container=container, encoding=encoding)
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars={}):
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory.
if datavars is None:
datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
raise ParseError(E[errorcode] % datavars)
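Because the new scripting flag travels through the **kwargs plumbing shown above, callers can enable it per parse. A small sketch under the assumption that the public html5lib.parse() entry point forwards keyword arguments as in this diff (the document string is illustrative):

    import html5lib

    doc = "<!doctype html><html><head><noscript><p>no js</p></noscript></head><body></body></html>"

    # Default: scripting is off, so noscript content is parsed as markup
    # (the new inHeadNoscript phase handles it).
    tree = html5lib.parse(doc)

    # With scripting enabled, noscript is treated as raw text, mirroring a
    # browser that has JavaScript turned on.
    tree_scripting = html5lib.parse(doc, scripting=True)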
@@ -263,98 +265,25 @@ class HTMLParser(object):
""" HTML5 specific normalizations to the token stream """
if token["type"] == tokenTypes["StartTag"]:
token["data"] = dict(token["data"][::-1])
raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
return token
def adjustMathMLAttributes(self, token):
replacements = {"definitionurl": "definitionURL"}
for k, v in replacements.items():
if k in token["data"]:
token["data"][v] = token["data"][k]
del token["data"][k]
adjust_attributes(token, adjustMathMLAttributes)
def adjustSVGAttributes(self, token):
replacements = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
for originalName in list(token["data"].keys()):
if originalName in replacements:
svgName = replacements[originalName]
token["data"][svgName] = token["data"][originalName]
del token["data"][originalName]
adjust_attributes(token, adjustSVGAttributes)
def adjustForeignAttributes(self, token):
replacements = adjustForeignAttributesMap
for originalName in token["data"].keys():
if originalName in replacements:
foreignName = replacements[originalName]
token["data"][foreignName] = token["data"][originalName]
del token["data"][originalName]
adjust_attributes(token, adjustForeignAttributesMap)
def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
self.parser.phase()
def resetInsertionMode(self):
@@ -419,11 +348,12 @@ class HTMLParser(object):
self.phase = self.phases["text"]
@_utils.memoize
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in
constants.tokenTypes.items())
tokenTypes.items())
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
@@ -432,7 +362,7 @@ def getPhases(debug):
info = {"type": type_names[token['type']]}
except:
raise
if token['type'] in constants.tagTokenTypes:
if token['type'] in tagTokenTypes:
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
@@ -451,6 +381,7 @@ def getPhases(debug):
else:
return type
# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
@@ -517,77 +448,76 @@ def getPhases(debug):
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html"
or publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//"))
or publicId in
("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
"html")
or publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is None
or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
if (not correct or token["name"] != "html" or
publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) or
publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
"html") or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is None or
systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
"-//w3c//dtd xhtml 1.0 transitional//"))
or publicId.startswith(
"-//w3c//dtd xhtml 1.0 transitional//")) or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
@@ -660,13 +590,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@@ -706,10 +636,11 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("title", self.startTagTitle),
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
(("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand),
@@ -718,7 +649,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self. endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr)
])
@@ -760,18 +691,25 @@ def getPhases(debug):
# the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as an ASCII-superset works.
data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = inputstream.ContentAttrParser(data)
data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = _inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoScriptNoFramesStyle(self, token):
def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
def startTagScript(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
@@ -797,15 +735,75 @@ def getPhases(debug):
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
# XXX If we implement a parser for which scripting is disabled we need to
# implement this phase.
#
# class InHeadNoScriptPhase(Phase):
class InHeadNoscriptPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
class AfterHeadPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
@@ -815,8 +813,8 @@ def getPhases(debug):
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)])
self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
@@ -874,10 +872,10 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
# Keep a ref to this for special handling of whitespace in <pre>
self.processSpaceCharactersNonPre = self.processSpaceCharacters
# Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
@@ -885,7 +883,7 @@ def getPhases(debug):
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
self.startTagCloseP),
@@ -911,7 +909,8 @@ def getPhases(debug):
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
(("noembed", "noframes", "noscript"), self.startTagRawtext),
("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
@@ -923,7 +922,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody),
("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
@@ -942,17 +941,9 @@ def getPhases(debug):
self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2):
if node1.name != node2.name or node1.namespace != node2.namespace:
return False
elif len(node1.attributes) != len(node2.attributes):
return False
else:
attributes1 = sorted(node1.attributes.items())
attributes2 = sorted(node2.attributes.items())
for attr1, attr2 in zip(attributes1, attributes2):
if attr1 != attr2:
return False
return True
return (node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)
# helper
def addFormattingElement(self, token):
@@ -988,8 +979,8 @@ def getPhases(debug):
data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and
self.tree.openElements[-1].name in ("pre", "listing", "textarea")
and not self.tree.openElements[-1].hasContent()):
self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.reconstructActiveFormattingElements()
@@ -1007,7 +998,7 @@ def getPhases(debug):
for char in token["data"]])):
self.parser.framesetOK = False
def processSpaceCharacters(self, token):
def processSpaceCharactersNonPre(self, token):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"])
@@ -1016,8 +1007,8 @@ def getPhases(debug):
def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"})
if (len(self.tree.openElements) == 1
or self.tree.openElements[1].name != "body"):
if (len(self.tree.openElements) == 1 or
self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML
else:
self.parser.framesetOK = False
@@ -1232,6 +1223,12 @@ def getPhases(debug):
self.parser.framesetOK = False
self.startTagRawtext(token)
def startTagNoscript(self, token):
if self.parser.scripting:
self.startTagRawtext(token)
else:
self.startTagOther(token)
def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
self.parser.parseRCDataRawtext(token, "RAWTEXT")
@@ -1595,9 +1592,9 @@ def getPhases(debug):
class TextPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([])
self.startTagHandler = _utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther
@@ -1629,7 +1626,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("caption", self.startTagCaption),
("colgroup", self.startTagColgroup),
@@ -1643,7 +1640,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
@@ -1820,14 +1817,14 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("caption", self.endTagCaption),
("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
@@ -1892,13 +1889,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("col", self.startTagCol)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("colgroup", self.endTagColgroup),
("col", self.endTagCol)
])
@@ -1926,6 +1923,7 @@ def getPhases(debug):
def startTagCol(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup()
@@ -1955,7 +1953,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell),
@@ -1964,7 +1962,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
@@ -2053,7 +2051,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
@@ -2061,7 +2059,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("tr", self.endTagTr),
("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
@@ -2142,14 +2140,14 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
(("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
@@ -2218,7 +2216,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("option", self.startTagOption),
("optgroup", self.startTagOptgroup),
@@ -2228,7 +2226,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("option", self.endTagOption),
("optgroup", self.endTagOptgroup),
("select", self.endTagSelect)
@@ -2318,13 +2316,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.startTagTable)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.endTagTable)
])
@@ -2445,7 +2443,7 @@ def getPhases(debug):
def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1]
if node.name != token["name"]:
if node.name.translate(asciiUpper2Lower) != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while True:
@@ -2472,12 +2470,12 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
@@ -2520,7 +2518,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("frameset", self.startTagFrameset),
("frame", self.startTagFrame),
@@ -2528,7 +2526,7 @@ def getPhases(debug):
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("frameset", self.endTagFrameset)
])
self.endTagHandler.default = self.endTagOther
@@ -2577,13 +2575,13 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
self.endTagHandler = _utils.MethodDispatcher([
("html", self.endTagHtml)
])
self.endTagHandler.default = self.endTagOther
@@ -2613,7 +2611,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
@@ -2651,7 +2649,7 @@ def getPhases(debug):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("noframes", self.startTagNoFrames)
])
@@ -2682,13 +2680,14 @@ def getPhases(debug):
def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]})
# pylint:enable=unused-argument
return {
"initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase,
"inHead": InHeadPhase,
# XXX "inHeadNoscript": InHeadNoScriptPhase,
"inHeadNoscript": InHeadNoscriptPhase,
"afterHead": AfterHeadPhase,
"inBody": InBodyPhase,
"text": TextPhase,
@@ -2711,6 +2710,16 @@ def getPhases(debug):
}
def adjust_attributes(token, replacements):
if PY3 or _utils.PY27:
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
else:
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items())
def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False):
if attributes is None:

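The module-level adjust_attributes() helper above replaces the three per-vocabulary renaming loops that this diff removes. A self-contained sketch of the same idea, with made-up token data:

    from collections import OrderedDict

    def adjust_attributes_sketch(token, replacements):
        # Rebuild the attribute dict only if some key needs renaming,
        # preserving the original attribute order.
        if set(token["data"]) & set(replacements):
            token["data"] = OrderedDict(
                (replacements.get(k, k), v) for k, v in token["data"].items())

    token = {"data": OrderedDict([("definitionurl", "u"), ("mathcolor", "red")])}
    adjust_attributes_sketch(token, {"definitionurl": "definitionURL"})
    # token["data"] is now OrderedDict([("definitionURL", "u"), ("mathcolor", "red")])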
View file

@@ -1,300 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
content_type_rgx = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs', 'data']
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in list(tokenTypes.keys()):
token_type = tokenTypes[token_type]
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
return self.allowed_token(token, token_type)
else:
return self.disallowed_token(token, token_type)
elif token_type == tokenTypes["Comment"]:
pass
else:
return token
def allowed_token(self, token, token_type):
if "data" in token:
attrs = dict([(name, val) for name, val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if attr not in attrs:
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = content_type_rgx.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if 'style' in attrs:
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name, val] for name, val in list(attrs.items())]
return token
def disallowed_token(self, token, token_type):
if token_type == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
token["data"] = "<%s%s>" % (token["name"], attrs)
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
if token["type"] in list(tokenTypes.keys()):
token["type"] = "Characters"
else:
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
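A rough illustration of what sanitize_css lets through (a minimal sketch; it instantiates the mixin directly purely for demonstration, which normal use of the library does not require):

    # hypothetical direct use of the mixin, for illustration only
    mixin = HTMLSanitizerMixin()
    mixin.sanitize_css("color: red; position: fixed")
    # -> 'color: red;'   ('position' is not in allowed_css_properties, so it is dropped)
    mixin.sanitize_css("width: expression(alert(1))")
    # -> ''              (the gauntlet regexes reject anything that is not plain CSS)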
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
# Change case matching defaults as we only output lowercase html anyway
# This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
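A minimal usage sketch for this tokenizer-based sanitizer (assuming it is importable as html5lib.sanitizer.HTMLSanitizer in this tree; later html5lib releases move sanitization into html5lib.filters.sanitizer.Filter instead):

    import html5lib
    from html5lib.sanitizer import HTMLSanitizer

    parser = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
    fragment = parser.parseFragment('<script>do_nasty_stuff()</script><b>ok</b>')
    # the <script> element is escaped into plain text by disallowed_token,
    # while <b> survives because it is in allowed_elements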

View file

@ -1,79 +1,87 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
try:
from functools import reduce
except ImportError:
pass
import re
from ..constants import voidElements, booleanAttributes, spaceCharacters
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils
from codecs import register_error, xmlcharrefreplace_errors
from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape
spaceCharacters = "".join(spaceCharacters)
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
encode_entity_map = {}
is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
_encode_entity_map = {}
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()):
# skip multi-character entities
if ((_is_ucs4 and len(v) > 1) or
(not _is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = _utils.surrogatePairToCodepoint(v)
else:
return xmlcharrefreplace_errors(exc)
v = ord(v)
if v not in _encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
_encode_entity_map[v] = k
register_error(unicode_encode_errors, htmlentityreplace_errors)
del register_error
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = _encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors)
def serialize(input, tree="etree", encoding=None, **serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts)
return s.render(walker(input), encoding)
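The module-level serialize() wrapper above can be exercised like this (a sketch; the output shown is approximate and the keyword options are the serializer's own):

    import html5lib

    doc = html5lib.parse("<p class=warning>Beware</p>")
    html5lib.serialize(doc, tree="etree", omit_optional_tags=False)
    # -> something like '<html><head></head><body><p class=warning>Beware</p></body></html>'
    # pass encoding="utf-8" to get bytes back instead of text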
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False
quote_attr_values = "legacy" # be secure by default
quote_char = '"'
use_best_quote_char = True
@ -109,9 +117,9 @@ class HTMLSerializer(object):
inject_meta_charset=True|False
Whether to insert a meta element to define the character set of the
document.
quote_attr_values=True|False
quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
@ -147,6 +155,9 @@ class HTMLSerializer(object):
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs:
self.use_best_quote_char = False
for attr in self.options:
@ -157,7 +168,7 @@ class HTMLSerializer(object):
def encode(self, string):
assert(isinstance(string, text_type))
if self.encoding:
return string.encode(self.encoding, unicode_encode_errors)
return string.encode(self.encoding, "htmlentityreplace")
else:
return string
@ -169,28 +180,30 @@ class HTMLSerializer(object):
return string
def serialize(self, treewalker, encoding=None):
# pylint:disable=too-many-nested-blocks
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from ..filters.inject_meta_charset import Filter
from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# The alphabetical-attributes filter is applied here on the assumption that
# none of the later filters add attributes or change their order; it needs
# to run before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of this latter filter
if self.strip_whitespace:
from ..filters.whitespace import Filter
from .filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from ..filters.sanitizer import Filter
from .filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from ..filters.optionaltags import Filter
treewalker = Filter(treewalker)
# Alphabetical attributes must be last, as other filters
# could add attributes and alter the order
if self.alphabetical_attributes:
from ..filters.alphabeticalattributes import Filter
from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
@ -229,7 +242,7 @@ class HTMLSerializer(object):
in_cdata = True
elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element")
for (attr_namespace, attr_name), attr_value in token["data"].items():
for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here
k = attr_name
v = attr_value
@ -237,14 +250,18 @@ class HTMLSerializer(object):
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple())
and k not in booleanAttributes.get("", tuple())):
(k not in booleanAttributes.get(name, tuple()) and
k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=")
if self.quote_attr_values or not v:
if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else:
quote_attr = reduce(lambda x, y: x or (y in v),
spaceCharacters + ">\"'=", False)
raise ValueError("quote_attr_values must be one of: "
"'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;")
@ -312,6 +329,6 @@ class HTMLSerializer(object):
raise SerializeError
def SerializeError(Exception):
class SerializeError(Exception):
"""Error in serialized tree"""
pass
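The new quote_attr_values modes introduced in this file can be compared side by side; a hedged sketch (attribute order and exact whitespace may differ):

    import html5lib
    from html5lib.serializer import HTMLSerializer

    dom = html5lib.parse('<img src=foo.png alt="two words">')
    walker = html5lib.getTreeWalker("etree")

    HTMLSerializer(quote_attr_values="always").render(walker(dom))
    # every attribute value is quoted: ... src="foo.png" alt="two words" ...

    HTMLSerializer().render(walker(dom))  # the default is the "legacy" mode
    # values safe for old parsers stay unquoted: ... src=foo.png alt="two words" ...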

View file

@ -1,16 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from .. import treewalkers
from .htmlserializer import HTMLSerializer
def serialize(input, tree="etree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
else:
raise ValueError("type must be html")
return s.render(walker(input), encoding)

View file

@ -5,7 +5,7 @@ from . import sax
__all__ = ["sax"]
try:
from . import genshi # flake8: noqa
from . import genshi # noqa
except ImportError:
pass
else:

View file

@ -28,7 +28,7 @@ to the format used in the unittests
from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree
from .._utils import default_etree
treeBuilderCache = {}

View file

@ -126,6 +126,7 @@ class TreeBuilder(object):
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
"""
# pylint:disable=not-callable
# Document class
documentClass = None
@ -166,12 +167,17 @@ class TreeBuilder(object):
# If we pass a node in, we match that. If we pass a string,
# we match any node with that name
exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if (node.name == target and not exactNode or
node == target and exactNode):
if exactNode and node == target:
return True
elif not exactNode and node.nameTuple == target:
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
@ -353,8 +359,8 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude):
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.

View file

@ -1,54 +1,62 @@
from __future__ import absolute_import, division, unicode_literals
from collections import MutableMapping
from xml.dom import minidom, Node
import weakref
from . import _base
from . import base
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(object):
class AttrList(MutableMapping):
def __init__(self, element):
self.element = element
def __iter__(self):
return list(self.element.attributes.items()).__iter__()
return iter(self.element.attributes.keys())
def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self):
return [(item[0], item[1]) for item in
list(self.element.attributes.items())]
def keys(self):
return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(name)
attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
class NodeBuilder(_base.Node):
def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
@ -109,7 +117,7 @@ def getDomBuilder(DomImplementation):
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self)
@ -149,15 +157,16 @@ def getDomBuilder(DomImplementation):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data = data
if parent != self:
_base.TreeBuilder.insertText(self, data, parent)
base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)

View file

@ -1,13 +1,15 @@
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type
import re
from . import _base
from .. import ihatexml
from . import base
from .. import _ihatexml
from .. import constants
from ..constants import namespaces
from ..utils import moduleFactoryFactory
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@ -16,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node):
class Element(base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
@ -98,6 +100,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = self
def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element)
node.parent = None
@ -139,7 +142,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
@ -253,10 +256,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "\n".join(rv)
def tostring(element):
def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string"""
rv = []
filter = ihatexml.InfosetFilter()
filter = _ihatexml.InfosetFilter()
def serializeElement(element):
if isinstance(element, ElementTree.ElementTree):
@ -307,7 +310,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
@ -329,7 +332,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
return base.TreeBuilder.getFragment(self)._element
return locals()

View file

@ -10,16 +10,17 @@ When any of these things occur, we emit a DataLossWarning
"""
from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings
import re
import sys
from . import _base
from . import base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml
from .. import _ihatexml
import lxml.etree as etree
@ -53,8 +54,7 @@ class Document(object):
def testSerializer(element):
rv = []
finalText = None
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
@ -128,16 +128,12 @@ def testSerializer(element):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
@ -173,13 +169,10 @@ def tostring(element):
serializeElement(element)
if finalText is not None:
rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
class TreeBuilder(base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
@ -189,13 +182,15 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element
dict.__init__(self, value)
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@ -259,10 +254,10 @@ class TreeBuilder(_base.TreeBuilder):
self.elementClass = Element
self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
@ -303,12 +298,14 @@ class TreeBuilder(_base.TreeBuilder):
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None):
if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token):

View file

@ -10,10 +10,10 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
from .. import constants
from ..utils import default_etree
from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
treeWalkerCache = {}
@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
from . import dom
treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi":
from . import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker
from . import genshi
treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml":
from . import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker
from . import etree_lxml
treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree":
from . import etree
if implementation is None:

View file

@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
@ -14,28 +14,9 @@ COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)
def to_text(s, blank_if_none=True):
"""Wrapper around six.text_type to convert None to empty string"""
if s is None:
if blank_if_none:
return ""
else:
return None
elif isinstance(s, text_type):
return s
else:
return text_type(s)
def is_text_or_none(string):
"""Wrapper around isinstance(string_types) or is None"""
return string is None or isinstance(string, string_types)
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
@ -47,47 +28,25 @@ class TreeWalker(object):
return {"type": "SerializeError", "data": msg}
def emptyTag(self, namespace, name, attrs, hasChildren=False):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
yield {"type": "EmptyTag", "name": to_text(name, False),
"namespace": to_text(namespace),
yield {"type": "EmptyTag", "name": name,
"namespace": namespace,
"data": attrs}
if hasChildren:
yield self.error("Void element has children")
def startTag(self, namespace, name, attrs):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(name)
assert all((namespace is None or isinstance(namespace, string_types)) and
isinstance(name, string_types) and
isinstance(value, string_types)
for (namespace, name), value in attrs.items())
return {"type": "StartTag",
"name": text_type(name),
"namespace": to_text(namespace),
"data": dict(((to_text(namespace, False), to_text(name)),
to_text(value, False))
for (namespace, name), value in attrs.items())}
"name": name,
"namespace": namespace,
"data": attrs}
def endTag(self, namespace, name):
assert namespace is None or isinstance(namespace, string_types), type(namespace)
assert isinstance(name, string_types), type(namespace)
return {"type": "EndTag",
"name": to_text(name, False),
"namespace": to_text(namespace),
"data": {}}
"name": name,
"namespace": namespace}
def text(self, data):
assert isinstance(data, string_types), type(data)
data = to_text(data)
data = data
middle = data.lstrip(spaceCharacters)
left = data[:len(data) - len(middle)]
if left:
@ -101,25 +60,16 @@ class TreeWalker(object):
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
assert isinstance(data, string_types), type(data)
return {"type": "Comment", "data": text_type(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True):
assert is_text_or_none(name), type(name)
assert is_text_or_none(publicId), type(publicId)
assert is_text_or_none(systemId), type(systemId)
return {"type": "Comment", "data": data}
def doctype(self, name, publicId=None, systemId=None):
return {"type": "Doctype",
"name": to_text(name),
"publicId": to_text(publicId),
"systemId": to_text(systemId),
"correct": to_text(correct)}
"name": name,
"publicId": publicId,
"systemId": systemId}
def entity(self, name):
assert isinstance(name, string_types), type(name)
return {"type": "Entity", "name": text_type(name)}
return {"type": "Entity", "name": name}
def unknown(self, nodeType):
return self.error("Unknown node type: " + nodeType)
@ -154,7 +104,7 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
@ -187,7 +137,7 @@ class NonRecursiveTreeWalker(TreeWalker):
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name not in voidElements:
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None

View file

@ -2,16 +2,16 @@ from __future__ import absolute_import, division, unicode_literals
from xml.dom import Node
from . import _base
from . import base
class TreeWalker(_base.NonRecursiveTreeWalker):
class TreeWalker(base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return _base.DOCTYPE, node.name, node.publicId, node.systemId
return base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return _base.TEXT, node.nodeValue
return base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
@ -21,17 +21,17 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs[(attr.namespaceURI, attr.localName)] = attr.value
else:
attrs[(None, attr.name)] = attr.value
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
return (base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue
return base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (_base.DOCUMENT,)
return (base.DOCUMENT,)
else:
return _base.UNKNOWN, node.nodeType
return base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild

View file

@ -12,8 +12,8 @@ import re
from six import string_types
from . import _base
from ..utils import moduleFactoryFactory
from . import base
from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)")
@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
class TreeWalker(_base.NonRecursiveTreeWalker):
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
@ -38,9 +38,9 @@ def getETreeBuilder(ElementTreeImplementation):
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents, flag = node
elt, _, _, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
return base.TEXT, getattr(elt, flag)
else:
node = elt
@ -48,14 +48,14 @@ def getETreeBuilder(ElementTreeImplementation):
node = node.getroot()
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
return (_base.DOCUMENT,)
return (base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (_base.DOCTYPE, node.text,
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTreeCommentType:
return _base.COMMENT, node.text
return base.COMMENT, node.text
else:
assert isinstance(node.tag, string_types), type(node.tag)
@ -73,7 +73,7 @@ def getETreeBuilder(ElementTreeImplementation):
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, tag,
return (base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):

View file

@ -4,9 +4,9 @@ from six import text_type
from lxml import etree
from ..treebuilders.etree import tag_regexp
from . import _base
from . import base
from .. import ihatexml
from .. import _ihatexml
def ensure_str(s):
@ -15,20 +15,27 @@ def ensure_str(s):
elif isinstance(s, text_type):
return s
else:
return s.decode("utf-8", "strict")
return s.decode("ascii", "strict")
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
root = et.getroot()
node = root
try:
if et.docinfo.internalDTD:
self.children.append(Doctype(self,
ensure_str(et.docinfo.root_name),
ensure_str(et.docinfo.public_id),
ensure_str(et.docinfo.system_url)))
except AttributeError:
pass
try:
node = et.getroot()
except AttributeError:
node = et
while node.getprevious() is not None:
node = node.getprevious()
@ -115,35 +122,38 @@ class FragmentWrapper(object):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
class TreeWalker(base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
# pylint:disable=redefined-variable-type
if isinstance(tree, list):
self.fragmentChildren = set(tree)
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
else:
self.fragmentChildren = set()
tree = Root(tree)
base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = _ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
return _base.TEXT, ensure_str(getattr(node, key))
return base.TEXT, ensure_str(getattr(node, key))
elif isinstance(node, Root):
return (_base.DOCUMENT,)
return (base.DOCUMENT,)
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
return base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
return _base.TEXT, node.obj
return base.TEXT, ensure_str(node.obj)
elif node.tag == etree.Comment:
return _base.COMMENT, ensure_str(node.text)
return base.COMMENT, ensure_str(node.text)
elif node.tag == etree.Entity:
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
else:
# This is assumed to be an ordinary element
@ -162,7 +172,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
attrs[(match.group(1), match.group(2))] = value
else:
attrs[(None, name)] = value
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
@ -197,5 +207,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
if key == "text":
return node
# else: fallback to "normal" processing
elif node in self.fragmentChildren:
return None
return node.getparent()

View file

@ -4,12 +4,12 @@ from genshi.core import QName
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from . import _base
from . import base
from ..constants import voidElements, namespaces
class TreeWalker(_base.TreeWalker):
class TreeWalker(base.TreeWalker):
def __iter__(self):
# Buffer the events so we can pass in the following one
previous = None
@ -25,7 +25,7 @@ class TreeWalker(_base.TreeWalker):
yield token
def tokens(self, event, next):
kind, data, pos = event
kind, data, _ = event
if kind == START:
tag, attribs = data
name = tag.localname
@ -39,8 +39,8 @@ class TreeWalker(_base.TreeWalker):
if namespace == namespaces["html"] and name in voidElements:
for token in self.emptyTag(namespace, name, converted_attribs,
not next or next[0] != END
or next[1] != tag):
not next or next[0] != END or
next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, converted_attribs)
@ -48,7 +48,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == END:
name = data.localname
namespace = data.namespace
if name not in voidElements:
if namespace != namespaces["html"] or name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT: