Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72).

JackDandy committed 2017-01-27 14:24:24 +00:00
parent d60ef4dc44
commit 69a589a54c
34 changed files with 1684 additions and 1283 deletions

View file

@ -13,6 +13,7 @@
* Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776) * Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776)
* Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30) * Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30)
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb) * Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
[develop changelog] [develop changelog]

View file

@ -22,4 +22,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
"getTreeWalker", "serialize"] "getTreeWalker", "serialize"]
# this has to be at the top level, see how setup.py parses this # this has to be at the top level, see how setup.py parses this
__version__ = "0.99999999-dev" __version__ = "0.9999999999-dev"

View file

@ -175,9 +175,9 @@ def escapeRegexp(string):
return string return string
# output from the above # output from the above
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') nonXmlNameBMPRegexp = 
re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') nonXmlNameFirstBMPRegexp = 
re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
# Simpler things # Simpler things
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
@ -186,7 +186,7 @@ nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
class InfosetFilter(object): class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}") replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars=None, def __init__(self,
dropXmlnsLocalName=False, dropXmlnsLocalName=False,
dropXmlnsAttrNs=False, dropXmlnsAttrNs=False,
preventDoubleDashComments=False, preventDoubleDashComments=False,
@ -217,7 +217,7 @@ class InfosetFilter(object):
else: else:
return self.toXmlName(name) return self.toXmlName(name)
def coerceElement(self, name, namespace=None): def coerceElement(self, name):
return self.toXmlName(name) return self.toXmlName(name)
def coerceComment(self, data): def coerceComment(self, data):
@ -232,7 +232,7 @@ class InfosetFilter(object):
def coerceCharacters(self, data): def coerceCharacters(self, data):
if self.replaceFormFeedCharacters: if self.replaceFormFeedCharacters:
for i in range(data.count("\x0C")): for _ in range(data.count("\x0C")):
warnings.warn("Text cannot contain U+000C", DataLossWarning) warnings.warn("Text cannot contain U+000C", DataLossWarning)
data = data.replace("\x0C", " ") data = data.replace("\x0C", " ")
# Other non-xml characters # Other non-xml characters

View file

@ -1,14 +1,16 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type from six import text_type, binary_type
from six.moves import http_client, urllib from six.moves import http_client, urllib
import codecs import codecs
import re import re
import webencodings
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from .constants import encodings, ReparseException from .constants import ReparseException
from . import utils from . import _utils
from io import StringIO from io import StringIO
@ -17,12 +19,6 @@ try:
except ImportError: except ImportError:
BytesIO = StringIO BytesIO = StringIO
try:
from io import BufferedIOBase
except ImportError:
class BufferedIOBase(object):
pass
# Non-unicode versions of constants for use in the pre-parser # Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters]) spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters]) asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
@ -30,15 +26,17 @@ asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"]) spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
if utils.supports_lone_surrogates: if _utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with # Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal # eval. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone # unicode literal on platforms not supporting such lone
# surrogates. # surrogates.
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate + assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
eval('"\\uD800-\\uDFFF"')) invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
"]")
else: else:
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate) invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
@ -130,7 +128,7 @@ class BufferedStream(object):
return b"".join(rv) return b"".join(rv)
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True): def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection. # Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007 # http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or if (isinstance(source, http_client.HTTPResponse) or
@ -144,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type) isUnicode = isinstance(source, text_type)
if isUnicode: if isUnicode:
if encoding is not None: encodings = [x for x in kwargs if x.endswith("_encoding")]
raise TypeError("Cannot explicitly set an encoding with a unicode string") if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
return HTMLUnicodeInputStream(source) return HTMLUnicodeInputStream(source, **kwargs)
else: else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet) return HTMLBinaryInputStream(source, **kwargs)
class HTMLUnicodeInputStream(object): class HTMLUnicodeInputStream(object):
@ -175,27 +174,21 @@ class HTMLUnicodeInputStream(object):
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
parseMeta - Look for a <meta> element containing encoding information
""" """
if not utils.supports_lone_surrogates: if not _utils.supports_lone_surrogates:
# Such platforms will have already checked for such # Such platforms will have already checked for such
# surrogate errors, so no need to do this checking. # surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1: elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4 self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
else: else:
self.reportCharacterErrors = self.characterErrorsUCS2 self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile(
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
# List of where new lines occur # List of where new lines occur
self.newLines = [0] self.newLines = [0]
self.charEncoding = ("utf-8", "certain") self.charEncoding = (lookupEncoding("utf-8"), "certain")
self.dataStream = self.openStream(source) self.dataStream = self.openStream(source)
self.reset() self.reset()
@ -288,10 +281,7 @@ class HTMLUnicodeInputStream(object):
if self.reportCharacterErrors: if self.reportCharacterErrors:
self.reportCharacterErrors(data) self.reportCharacterErrors(data)
# Replace invalid characters # Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
data = data.replace("\r\n", "\n") data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n") data = data.replace("\r", "\n")
@ -301,7 +291,7 @@ class HTMLUnicodeInputStream(object):
return True return True
def characterErrorsUCS4(self, data): def characterErrorsUCS4(self, data):
for i in range(len(invalid_unicode_re.findall(data))): for _ in range(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint") self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data): def characterErrorsUCS2(self, data):
@ -314,9 +304,9 @@ class HTMLUnicodeInputStream(object):
codepoint = ord(match.group()) codepoint = ord(match.group())
pos = match.start() pos = match.start()
# Pretty sure there should be endianness issues here # Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos + 2]): if _utils.isSurrogatePair(data[pos:pos + 2]):
# We have a surrogate pair! # We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2]) char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
if char_val in non_bmp_invalid_codepoints: if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint") self.errors.append("invalid-codepoint")
skip = True skip = True
@ -399,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
""" """
def __init__(self, source, encoding=None, parseMeta=True, chardet=True): def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream. """Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -412,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
parseMeta - Look for a <meta> element containing encoding information
""" """
# Raw Stream - for unicode objects this will encode to utf-8 and set # Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate # self.charEncoding as appropriate
@ -421,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
HTMLUnicodeInputStream.__init__(self, self.rawStream) HTMLUnicodeInputStream.__init__(self, self.rawStream)
self.charEncoding = (codecName(encoding), "certain")
# Encoding Information # Encoding Information
# Number of bytes to use when looking for a meta element with # Number of bytes to use when looking for a meta element with
# encoding information # encoding information
self.numBytesMeta = 512 self.numBytesMeta = 1024
# Number of bytes to use when using detecting encoding using chardet # Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100 self.numBytesChardet = 100
# Encoding to use if no other information can be found # Things from args
self.defaultEncoding = "windows-1252" self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding
# Detect encoding iff no explicit "transport level" encoding is supplied # Determine encoding
if (self.charEncoding[0] is None): self.charEncoding = self.determineEncoding(useChardet)
self.charEncoding = self.detectEncoding(parseMeta, chardet) assert self.charEncoding[0] is not None
# Call superclass # Call superclass
self.reset() self.reset()
def reset(self): def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
'replace')
HTMLUnicodeInputStream.reset(self) HTMLUnicodeInputStream.reset(self)
def openStream(self, source): def openStream(self, source):
@ -458,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
try: try:
stream.seek(stream.tell()) stream.seek(stream.tell())
except: except: # pylint:disable=bare-except
stream = BufferedStream(stream) stream = BufferedStream(stream)
return stream return stream
def detectEncoding(self, parseMeta=True, chardet=True): def determineEncoding(self, chardet=True):
# First look for a BOM # BOMs take precedence over everything
# This will also read past the BOM if present # This will also read past the BOM if present
encoding = self.detectBOM() charEncoding = self.detectBOM(), "certain"
confidence = "certain" if charEncoding[0] is not None:
# If there is no BOM need to look for meta elements with encoding return charEncoding
# information
if encoding is None and parseMeta: # If we've been overriden, we've been overriden
encoding = self.detectEncodingMeta() charEncoding = lookupEncoding(self.override_encoding), "certain"
confidence = "tentative" if charEncoding[0] is not None:
return charEncoding
# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding
# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding
# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
# Guess with chardet, if available # Guess with chardet, if available
if encoding is None and chardet: if chardet:
confidence = "tentative"
try: try:
try: from chardet.universaldetector import UniversalDetector
from charade.universaldetector import UniversalDetector except ImportError:
except ImportError: pass
from chardet.universaldetector import UniversalDetector else:
buffers = [] buffers = []
detector = UniversalDetector() detector = UniversalDetector()
while not detector.done: while not detector.done:
@ -491,36 +503,33 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
buffers.append(buffer) buffers.append(buffer)
detector.feed(buffer) detector.feed(buffer)
detector.close() detector.close()
encoding = detector.result['encoding'] encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0) self.rawStream.seek(0)
except ImportError: if encoding is not None:
pass return encoding, "tentative"
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = self.defaultEncoding
# Substitute for equivalent encodings: # Try the default encoding
encodingSub = {"iso-8859-1": "windows-1252"} charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding
if encoding.lower() in encodingSub: # Fallback to html5lib's default if even that hasn't worked
encoding = encodingSub[encoding.lower()] return lookupEncoding("windows-1252"), "tentative"
return encoding, confidence
def changeEncoding(self, newEncoding): def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain" assert self.charEncoding[1] != "certain"
newEncoding = codecName(newEncoding) newEncoding = lookupEncoding(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None: if newEncoding is None:
return return
if newEncoding.name in ("utf-16be", "utf-16le"):
newEncoding = lookupEncoding("utf-8")
assert newEncoding is not None
elif newEncoding == self.charEncoding[0]: elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain") self.charEncoding = (self.charEncoding[0], "certain")
else: else:
self.rawStream.seek(0) self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain") self.charEncoding = (newEncoding, "certain")
self.reset()
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding)) raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
def detectBOM(self): def detectBOM(self):
@ -529,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
encoding otherwise return None""" encoding otherwise return None"""
bomDict = { bomDict = {
codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be' codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
} }
# Go to beginning of file and read in 4 bytes # Go to beginning of file and read in 4 bytes
@ -550,9 +559,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
self.rawStream.seek(encoding and seek or 0) if encoding:
self.rawStream.seek(seek)
return encoding return lookupEncoding(encoding)
else:
self.rawStream.seek(0)
return None
def detectEncodingMeta(self): def detectEncodingMeta(self):
"""Report the encoding declared by the meta element """Report the encoding declared by the meta element
@ -563,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
self.rawStream.seek(0) self.rawStream.seek(0)
encoding = parser.getEncoding() encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"): if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
encoding = "utf-8" encoding = lookupEncoding("utf-8")
return encoding return encoding
@ -578,6 +590,7 @@ class EncodingBytes(bytes):
return bytes.__new__(self, value.lower()) return bytes.__new__(self, value.lower())
def __init__(self, value): def __init__(self, value):
# pylint:disable=unused-argument
self._position = -1 self._position = -1
def __iter__(self): def __iter__(self):
@ -688,7 +701,7 @@ class EncodingParser(object):
(b"<!", self.handleOther), (b"<!", self.handleOther),
(b"<?", self.handleOther), (b"<?", self.handleOther),
(b"<", self.handlePossibleStartTag)) (b"<", self.handlePossibleStartTag))
for byte in self.data: for _ in self.data:
keepParsing = True keepParsing = True
for key, method in methodDispatch: for key, method in methodDispatch:
if self.data.matchBytes(key): if self.data.matchBytes(key):
@ -727,7 +740,7 @@ class EncodingParser(object):
return False return False
elif attr[0] == b"charset": elif attr[0] == b"charset":
tentativeEncoding = attr[1] tentativeEncoding = attr[1]
codec = codecName(tentativeEncoding) codec = lookupEncoding(tentativeEncoding)
if codec is not None: if codec is not None:
self.encoding = codec self.encoding = codec
return False return False
@ -735,7 +748,7 @@ class EncodingParser(object):
contentParser = ContentAttrParser(EncodingBytes(attr[1])) contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse() tentativeEncoding = contentParser.parse()
if tentativeEncoding is not None: if tentativeEncoding is not None:
codec = codecName(tentativeEncoding) codec = lookupEncoding(tentativeEncoding)
if codec is not None: if codec is not None:
if hasPragma: if hasPragma:
self.encoding = codec self.encoding = codec
@ -892,16 +905,19 @@ class ContentAttrParser(object):
return None return None
def codecName(encoding): def lookupEncoding(encoding):
"""Return the python codec name corresponding to an encoding or None if the """Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding.""" string doesn't correspond to a valid encoding."""
if isinstance(encoding, bytes): if isinstance(encoding, binary_type):
try: try:
encoding = encoding.decode("ascii") encoding = encoding.decode("ascii")
except UnicodeDecodeError: except UnicodeDecodeError:
return None return None
if encoding:
canonicalName = ascii_punctuation_re.sub("", encoding).lower() if encoding is not None:
return encodings.get(canonicalName, None) try:
return webencodings.lookup(encoding)
except AttributeError:
return None
else: else:
return None return None
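For orientation on the hunks above: encoding selection now runs entirely off keyword arguments and webencodings lookups (BOM, then override_encoding, transport_encoding, the <meta> prescan, same_origin_parent_encoding, likely_encoding, chardet, default_encoding), replacing the old encoding/parseMeta/chardet parameters. A minimal sketch, assuming the vendored module is importable as html5lib._inputstream (the SickGear call sites are not part of this diff):

    from html5lib._inputstream import HTMLInputStream, lookupEncoding

    # Unicode input: passing any *_encoding keyword now raises TypeError.
    uni_stream = HTMLInputStream("<p>already decoded</p>")

    # Byte input: transport_encoding outranks the <meta> prescan and chardet.
    byte_stream = HTMLInputStream(b"<meta charset=utf-8>caf\xe9",
                                  transport_encoding="iso-8859-1",
                                  useChardet=False)
    encoding, confidence = byte_stream.charEncoding
    print(encoding.name, confidence)        # windows-1252 certain

    # lookupEncoding() resolves WHATWG labels via webencodings.
    assert lookupEncoding("latin1").name == "windows-1252"
    assert lookupEncoding("bogus-encoding") is None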

View file

@ -1,9 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
try: from six import unichr as chr
chr = unichr # flake8: noqa
except NameError:
pass
from collections import deque from collections import deque
@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters from .constants import replacementCharacters
from .inputstream import HTMLInputStream from ._inputstream import HTMLInputStream
from .trie import Trie from ._trie import Trie
entitiesTrie = Trie(entities) entitiesTrie = Trie(entities)
@ -34,16 +31,11 @@ class HTMLTokenizer(object):
Points to HTMLInputStream object. Points to HTMLInputStream object.
""" """
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, def __init__(self, stream, parser=None, **kwargs):
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet) self.stream = HTMLInputStream(stream, **kwargs)
self.parser = parser self.parser = parser
# Perform case conversions?
self.lowercaseElementName = lowercaseElementName
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state # Setup the initial tokenizer state
self.escapeFlag = False self.escapeFlag = False
self.lastFourChars = [] self.lastFourChars = []
@ -147,8 +139,8 @@ class HTMLTokenizer(object):
output = "&" output = "&"
charStack = [self.stream.char()] charStack = [self.stream.char()]
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
or (allowedChar is not None and allowedChar == charStack[0])): (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0]) self.stream.unget(charStack[0])
elif charStack[0] == "#": elif charStack[0] == "#":
@ -235,8 +227,7 @@ class HTMLTokenizer(object):
token = self.currentToken token = self.currentToken
# Add token to the queue to be yielded # Add token to the queue to be yielded
if (token["type"] in tagTokenTypes): if (token["type"] in tagTokenTypes):
if self.lowercaseElementName: token["name"] = token["name"].translate(asciiUpper2Lower)
token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["EndTag"]: if token["type"] == tokenTypes["EndTag"]:
if token["data"]: if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"], self.tokenQueue.append({"type": tokenTypes["ParseError"],
@ -921,10 +912,9 @@ class HTMLTokenizer(object):
# Attributes are not dropped at this stage. That happens when the # Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended # start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time. # to attributes, but we do want to report the parse error in time.
if self.lowercaseAttrName: self.currentToken["data"][-1][0] = (
self.currentToken["data"][-1][0] = ( self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
self.currentToken["data"][-1][0].translate(asciiUpper2Lower)) for name, _ in self.currentToken["data"][:-1]:
for name, value in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name: if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"duplicate-attribute"}) "duplicate-attribute"})
@ -1716,11 +1706,11 @@ class HTMLTokenizer(object):
else: else:
data.append(char) data.append(char)
data = "".join(data) data = "".join(data) # pylint:disable=redefined-variable-type
# Deal with null here rather than in the parser # Deal with null here rather than in the parser
nullCount = data.count("\u0000") nullCount = data.count("\u0000")
if nullCount > 0: if nullCount > 0:
for i in range(nullCount): for _ in range(nullCount):
self.tokenQueue.append({"type": tokenTypes["ParseError"], self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data": "invalid-codepoint"}) "data": "invalid-codepoint"})
data = data.replace("\u0000", "\uFFFD") data = data.replace("\u0000", "\uFFFD")
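In short, the tokenizer above no longer accepts encoding/parseMeta/useChardet/lowercaseElementName/lowercaseAttrName itself: tag and attribute names are always lower-cased, and any remaining keyword arguments are passed straight through to HTMLInputStream. A minimal sketch, assuming the vendored package imports as html5lib:

    from html5lib._tokenizer import HTMLTokenizer

    # Stream keywords (useChardet, the new *_encoding options, ...) travel
    # via **kwargs to HTMLInputStream; names always come out lower-cased.
    for token in HTMLTokenizer(b"<DIV CLASS=x>hi</DIV>", useChardet=False):
        if "name" in token:
            print(token["name"])        # prints "div" twice, never "DIV"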

View file

@ -4,9 +4,11 @@ from .py import Trie as PyTrie
Trie = PyTrie Trie = PyTrie
# pylint:disable=wrong-import-position
try: try:
from .datrie import Trie as DATrie from .datrie import Trie as DATrie
except ImportError: except ImportError:
pass pass
else: else:
Trie = DATrie Trie = DATrie
# pylint:enable=wrong-import-position

View file

@ -7,7 +7,8 @@ class Trie(Mapping):
"""Abstract base class for tries""" """Abstract base class for tries"""
def keys(self, prefix=None): def keys(self, prefix=None):
keys = super().keys() # pylint:disable=arguments-differ
keys = super(Trie, self).keys()
if prefix is None: if prefix is None:
return set(keys) return set(keys)

View file

@ -1,5 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
import sys
from types import ModuleType from types import ModuleType
from six import text_type from six import text_type
@ -12,9 +13,11 @@ except ImportError:
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory", "surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"] "supports_lone_surrogates", "PY27"]
PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# caught by the below test. In general this would be any platform # caught by the below test. In general this would be any platform
# using UTF-16 as its encoding of unicode strings, such as # using UTF-16 as its encoding of unicode strings, such as
@ -22,12 +25,12 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
# surrogates, and there is no mechanism to further escape such # surrogates, and there is no mechanism to further escape such
# escapes. # escapes.
try: try:
_x = eval('"\\uD800"') _x = eval('"\\uD800"') # pylint:disable=eval-used
if not isinstance(_x, text_type): if not isinstance(_x, text_type):
# We need this with u"" because of http://bugs.jython.org/issue2039 # We need this with u"" because of http://bugs.jython.org/issue2039
_x = eval('u"\\uD800"') _x = eval('u"\\uD800"') # pylint:disable=eval-used
assert isinstance(_x, text_type) assert isinstance(_x, text_type)
except: except: # pylint:disable=bare-except
supports_lone_surrogates = False supports_lone_surrogates = False
else: else:
supports_lone_surrogates = True supports_lone_surrogates = True
@ -52,12 +55,13 @@ class MethodDispatcher(dict):
# anything here. # anything here.
_dictEntries = [] _dictEntries = []
for name, value in items: for name, value in items:
if type(name) in (list, tuple, frozenset, set): if isinstance(name, (list, tuple, frozenset, set)):
for item in name: for item in name:
_dictEntries.append((item, value)) _dictEntries.append((item, value))
else: else:
_dictEntries.append((name, value)) _dictEntries.append((name, value))
dict.__init__(self, _dictEntries) dict.__init__(self, _dictEntries)
assert len(self) == len(_dictEntries)
self.default = None self.default = None
def __getitem__(self, key): def __getitem__(self, key):
@ -109,3 +113,15 @@ def moduleFactoryFactory(factory):
return mod return mod
return moduleFactory return moduleFactory
def memoize(func):
cache = {}
def wrapped(*args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
if key not in cache:
cache[key] = func(*args, **kwargs)
return cache[key]
return wrapped
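The memoize helper added above is a plain dict cache keyed on (args, tuple(kwargs.items())), so arguments must be hashable and keyword order is significant. A minimal usage sketch (scale() is a hypothetical function; only memoize comes from the hunk above):

    from html5lib._utils import memoize   # vendored path may differ

    @memoize
    def scale(x, factor=1):
        print("computed")                 # printed only on a cache miss
        return x * factor

    scale(3)             # miss: prints "computed", returns 3
    scale(3)             # hit: returns the cached 3
    scale(3, factor=2)   # new key, computed again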

View file

@ -283,6 +283,12 @@ E = {
"Element %(name)s not allowed in a non-html context", "Element %(name)s not allowed in a non-html context",
"unexpected-end-tag-before-html": "unexpected-end-tag-before-html":
"Unexpected end tag (%(name)s) before html.", "Unexpected end tag (%(name)s) before html.",
"unexpected-inhead-noscript-tag":
"Element %(name)s not allowed in a inhead-noscript context",
"eof-in-head-noscript":
"Unexpected end of file. Expected inhead-noscript content",
"char-in-head-noscript":
"Unexpected non-space character. Expected inhead-noscript content",
"XXX-undefined-error": "XXX-undefined-error":
"Undefined error (this sucks and should be fixed)", "Undefined error (this sucks and should be fixed)",
} }
@ -431,6 +437,73 @@ mathmlTextIntegrationPointElements = frozenset([
(namespaces["mathml"], "mtext") (namespaces["mathml"], "mtext")
]) ])
adjustSVGAttributes = {
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
adjustForeignAttributes = { adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]), "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]), "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
@ -2813,7 +2886,6 @@ replacementCharacters = {
0x0d: "\u000D", 0x0d: "\u000D",
0x80: "\u20AC", 0x80: "\u20AC",
0x81: "\u0081", 0x81: "\u0081",
0x81: "\u0081",
0x82: "\u201A", 0x82: "\u201A",
0x83: "\u0192", 0x83: "\u0192",
0x84: "\u201E", 0x84: "\u201E",
@ -2846,235 +2918,6 @@ replacementCharacters = {
0x9F: "\u0178", 0x9F: "\u0178",
} }
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = { tokenTypes = {
"Doctype": 0, "Doctype": 0,
"Characters": 1, "Characters": 1,

View file

@ -1,6 +1,6 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from . import base
try: try:
from collections import OrderedDict from collections import OrderedDict
@ -8,9 +8,9 @@ except ImportError:
from ordereddict import OrderedDict from ordereddict import OrderedDict
class Filter(_base.Filter): class Filter(base.Filter):
def __iter__(self): def __iter__(self):
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
if token["type"] in ("StartTag", "EmptyTag"): if token["type"] in ("StartTag", "EmptyTag"):
attrs = OrderedDict() attrs = OrderedDict()
for name, value in sorted(token["data"].items(), for name, value in sorted(token["data"].items(),

View file

@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from . import base
class Filter(_base.Filter): class Filter(base.Filter):
def __init__(self, source, encoding): def __init__(self, source, encoding):
_base.Filter.__init__(self, source) base.Filter.__init__(self, source)
self.encoding = encoding self.encoding = encoding
def __iter__(self): def __iter__(self):
@ -13,7 +13,7 @@ class Filter(_base.Filter):
meta_found = (self.encoding is None) meta_found = (self.encoding is None)
pending = [] pending = []
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type == "StartTag": if type == "StartTag":
if token["name"].lower() == "head": if token["name"].lower() == "head":

View file

@ -1,90 +1,81 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from six import text_type
from ..constants import cdataElements, rcdataElements, voidElements
from . import base
from ..constants import namespaces, voidElements
from ..constants import spaceCharacters from ..constants import spaceCharacters
spaceCharacters = "".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
class LintError(Exception): class Filter(base.Filter):
pass def __init__(self, source, require_matching_tags=True):
super(Filter, self).__init__(source)
self.require_matching_tags = require_matching_tags
class Filter(_base.Filter):
def __iter__(self): def __iter__(self):
open_elements = [] open_elements = []
contentModelFlag = "PCDATA" for token in base.Filter.__iter__(self):
for token in _base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type in ("StartTag", "EmptyTag"): if type in ("StartTag", "EmptyTag"):
namespace = token["namespace"]
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": assert namespace is None or isinstance(namespace, text_type)
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name}) assert namespace != ""
if not isinstance(name, str): assert isinstance(name, text_type)
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) assert name != ""
if not name: assert isinstance(token["data"], dict)
raise LintError("Empty tag name") if (not namespace or namespace == namespaces["html"]) and name in voidElements:
if type == "StartTag" and name in voidElements: assert type == "EmptyTag"
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name}) else:
elif type == "EmptyTag" and name not in voidElements: assert type == "StartTag"
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]}) if type == "StartTag" and self.require_matching_tags:
if type == "StartTag": open_elements.append((namespace, name))
open_elements.append(name) for (namespace, name), value in token["data"].items():
for name, value in token["data"]: assert namespace is None or isinstance(namespace, text_type)
if not isinstance(name, str): assert namespace != ""
raise LintError("Attribute name is not a string: %(name)r" % {"name": name}) assert isinstance(name, text_type)
if not name: assert name != ""
raise LintError("Empty attribute name") assert isinstance(value, text_type)
if not isinstance(value, str):
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
elif type == "EndTag": elif type == "EndTag":
namespace = token["namespace"]
name = token["name"] name = token["name"]
if not isinstance(name, str): assert namespace is None or isinstance(namespace, text_type)
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name}) assert namespace != ""
if not name: assert isinstance(name, text_type)
raise LintError("Empty tag name") assert name != ""
if name in voidElements: if (not namespace or namespace == namespaces["html"]) and name in voidElements:
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name}) assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
start_name = open_elements.pop() elif self.require_matching_tags:
if start_name != name: start = open_elements.pop()
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name}) assert start == (namespace, name)
contentModelFlag = "PCDATA"
elif type == "Comment": elif type == "Comment":
if contentModelFlag != "PCDATA": data = token["data"]
raise LintError("Comment not in PCDATA content model flag") assert isinstance(data, text_type)
elif type in ("Characters", "SpaceCharacters"): elif type in ("Characters", "SpaceCharacters"):
data = token["data"] data = token["data"]
if not isinstance(data, str): assert isinstance(data, text_type)
raise LintError("Attribute name is not a string: %(name)r" % {"name": data}) assert data != ""
if not data:
raise LintError("%(type)s token with empty data" % {"type": type})
if type == "SpaceCharacters": if type == "SpaceCharacters":
data = data.strip(spaceCharacters) assert data.strip(spaceCharacters) == ""
if data:
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
elif type == "Doctype": elif type == "Doctype":
name = token["name"] name = token["name"]
if contentModelFlag != "PCDATA": assert name is None or isinstance(name, text_type)
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name}) assert token["publicId"] is None or isinstance(name, text_type)
if not isinstance(name, str): assert token["systemId"] is None or isinstance(name, text_type)
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"): elif type == "Entity":
pass assert isinstance(token["name"], text_type)
elif type == "SerializerError":
assert isinstance(token["data"], text_type)
else: else:
raise LintError("Unknown token type: %(type)s" % {"type": type}) assert False, "Unknown token type: %(type)s" % {"type": type}
yield token yield token
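The rewritten lint filter above replaces the old LintError exceptions with plain assertions over namespaced tokens, and start/end tag matching is now opt-out via require_matching_tags. A minimal sketch of wiring it into a tree-walker pipeline, assuming the vendored package imports as html5lib:

    import html5lib
    from html5lib.filters.lint import Filter as LintFilter

    tree = html5lib.parse("<p>hello <b>there")
    walker = html5lib.getTreeWalker("etree")

    # Iterating re-yields every token; an AssertionError is raised if a
    # token breaks an invariant (non-text names, void-element end tags,
    # mismatched start/end tags, ...).
    for token in LintFilter(walker(tree), require_matching_tags=True):
        pass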

View file

@ -1,9 +1,9 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base from . import base
class Filter(_base.Filter): class Filter(base.Filter):
def slider(self): def slider(self):
previous1 = previous2 = None previous1 = previous2 = None
for token in self.source: for token in self.source:
@ -11,7 +11,8 @@ class Filter(_base.Filter):
yield previous2, previous1, token yield previous2, previous1, token
previous2 = previous1 previous2 = previous1
previous1 = token previous1 = token
yield previous2, previous1, None if previous1 is not None:
yield previous2, previous1, None
def __iter__(self): def __iter__(self):
for previous, token, next in self.slider(): for previous, token, next in self.slider():

View file

@ -1,12 +1,865 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from . import _base import re
from ..sanitizer import HTMLSanitizerMixin from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from . import base
from ..constants import namespaces, prefixes
__all__ = ["Filter"]
class Filter(_base.Filter, HTMLSanitizerMixin): allowed_elements = frozenset((
(namespaces['html'], 'a'),
(namespaces['html'], 'abbr'),
(namespaces['html'], 'acronym'),
(namespaces['html'], 'address'),
(namespaces['html'], 'area'),
(namespaces['html'], 'article'),
(namespaces['html'], 'aside'),
(namespaces['html'], 'audio'),
(namespaces['html'], 'b'),
(namespaces['html'], 'big'),
(namespaces['html'], 'blockquote'),
(namespaces['html'], 'br'),
(namespaces['html'], 'button'),
(namespaces['html'], 'canvas'),
(namespaces['html'], 'caption'),
(namespaces['html'], 'center'),
(namespaces['html'], 'cite'),
(namespaces['html'], 'code'),
(namespaces['html'], 'col'),
(namespaces['html'], 'colgroup'),
(namespaces['html'], 'command'),
(namespaces['html'], 'datagrid'),
(namespaces['html'], 'datalist'),
(namespaces['html'], 'dd'),
(namespaces['html'], 'del'),
(namespaces['html'], 'details'),
(namespaces['html'], 'dfn'),
(namespaces['html'], 'dialog'),
(namespaces['html'], 'dir'),
(namespaces['html'], 'div'),
(namespaces['html'], 'dl'),
(namespaces['html'], 'dt'),
(namespaces['html'], 'em'),
(namespaces['html'], 'event-source'),
(namespaces['html'], 'fieldset'),
(namespaces['html'], 'figcaption'),
(namespaces['html'], 'figure'),
(namespaces['html'], 'footer'),
(namespaces['html'], 'font'),
(namespaces['html'], 'form'),
(namespaces['html'], 'header'),
(namespaces['html'], 'h1'),
(namespaces['html'], 'h2'),
(namespaces['html'], 'h3'),
(namespaces['html'], 'h4'),
(namespaces['html'], 'h5'),
(namespaces['html'], 'h6'),
(namespaces['html'], 'hr'),
(namespaces['html'], 'i'),
(namespaces['html'], 'img'),
(namespaces['html'], 'input'),
(namespaces['html'], 'ins'),
(namespaces['html'], 'keygen'),
(namespaces['html'], 'kbd'),
(namespaces['html'], 'label'),
(namespaces['html'], 'legend'),
(namespaces['html'], 'li'),
(namespaces['html'], 'm'),
(namespaces['html'], 'map'),
(namespaces['html'], 'menu'),
(namespaces['html'], 'meter'),
(namespaces['html'], 'multicol'),
(namespaces['html'], 'nav'),
(namespaces['html'], 'nextid'),
(namespaces['html'], 'ol'),
(namespaces['html'], 'output'),
(namespaces['html'], 'optgroup'),
(namespaces['html'], 'option'),
(namespaces['html'], 'p'),
(namespaces['html'], 'pre'),
(namespaces['html'], 'progress'),
(namespaces['html'], 'q'),
(namespaces['html'], 's'),
(namespaces['html'], 'samp'),
(namespaces['html'], 'section'),
(namespaces['html'], 'select'),
(namespaces['html'], 'small'),
(namespaces['html'], 'sound'),
(namespaces['html'], 'source'),
(namespaces['html'], 'spacer'),
(namespaces['html'], 'span'),
(namespaces['html'], 'strike'),
(namespaces['html'], 'strong'),
(namespaces['html'], 'sub'),
(namespaces['html'], 'sup'),
(namespaces['html'], 'table'),
(namespaces['html'], 'tbody'),
(namespaces['html'], 'td'),
(namespaces['html'], 'textarea'),
(namespaces['html'], 'time'),
(namespaces['html'], 'tfoot'),
(namespaces['html'], 'th'),
(namespaces['html'], 'thead'),
(namespaces['html'], 'tr'),
(namespaces['html'], 'tt'),
(namespaces['html'], 'u'),
(namespaces['html'], 'ul'),
(namespaces['html'], 'var'),
(namespaces['html'], 'video'),
(namespaces['mathml'], 'maction'),
(namespaces['mathml'], 'math'),
(namespaces['mathml'], 'merror'),
(namespaces['mathml'], 'mfrac'),
(namespaces['mathml'], 'mi'),
(namespaces['mathml'], 'mmultiscripts'),
(namespaces['mathml'], 'mn'),
(namespaces['mathml'], 'mo'),
(namespaces['mathml'], 'mover'),
(namespaces['mathml'], 'mpadded'),
(namespaces['mathml'], 'mphantom'),
(namespaces['mathml'], 'mprescripts'),
(namespaces['mathml'], 'mroot'),
(namespaces['mathml'], 'mrow'),
(namespaces['mathml'], 'mspace'),
(namespaces['mathml'], 'msqrt'),
(namespaces['mathml'], 'mstyle'),
(namespaces['mathml'], 'msub'),
(namespaces['mathml'], 'msubsup'),
(namespaces['mathml'], 'msup'),
(namespaces['mathml'], 'mtable'),
(namespaces['mathml'], 'mtd'),
(namespaces['mathml'], 'mtext'),
(namespaces['mathml'], 'mtr'),
(namespaces['mathml'], 'munder'),
(namespaces['mathml'], 'munderover'),
(namespaces['mathml'], 'none'),
(namespaces['svg'], 'a'),
(namespaces['svg'], 'animate'),
(namespaces['svg'], 'animateColor'),
(namespaces['svg'], 'animateMotion'),
(namespaces['svg'], 'animateTransform'),
(namespaces['svg'], 'clipPath'),
(namespaces['svg'], 'circle'),
(namespaces['svg'], 'defs'),
(namespaces['svg'], 'desc'),
(namespaces['svg'], 'ellipse'),
(namespaces['svg'], 'font-face'),
(namespaces['svg'], 'font-face-name'),
(namespaces['svg'], 'font-face-src'),
(namespaces['svg'], 'g'),
(namespaces['svg'], 'glyph'),
(namespaces['svg'], 'hkern'),
(namespaces['svg'], 'linearGradient'),
(namespaces['svg'], 'line'),
(namespaces['svg'], 'marker'),
(namespaces['svg'], 'metadata'),
(namespaces['svg'], 'missing-glyph'),
(namespaces['svg'], 'mpath'),
(namespaces['svg'], 'path'),
(namespaces['svg'], 'polygon'),
(namespaces['svg'], 'polyline'),
(namespaces['svg'], 'radialGradient'),
(namespaces['svg'], 'rect'),
(namespaces['svg'], 'set'),
(namespaces['svg'], 'stop'),
(namespaces['svg'], 'svg'),
(namespaces['svg'], 'switch'),
(namespaces['svg'], 'text'),
(namespaces['svg'], 'title'),
(namespaces['svg'], 'tspan'),
(namespaces['svg'], 'use'),
))
allowed_attributes = frozenset((
# HTML attributes
(None, 'abbr'),
(None, 'accept'),
(None, 'accept-charset'),
(None, 'accesskey'),
(None, 'action'),
(None, 'align'),
(None, 'alt'),
(None, 'autocomplete'),
(None, 'autofocus'),
(None, 'axis'),
(None, 'background'),
(None, 'balance'),
(None, 'bgcolor'),
(None, 'bgproperties'),
(None, 'border'),
(None, 'bordercolor'),
(None, 'bordercolordark'),
(None, 'bordercolorlight'),
(None, 'bottompadding'),
(None, 'cellpadding'),
(None, 'cellspacing'),
(None, 'ch'),
(None, 'challenge'),
(None, 'char'),
(None, 'charoff'),
(None, 'choff'),
(None, 'charset'),
(None, 'checked'),
(None, 'cite'),
(None, 'class'),
(None, 'clear'),
(None, 'color'),
(None, 'cols'),
(None, 'colspan'),
(None, 'compact'),
(None, 'contenteditable'),
(None, 'controls'),
(None, 'coords'),
(None, 'data'),
(None, 'datafld'),
(None, 'datapagesize'),
(None, 'datasrc'),
(None, 'datetime'),
(None, 'default'),
(None, 'delay'),
(None, 'dir'),
(None, 'disabled'),
(None, 'draggable'),
(None, 'dynsrc'),
(None, 'enctype'),
(None, 'end'),
(None, 'face'),
(None, 'for'),
(None, 'form'),
(None, 'frame'),
(None, 'galleryimg'),
(None, 'gutter'),
(None, 'headers'),
(None, 'height'),
(None, 'hidefocus'),
(None, 'hidden'),
(None, 'high'),
(None, 'href'),
(None, 'hreflang'),
(None, 'hspace'),
(None, 'icon'),
(None, 'id'),
(None, 'inputmode'),
(None, 'ismap'),
(None, 'keytype'),
(None, 'label'),
(None, 'leftspacing'),
(None, 'lang'),
(None, 'list'),
(None, 'longdesc'),
(None, 'loop'),
(None, 'loopcount'),
(None, 'loopend'),
(None, 'loopstart'),
(None, 'low'),
(None, 'lowsrc'),
(None, 'max'),
(None, 'maxlength'),
(None, 'media'),
(None, 'method'),
(None, 'min'),
(None, 'multiple'),
(None, 'name'),
(None, 'nohref'),
(None, 'noshade'),
(None, 'nowrap'),
(None, 'open'),
(None, 'optimum'),
(None, 'pattern'),
(None, 'ping'),
(None, 'point-size'),
(None, 'poster'),
(None, 'pqg'),
(None, 'preload'),
(None, 'prompt'),
(None, 'radiogroup'),
(None, 'readonly'),
(None, 'rel'),
(None, 'repeat-max'),
(None, 'repeat-min'),
(None, 'replace'),
(None, 'required'),
(None, 'rev'),
(None, 'rightspacing'),
(None, 'rows'),
(None, 'rowspan'),
(None, 'rules'),
(None, 'scope'),
(None, 'selected'),
(None, 'shape'),
(None, 'size'),
(None, 'span'),
(None, 'src'),
(None, 'start'),
(None, 'step'),
(None, 'style'),
(None, 'summary'),
(None, 'suppress'),
(None, 'tabindex'),
(None, 'target'),
(None, 'template'),
(None, 'title'),
(None, 'toppadding'),
(None, 'type'),
(None, 'unselectable'),
(None, 'usemap'),
(None, 'urn'),
(None, 'valign'),
(None, 'value'),
(None, 'variable'),
(None, 'volume'),
(None, 'vspace'),
(None, 'vrml'),
(None, 'width'),
(None, 'wrap'),
(namespaces['xml'], 'lang'),
# MathML attributes
(None, 'actiontype'),
(None, 'align'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnalign'),
(None, 'columnlines'),
(None, 'columnspacing'),
(None, 'columnspan'),
(None, 'depth'),
(None, 'display'),
(None, 'displaystyle'),
(None, 'equalcolumns'),
(None, 'equalrows'),
(None, 'fence'),
(None, 'fontstyle'),
(None, 'fontweight'),
(None, 'frame'),
(None, 'height'),
(None, 'linethickness'),
(None, 'lspace'),
(None, 'mathbackground'),
(None, 'mathcolor'),
(None, 'mathvariant'),
(None, 'mathvariant'),
(None, 'maxsize'),
(None, 'minsize'),
(None, 'other'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowalign'),
(None, 'rowlines'),
(None, 'rowspacing'),
(None, 'rowspan'),
(None, 'rspace'),
(None, 'scriptlevel'),
(None, 'selection'),
(None, 'separator'),
(None, 'stretchy'),
(None, 'width'),
(None, 'width'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'type'),
# SVG attributes
(None, 'accent-height'),
(None, 'accumulate'),
(None, 'additive'),
(None, 'alphabetic'),
(None, 'arabic-form'),
(None, 'ascent'),
(None, 'attributeName'),
(None, 'attributeType'),
(None, 'baseProfile'),
(None, 'bbox'),
(None, 'begin'),
(None, 'by'),
(None, 'calcMode'),
(None, 'cap-height'),
(None, 'class'),
(None, 'clip-path'),
(None, 'color'),
(None, 'color-rendering'),
(None, 'content'),
(None, 'cx'),
(None, 'cy'),
(None, 'd'),
(None, 'dx'),
(None, 'dy'),
(None, 'descent'),
(None, 'display'),
(None, 'dur'),
(None, 'end'),
(None, 'fill'),
(None, 'fill-opacity'),
(None, 'fill-rule'),
(None, 'font-family'),
(None, 'font-size'),
(None, 'font-stretch'),
(None, 'font-style'),
(None, 'font-variant'),
(None, 'font-weight'),
(None, 'from'),
(None, 'fx'),
(None, 'fy'),
(None, 'g1'),
(None, 'g2'),
(None, 'glyph-name'),
(None, 'gradientUnits'),
(None, 'hanging'),
(None, 'height'),
(None, 'horiz-adv-x'),
(None, 'horiz-origin-x'),
(None, 'id'),
(None, 'ideographic'),
(None, 'k'),
(None, 'keyPoints'),
(None, 'keySplines'),
(None, 'keyTimes'),
(None, 'lang'),
(None, 'marker-end'),
(None, 'marker-mid'),
(None, 'marker-start'),
(None, 'markerHeight'),
(None, 'markerUnits'),
(None, 'markerWidth'),
(None, 'mathematical'),
(None, 'max'),
(None, 'min'),
(None, 'name'),
(None, 'offset'),
(None, 'opacity'),
(None, 'orient'),
(None, 'origin'),
(None, 'overline-position'),
(None, 'overline-thickness'),
(None, 'panose-1'),
(None, 'path'),
(None, 'pathLength'),
(None, 'points'),
(None, 'preserveAspectRatio'),
(None, 'r'),
(None, 'refX'),
(None, 'refY'),
(None, 'repeatCount'),
(None, 'repeatDur'),
(None, 'requiredExtensions'),
(None, 'requiredFeatures'),
(None, 'restart'),
(None, 'rotate'),
(None, 'rx'),
(None, 'ry'),
(None, 'slope'),
(None, 'stemh'),
(None, 'stemv'),
(None, 'stop-color'),
(None, 'stop-opacity'),
(None, 'strikethrough-position'),
(None, 'strikethrough-thickness'),
(None, 'stroke'),
(None, 'stroke-dasharray'),
(None, 'stroke-dashoffset'),
(None, 'stroke-linecap'),
(None, 'stroke-linejoin'),
(None, 'stroke-miterlimit'),
(None, 'stroke-opacity'),
(None, 'stroke-width'),
(None, 'systemLanguage'),
(None, 'target'),
(None, 'text-anchor'),
(None, 'to'),
(None, 'transform'),
(None, 'type'),
(None, 'u1'),
(None, 'u2'),
(None, 'underline-position'),
(None, 'underline-thickness'),
(None, 'unicode'),
(None, 'unicode-range'),
(None, 'units-per-em'),
(None, 'values'),
(None, 'version'),
(None, 'viewBox'),
(None, 'visibility'),
(None, 'width'),
(None, 'widths'),
(None, 'x'),
(None, 'x-height'),
(None, 'x1'),
(None, 'x2'),
(namespaces['xlink'], 'actuate'),
(namespaces['xlink'], 'arcrole'),
(namespaces['xlink'], 'href'),
(namespaces['xlink'], 'role'),
(namespaces['xlink'], 'show'),
(namespaces['xlink'], 'title'),
(namespaces['xlink'], 'type'),
(namespaces['xml'], 'base'),
(namespaces['xml'], 'lang'),
(namespaces['xml'], 'space'),
(None, 'y'),
(None, 'y1'),
(None, 'y2'),
(None, 'zoomAndPan'),
))
attr_val_is_uri = frozenset((
(None, 'href'),
(None, 'src'),
(None, 'cite'),
(None, 'action'),
(None, 'longdesc'),
(None, 'poster'),
(None, 'background'),
(None, 'datasrc'),
(None, 'dynsrc'),
(None, 'lowsrc'),
(None, 'ping'),
(namespaces['xlink'], 'href'),
(namespaces['xml'], 'base'),
))
svg_attr_val_allows_ref = frozenset((
(None, 'clip-path'),
(None, 'color-profile'),
(None, 'cursor'),
(None, 'fill'),
(None, 'filter'),
(None, 'marker'),
(None, 'marker-start'),
(None, 'marker-mid'),
(None, 'marker-end'),
(None, 'mask'),
(None, 'stroke'),
))
svg_allow_local_href = frozenset((
(None, 'altGlyph'),
(None, 'animate'),
(None, 'animateColor'),
(None, 'animateMotion'),
(None, 'animateTransform'),
(None, 'cursor'),
(None, 'feImage'),
(None, 'filter'),
(None, 'linearGradient'),
(None, 'pattern'),
(None, 'radialGradient'),
(None, 'textpath'),
(None, 'tref'),
(None, 'set'),
(None, 'use')
))
allowed_css_properties = frozenset((
'azimuth',
'background-color',
'border-bottom-color',
'border-collapse',
'border-color',
'border-left-color',
'border-right-color',
'border-top-color',
'clear',
'color',
'cursor',
'direction',
'display',
'elevation',
'float',
'font',
'font-family',
'font-size',
'font-style',
'font-variant',
'font-weight',
'height',
'letter-spacing',
'line-height',
'overflow',
'pause',
'pause-after',
'pause-before',
'pitch',
'pitch-range',
'richness',
'speak',
'speak-header',
'speak-numeral',
'speak-punctuation',
'speech-rate',
'stress',
'text-align',
'text-decoration',
'text-indent',
'unicode-bidi',
'vertical-align',
'voice-family',
'volume',
'white-space',
'width',
))
allowed_css_keywords = frozenset((
'auto',
'aqua',
'black',
'block',
'blue',
'bold',
'both',
'bottom',
'brown',
'center',
'collapse',
'dashed',
'dotted',
'fuchsia',
'gray',
'green',
'!important',
'italic',
'left',
'lime',
'maroon',
'medium',
'none',
'navy',
'normal',
'nowrap',
'olive',
'pointer',
'purple',
'red',
'right',
'solid',
'silver',
'teal',
'top',
'transparent',
'underline',
'white',
'yellow',
))
allowed_svg_properties = frozenset((
'fill',
'fill-opacity',
'fill-rule',
'stroke',
'stroke-width',
'stroke-linecap',
'stroke-linejoin',
'stroke-opacity',
))
allowed_protocols = frozenset((
'ed2k',
'ftp',
'http',
'https',
'irc',
'mailto',
'news',
'gopher',
'nntp',
'telnet',
'webcal',
'xmpp',
'callto',
'feed',
'urn',
'aim',
'rsync',
'tag',
'ssh',
'sftp',
'rtsp',
'afs',
'data',
))
allowed_content_types = frozenset((
'image/png',
'image/jpeg',
'image/gif',
'image/webp',
'image/bmp',
'text/plain',
))
data_content_type = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
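A quick illustration (not part of the commit; it assumes the module path html5lib/filters/sanitizer.py that this diff adds) of what the data_content_type pattern accepts when applied to the path of a parsed data: URI:
# Illustrative only: allowed_token() matches this pattern against
# urlparse("data:...").path before consulting allowed_content_types.
from html5lib.filters.sanitizer import data_content_type
m = data_content_type.match("image/png;base64,iVBORw0KGgo=")
assert m and m.group("content_type") == "image/png"
# text/html matches the pattern, but is dropped later because it is not
# listed in allowed_content_types.
m = data_content_type.match("text/html;charset=utf-8,<p>hi</p>")
assert m and m.group("content_type") == "text/html"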
class Filter(base.Filter):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
def __init__(self,
source,
allowed_elements=allowed_elements,
allowed_attributes=allowed_attributes,
allowed_css_properties=allowed_css_properties,
allowed_css_keywords=allowed_css_keywords,
allowed_svg_properties=allowed_svg_properties,
allowed_protocols=allowed_protocols,
allowed_content_types=allowed_content_types,
attr_val_is_uri=attr_val_is_uri,
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
svg_allow_local_href=svg_allow_local_href):
super(Filter, self).__init__(source)
self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties
self.allowed_css_keywords = allowed_css_keywords
self.allowed_svg_properties = allowed_svg_properties
self.allowed_protocols = allowed_protocols
self.allowed_content_types = allowed_content_types
self.attr_val_is_uri = attr_val_is_uri
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
self.svg_allow_local_href = svg_allow_local_href
def __iter__(self): def __iter__(self):
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
token = self.sanitize_token(token) token = self.sanitize_token(token)
if token: if token:
yield token yield token
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
# are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
# ALLOWED_CSS_KEYWORDS, are allowed through. Attributes in ATTR_VAL_IS_URI
# are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in ("StartTag", "EndTag", "EmptyTag"):
name = token["name"]
namespace = token["namespace"]
if ((namespace, name) in self.allowed_elements or
(namespace is None and
(namespaces["html"], name) in self.allowed_elements)):
return self.allowed_token(token)
else:
return self.disallowed_token(token)
elif token_type == "Comment":
pass
else:
return token
def allowed_token(self, token):
if "data" in token:
attrs = token["data"]
attr_names = set(attrs.keys())
# Remove forbidden attributes
for to_remove in (attr_names - self.allowed_attributes):
del token["data"][to_remove]
attr_names.remove(to_remove)
# Remove attributes with disallowed URL values
for attr in (attr_names & self.attr_val_is_uri):
assert attr in attrs
# I don't have a clue where this regexp comes from or why it matches those
# characters, nor why we call unescape. I just know it's always been here.
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
# this will do is remove *more* than it otherwise would.
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = data_content_type.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
(namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
attrs[(namespaces['xlink'], 'href')])):
del attrs[(namespaces['xlink'], 'href')]
if (None, 'style') in attrs:
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
token["data"] = attrs
return token
def disallowed_token(self, token):
token_type = token["type"]
if token_type == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
assert token_type in ("StartTag", "EmptyTag")
attrs = []
for (ns, name), v in token["data"].items():
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.allowed_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
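For orientation, a minimal usage sketch of the rewritten sanitizer filter (illustrative only, not part of the commit; it assumes the getTreeWalker and HTMLSerializer APIs that html5lib already exports):
# Parse, walk the tree as a token stream, sanitize, then re-serialize.
import html5lib
from html5lib.filters.sanitizer import Filter as SanitizerFilter
from html5lib.serializer import HTMLSerializer
def clean(fragment):
    dom = html5lib.parseFragment(fragment, treebuilder="etree")
    walker = html5lib.getTreeWalker("etree")
    return HTMLSerializer().render(SanitizerFilter(walker(dom)))
# <script> is not in allowed_elements, so it is emitted as escaped text;
# the javascript: href is removed because its scheme is not in allowed_protocols.
print(clean('<a href="javascript:alert(1)">hi</a><script>x()</script>'))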


@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
import re import re
from . import _base from . import base
from ..constants import rcdataElements, spaceCharacters from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters) SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
class Filter(_base.Filter): class Filter(base.Filter):
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self): def __iter__(self):
preserve = 0 preserve = 0
for token in _base.Filter.__iter__(self): for token in base.Filter.__iter__(self):
type = token["type"] type = token["type"]
if type == "StartTag" \ if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements): and (preserve or token["name"] in self.spacePreserveElements):


@ -1,39 +1,44 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass from six import with_metaclass, viewkeys, PY3
import types import types
from . import inputstream try:
from . import tokenizer from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from . import _inputstream
from . import _tokenizer
from . import treebuilders from . import treebuilders
from .treebuilders._base import Marker from .treebuilders.base import Marker
from . import utils from . import _utils
from . import constants from .constants import (
from .constants import spaceCharacters, asciiUpper2Lower spaceCharacters, asciiUpper2Lower,
from .constants import specialElements specialElements, headingElements, cdataElements, rcdataElements,
from .constants import headingElements tokenTypes, tagTokenTypes,
from .constants import cdataElements, rcdataElements namespaces,
from .constants import tokenTypes, ReparseException, namespaces htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements adjustForeignAttributes as adjustForeignAttributesMap,
from .constants import adjustForeignAttributes as adjustForeignAttributesMap adjustMathMLAttributes, adjustSVGAttributes,
from .constants import E E,
ReparseException
)
def parse(doc, treebuilder="etree", encoding=None, def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
namespaceHTMLElements=True):
"""Parse a string or file-like object into a tree""" """Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder) tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding) return p.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", encoding=None, def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
namespaceHTMLElements=True):
tb = treebuilders.getTreeBuilder(treebuilder) tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, encoding=encoding) return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function): def method_decorator_metaclass(function):
@ -52,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly """HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML""" malformed) HTML"""
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer, def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
strict=False, namespaceHTMLElements=True, debug=False):
""" """
strict - raise an exception when a parse error is encountered strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType) html5lib.treebuilders.getTreeBuilder(treeType)
tokenizer - a class that provides a stream of tokens to the treebuilder.
This may be replaced for e.g. a sanitizer which converts some tags to
text
""" """
# Raise an exception on the first error encountered # Raise an exception on the first error encountered
@ -72,29 +72,24 @@ class HTMLParser(object):
if tree is None: if tree is None:
tree = treebuilders.getTreeBuilder("etree") tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements) self.tree = tree(namespaceHTMLElements)
self.tokenizer_class = tokenizer
self.errors = [] self.errors = []
self.phases = dict([(name, cls(self, self.tree)) for name, cls in self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()]) getPhases(debug).items()])
def _parse(self, stream, innerHTML=False, container="div", def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
encoding=None, parseMeta=True, useChardet=True, **kwargs):
self.innerHTMLMode = innerHTML self.innerHTMLMode = innerHTML
self.container = container self.container = container
self.tokenizer = self.tokenizer_class(stream, encoding=encoding, self.scripting = scripting
parseMeta=parseMeta, self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
useChardet=useChardet,
parser=self, **kwargs)
self.reset() self.reset()
while True: try:
try: self.mainLoop()
self.mainLoop() except ReparseException:
break self.reset()
except ReparseException: self.mainLoop()
self.reset()
def reset(self): def reset(self):
self.tree.reset() self.tree.reset()
@ -121,7 +116,7 @@ class HTMLParser(object):
self.phase.insertHtmlElement() self.phase.insertHtmlElement()
self.resetInsertionMode() self.resetInsertionMode()
else: else:
self.innerHTML = False self.innerHTML = False # pylint:disable=redefined-variable-type
self.phase = self.phases["initial"] self.phase = self.phases["initial"]
self.lastPhase = None self.lastPhase = None
@ -139,7 +134,7 @@ class HTMLParser(object):
""" """
if not hasattr(self, 'tokenizer'): if not hasattr(self, 'tokenizer'):
return None return None
return self.tokenizer.stream.charEncoding[0] return self.tokenizer.stream.charEncoding[0].name
def isHTMLIntegrationPoint(self, element): def isHTMLIntegrationPoint(self, element):
if (element.name == "annotation-xml" and if (element.name == "annotation-xml" and
@ -164,8 +159,10 @@ class HTMLParser(object):
ParseErrorToken = tokenTypes["ParseError"] ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens(): for token in self.normalizedTokens():
prev_token = None
new_token = token new_token = token
while new_token is not None: while new_token is not None:
prev_token = new_token
currentNode = self.tree.openElements[-1] if self.tree.openElements else None currentNode = self.tree.openElements[-1] if self.tree.openElements else None
currentNodeNamespace = currentNode.namespace if currentNode else None currentNodeNamespace = currentNode.namespace if currentNode else None
currentNodeName = currentNode.name if currentNode else None currentNodeName = currentNode.name if currentNode else None
@ -184,6 +181,7 @@ class HTMLParser(object):
type in (CharactersToken, SpaceCharactersToken))) or type in (CharactersToken, SpaceCharactersToken))) or
(currentNodeNamespace == namespaces["mathml"] and (currentNodeNamespace == namespaces["mathml"] and
currentNodeName == "annotation-xml" and currentNodeName == "annotation-xml" and
type == StartTagToken and
token["name"] == "svg") or token["name"] == "svg") or
(self.isHTMLIntegrationPoint(currentNode) and (self.isHTMLIntegrationPoint(currentNode) and
type in (StartTagToken, CharactersToken, SpaceCharactersToken))): type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
@ -204,10 +202,10 @@ class HTMLParser(object):
elif type == DoctypeToken: elif type == DoctypeToken:
new_token = phase.processDoctype(new_token) new_token = phase.processDoctype(new_token)
if (type == StartTagToken and token["selfClosing"] if (type == StartTagToken and prev_token["selfClosing"] and
and not token["selfClosingAcknowledged"]): not prev_token["selfClosingAcknowledged"]):
self.parseError("non-void-element-with-trailing-solidus", self.parseError("non-void-element-with-trailing-solidus",
{"name": token["name"]}) {"name": prev_token["name"]})
# When the loop finishes it's EOF # When the loop finishes it's EOF
reprocess = True reprocess = True
@ -222,7 +220,7 @@ class HTMLParser(object):
for token in self.tokenizer: for token in self.tokenizer:
yield self.normalizeToken(token) yield self.normalizeToken(token)
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True): def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree """Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed stream - a filelike object or string containing the HTML to be parsed
@ -231,13 +229,13 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used, the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
scripting - treat noscript elements as if javascript was turned on
""" """
self._parse(stream, innerHTML=False, encoding=encoding, self._parse(stream, False, None, *args, **kwargs)
parseMeta=parseMeta, useChardet=useChardet)
return self.tree.getDocument() return self.tree.getDocument()
def parseFragment(self, stream, container="div", encoding=None, def parseFragment(self, stream, *args, **kwargs):
parseMeta=False, useChardet=True):
"""Parse a HTML fragment into a well-formed tree fragment """Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property container - name of the element we're setting the innerHTML property
@ -249,12 +247,16 @@ class HTMLParser(object):
the encoding. If specified, that encoding will be used, the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta regardless of any BOM or later declaration (such as in a meta
element) element)
scripting - treat noscript elements as if javascript was turned on
""" """
self._parse(stream, True, container=container, encoding=encoding) self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment() return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars={}): def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory. # XXX The idea is to make errorcode mandatory.
if datavars is None:
datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict: if self.strict:
raise ParseError(E[errorcode] % datavars) raise ParseError(E[errorcode] % datavars)
@ -263,98 +265,25 @@ class HTMLParser(object):
""" HTML5 specific normalizations to the token stream """ """ HTML5 specific normalizations to the token stream """
if token["type"] == tokenTypes["StartTag"]: if token["type"] == tokenTypes["StartTag"]:
token["data"] = dict(token["data"][::-1]) raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
return token return token
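To make the "first wins" handling above concrete, a standalone sketch (illustrative only; attribute names are simplified to plain strings here):
from collections import OrderedDict
raw = [("href", "first"), ("class", "x"), ("href", "second")]
data = OrderedDict(raw)        # last occurrence wins at this point: href -> "second"
if len(raw) > len(data):
    data.update(raw[::-1])     # re-applying in reverse restores the first value
assert data["href"] == "first" and list(data) == ["href", "class"]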
def adjustMathMLAttributes(self, token): def adjustMathMLAttributes(self, token):
replacements = {"definitionurl": "definitionURL"} adjust_attributes(token, adjustMathMLAttributes)
for k, v in replacements.items():
if k in token["data"]:
token["data"][v] = token["data"][k]
del token["data"][k]
def adjustSVGAttributes(self, token): def adjustSVGAttributes(self, token):
replacements = { adjust_attributes(token, adjustSVGAttributes)
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan"
}
for originalName in list(token["data"].keys()):
if originalName in replacements:
svgName = replacements[originalName]
token["data"][svgName] = token["data"][originalName]
del token["data"][originalName]
def adjustForeignAttributes(self, token): def adjustForeignAttributes(self, token):
replacements = adjustForeignAttributesMap adjust_attributes(token, adjustForeignAttributesMap)
for originalName in token["data"].keys():
if originalName in replacements:
foreignName = replacements[originalName]
token["data"][foreignName] = token["data"][originalName]
del token["data"][originalName]
def reparseTokenNormal(self, token): def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
self.parser.phase() self.parser.phase()
def resetInsertionMode(self): def resetInsertionMode(self):
@ -419,11 +348,12 @@ class HTMLParser(object):
self.phase = self.phases["text"] self.phase = self.phases["text"]
@_utils.memoize
def getPhases(debug): def getPhases(debug):
def log(function): def log(function):
"""Logger that records which phase processes each token""" """Logger that records which phase processes each token"""
type_names = dict((value, key) for key, value in type_names = dict((value, key) for key, value in
constants.tokenTypes.items()) tokenTypes.items())
def wrapped(self, *args, **kwargs): def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0: if function.__name__.startswith("process") and len(args) > 0:
@ -432,7 +362,7 @@ def getPhases(debug):
info = {"type": type_names[token['type']]} info = {"type": type_names[token['type']]}
except: except:
raise raise
if token['type'] in constants.tagTokenTypes: if token['type'] in tagTokenTypes:
info["name"] = token['name'] info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__, self.parser.log.append((self.parser.tokenizer.state.__name__,
@ -451,6 +381,7 @@ def getPhases(debug):
else: else:
return type return type
# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))): class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing """Base class for helper object that implements each phase of processing
""" """
@ -517,77 +448,76 @@ def getPhases(debug):
if publicId != "": if publicId != "":
publicId = publicId.translate(asciiUpper2Lower) publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html" if (not correct or token["name"] != "html" or
or publicId.startswith( publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//", ("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//", "-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//", "-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//", "-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//", "-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//", "-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//", "-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//", "-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//", "-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//", "-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//", "-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//", "-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//", "-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//", "-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//", "-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//", "-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//", "-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//", "-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//", "-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//", "-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//", "-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//", "-//ietf//dtd html strict//",
"-//ietf//dtd html//", "-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//", "-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//", "-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//", "-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//", "-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//", "-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//", "-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//", "-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//", "-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//", "-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//", "-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//", "-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//", "-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//", "-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//", "-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//", "-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//", "-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//", "-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//", "-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//", "-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//", "-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//", "-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//", "-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//", "-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//", "-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//", "-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//", "-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//", "-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//", "-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) "-//webtechs//dtd mozilla html//")) or
or publicId in publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
("-//w3o//dtd w3 html strict 3.0//en//", "-/w3c/dtd html 4.0 transitional/en",
"-/w3c/dtd html 4.0 transitional/en", "html") or
"html") publicId.startswith(
or publicId.startswith( ("-//w3c//dtd html 4.01 frameset//",
("-//w3c//dtd html 4.01 frameset//", "-//w3c//dtd html 4.01 transitional//")) and
"-//w3c//dtd html 4.01 transitional//")) and systemId is None or
systemId is None systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks" self.parser.compatMode = "quirks"
elif (publicId.startswith( elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//", ("-//w3c//dtd xhtml 1.0 frameset//",
"-//w3c//dtd xhtml 1.0 transitional//")) "-//w3c//dtd xhtml 1.0 transitional//")) or
or publicId.startswith( publicId.startswith(
("-//w3c//dtd html 4.01 frameset//", ("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and "-//w3c//dtd html 4.01 transitional//")) and
systemId is not None): systemId is not None):
@ -660,13 +590,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("head", self.startTagHead) ("head", self.startTagHead)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead) (("head", "body", "html", "br"), self.endTagImplyHead)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -706,10 +636,11 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("title", self.startTagTitle), ("title", self.startTagTitle),
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle), (("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
("script", self.startTagScript), ("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"), (("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand), self.startTagBaseLinkCommand),
@ -718,7 +649,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self. endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead), ("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr) (("br", "html", "body"), self.endTagHtmlBodyBr)
]) ])
@ -760,18 +691,25 @@ def getPhases(debug):
# the abstract Unicode string, and just use the # the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars # ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works. # to be encoded and as a ASCII-superset works.
data = inputstream.EncodingBytes(attributes["content"].encode("utf-8")) data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = inputstream.ContentAttrParser(data) parser = _inputstream.ContentAttrParser(data)
codec = parser.parse() codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec) self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token): def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA") self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoScriptNoFramesStyle(self, token): def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case # Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT") self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
def startTagScript(self, token): def startTagScript(self, token):
self.tree.insertElement(token) self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
@ -797,15 +735,75 @@ def getPhases(debug):
def anythingElse(self): def anythingElse(self):
self.endTagHead(impliedTagToken("head")) self.endTagHead(impliedTagToken("head"))
# XXX If we implement a parser for which scripting is disabled we need to class InHeadNoscriptPhase(Phase):
# implement this phase. def __init__(self, parser, tree):
# Phase.__init__(self, parser, tree)
# class InHeadNoScriptPhase(Phase):
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
class AfterHeadPhase(Phase): class AfterHeadPhase(Phase):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("body", self.startTagBody), ("body", self.startTagBody),
("frameset", self.startTagFrameset), ("frameset", self.startTagFrameset),
@ -815,8 +813,8 @@ def getPhases(debug):
("head", self.startTagHead) ("head", self.startTagHead)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)]) self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
@ -874,10 +872,10 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
# Keep a ref to this for special handling of whitespace in <pre> # Set this to the default handler
self.processSpaceCharactersNonPre = self.processSpaceCharacters self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta", (("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"), "script", "style", "title"),
@ -885,7 +883,7 @@ def getPhases(debug):
("body", self.startTagBody), ("body", self.startTagBody),
("frameset", self.startTagFrameset), ("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details", (("address", "article", "aside", "blockquote", "center", "details",
"details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"), "section", "summary", "ul"),
self.startTagCloseP), self.startTagCloseP),
@ -911,7 +909,8 @@ def getPhases(debug):
("isindex", self.startTagIsIndex), ("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea), ("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame), ("iframe", self.startTagIFrame),
(("noembed", "noframes", "noscript"), self.startTagRawtext), ("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect), ("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt), (("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt), (("option", "optgroup"), self.startTagOpt),
@ -923,7 +922,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody), ("body", self.endTagBody),
("html", self.endTagHtml), ("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center", (("address", "article", "aside", "blockquote", "button", "center",
@ -942,17 +941,9 @@ def getPhases(debug):
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2): def isMatchingFormattingElement(self, node1, node2):
if node1.name != node2.name or node1.namespace != node2.namespace: return (node1.name == node2.name and
return False node1.namespace == node2.namespace and
elif len(node1.attributes) != len(node2.attributes): node1.attributes == node2.attributes)
return False
else:
attributes1 = sorted(node1.attributes.items())
attributes2 = sorted(node2.attributes.items())
for attr1, attr2 in zip(attributes1, attributes2):
if attr1 != attr2:
return False
return True
# helper # helper
def addFormattingElement(self, token): def addFormattingElement(self, token):
@ -988,8 +979,8 @@ def getPhases(debug):
data = token["data"] data = token["data"]
self.processSpaceCharacters = self.processSpaceCharactersNonPre self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and if (data.startswith("\n") and
self.tree.openElements[-1].name in ("pre", "listing", "textarea") self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
and not self.tree.openElements[-1].hasContent()): not self.tree.openElements[-1].hasContent()):
data = data[1:] data = data[1:]
if data: if data:
self.tree.reconstructActiveFormattingElements() self.tree.reconstructActiveFormattingElements()
@ -1007,7 +998,7 @@ def getPhases(debug):
for char in token["data"]])): for char in token["data"]])):
self.parser.framesetOK = False self.parser.framesetOK = False
def processSpaceCharacters(self, token): def processSpaceCharactersNonPre(self, token):
self.tree.reconstructActiveFormattingElements() self.tree.reconstructActiveFormattingElements()
self.tree.insertText(token["data"]) self.tree.insertText(token["data"])
@ -1016,8 +1007,8 @@ def getPhases(debug):
def startTagBody(self, token): def startTagBody(self, token):
self.parser.parseError("unexpected-start-tag", {"name": "body"}) self.parser.parseError("unexpected-start-tag", {"name": "body"})
if (len(self.tree.openElements) == 1 if (len(self.tree.openElements) == 1 or
or self.tree.openElements[1].name != "body"): self.tree.openElements[1].name != "body"):
assert self.parser.innerHTML assert self.parser.innerHTML
else: else:
self.parser.framesetOK = False self.parser.framesetOK = False
@ -1232,6 +1223,12 @@ def getPhases(debug):
self.parser.framesetOK = False self.parser.framesetOK = False
self.startTagRawtext(token) self.startTagRawtext(token)
def startTagNoscript(self, token):
if self.parser.scripting:
self.startTagRawtext(token)
else:
self.startTagOther(token)
def startTagRawtext(self, token): def startTagRawtext(self, token):
"""iframe, noembed noframes, noscript(if scripting enabled)""" """iframe, noembed noframes, noscript(if scripting enabled)"""
self.parser.parseRCDataRawtext(token, "RAWTEXT") self.parser.parseRCDataRawtext(token, "RAWTEXT")
@ -1595,9 +1592,9 @@ def getPhases(debug):
class TextPhase(Phase): class TextPhase(Phase):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([]) self.startTagHandler = _utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("script", self.endTagScript)]) ("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -1629,7 +1626,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table # http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("caption", self.startTagCaption), ("caption", self.startTagCaption),
("colgroup", self.startTagColgroup), ("colgroup", self.startTagColgroup),
@ -1643,7 +1640,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("table", self.endTagTable), ("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td", (("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore) "tfoot", "th", "thead", "tr"), self.endTagIgnore)
@ -1820,14 +1817,14 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement) "thead", "tr"), self.startTagTableElement)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("caption", self.endTagCaption), ("caption", self.endTagCaption),
("table", self.endTagTable), ("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
@ -1892,13 +1889,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("col", self.startTagCol) ("col", self.startTagCol)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("colgroup", self.endTagColgroup), ("colgroup", self.endTagColgroup),
("col", self.endTagCol) ("col", self.endTagCol)
]) ])
@ -1926,6 +1923,7 @@ def getPhases(debug):
def startTagCol(self, token): def startTagCol(self, token):
self.tree.insertElement(token) self.tree.insertElement(token)
self.tree.openElements.pop() self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagOther(self, token): def startTagOther(self, token):
ignoreEndTag = self.ignoreEndTagColgroup() ignoreEndTag = self.ignoreEndTagColgroup()
@ -1955,7 +1953,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("tr", self.startTagTr), ("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell), (("td", "th"), self.startTagTableCell),
@ -1964,7 +1962,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable), ("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th", (("body", "caption", "col", "colgroup", "html", "td", "th",
@ -2053,7 +2051,7 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row # http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell), (("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead", (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
@ -2061,7 +2059,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("tr", self.endTagTr), ("tr", self.endTagTr),
("table", self.endTagTable), ("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup), (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
@ -2142,14 +2140,14 @@ def getPhases(debug):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther) "thead", "tr"), self.startTagTableOther)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("td", "th"), self.endTagTableCell), (("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore), (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply) (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
@ -2218,7 +2216,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("option", self.startTagOption), ("option", self.startTagOption),
("optgroup", self.startTagOptgroup), ("optgroup", self.startTagOptgroup),
@ -2228,7 +2226,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("option", self.endTagOption), ("option", self.endTagOption),
("optgroup", self.endTagOptgroup), ("optgroup", self.endTagOptgroup),
("select", self.endTagSelect) ("select", self.endTagSelect)
@ -2318,13 +2316,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.startTagTable) self.startTagTable)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
self.endTagTable) self.endTagTable)
]) ])
@ -2445,7 +2443,7 @@ def getPhases(debug):
def processEndTag(self, token): def processEndTag(self, token):
nodeIndex = len(self.tree.openElements) - 1 nodeIndex = len(self.tree.openElements) - 1
node = self.tree.openElements[-1] node = self.tree.openElements[-1]
if node.name != token["name"]: if node.name.translate(asciiUpper2Lower) != token["name"]:
self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
while True: while True:
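The case-insensitive comparison above stops the foreign-content phase from logging a spurious error when a lower-cased end tag closes a camelCase SVG or MathML element. A quick check using only the public API:

    import html5lib

    parser = html5lib.HTMLParser()
    parser.parse("<svg><foreignObject></foreignObject></svg>")
    codes = [code for _, code, _ in parser.errors]
    # with this change, "unexpected-end-tag" is no longer reported for foreignObject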
@ -2472,12 +2470,12 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml) ("html", self.startTagHtml)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)]) self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
@ -2520,7 +2518,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("frameset", self.startTagFrameset), ("frameset", self.startTagFrameset),
("frame", self.startTagFrame), ("frame", self.startTagFrame),
@ -2528,7 +2526,7 @@ def getPhases(debug):
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("frameset", self.endTagFrameset) ("frameset", self.endTagFrameset)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -2577,13 +2575,13 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("noframes", self.startTagNoframes) ("noframes", self.startTagNoframes)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = _utils.MethodDispatcher([
("html", self.endTagHtml) ("html", self.endTagHtml)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -2613,7 +2611,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml) ("html", self.startTagHtml)
]) ])
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
@ -2651,7 +2649,7 @@ def getPhases(debug):
def __init__(self, parser, tree): def __init__(self, parser, tree):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("noframes", self.startTagNoFrames) ("noframes", self.startTagNoFrames)
]) ])
@ -2682,13 +2680,14 @@ def getPhases(debug):
def processEndTag(self, token): def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag", self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]}) {"name": token["name"]})
# pylint:enable=unused-argument
return { return {
"initial": InitialPhase, "initial": InitialPhase,
"beforeHtml": BeforeHtmlPhase, "beforeHtml": BeforeHtmlPhase,
"beforeHead": BeforeHeadPhase, "beforeHead": BeforeHeadPhase,
"inHead": InHeadPhase, "inHead": InHeadPhase,
# XXX "inHeadNoscript": InHeadNoScriptPhase, "inHeadNoscript": InHeadNoscriptPhase,
"afterHead": AfterHeadPhase, "afterHead": AfterHeadPhase,
"inBody": InBodyPhase, "inBody": InBodyPhase,
"text": TextPhase, "text": TextPhase,
@ -2711,6 +2710,16 @@ def getPhases(debug):
} }
def adjust_attributes(token, replacements):
if PY3 or _utils.PY27:
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
else:
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v)
for k, v in token['data'].items())
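The new adjust_attributes helper rebuilds a token's attribute dict, renaming any key found in the replacement table while keeping order. A small illustration with a hand-rolled table; the real tables live in constants (for example adjustSVGAttributes):

    from collections import OrderedDict

    token = {"data": OrderedDict([("viewbox", "0 0 10 10"), ("fill", "red")])}
    replacements = {"viewbox": "viewBox"}   # illustrative subset only

    # same logic as adjust_attributes, shown on the Python 2 frozenset branch
    if frozenset(token["data"]) & frozenset(replacements):
        token["data"] = OrderedDict((replacements.get(k, k), v)
                                    for k, v in token["data"].items())

    # token["data"] is now OrderedDict([("viewBox", "0 0 10 10"), ("fill", "red")])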
def impliedTagToken(name, type="EndTag", attributes=None, def impliedTagToken(name, type="EndTag", attributes=None,
selfClosing=False): selfClosing=False):
if attributes is None: if attributes is None:
View file
@ -1,300 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
import re
from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse
from .tokenizer import HTMLTokenizer
from .constants import tokenTypes
content_type_rgx = re.compile(r'''
^
# Match a content type <application>/<type>
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
# Match any character set and encoding
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
# Assume the rest is data
,.*
$
''',
re.VERBOSE)
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs', 'data']
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
allowed_content_types = acceptable_content_types
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
# attributes are parsed, and a restricted set, # specified by
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
# in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in list(tokenTypes.keys()):
token_type = tokenTypes[token_type]
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
return self.allowed_token(token, token_type)
else:
return self.disallowed_token(token, token_type)
elif token_type == tokenTypes["Comment"]:
pass
else:
return token
def allowed_token(self, token, token_type):
if "data" in token:
attrs = dict([(name, val) for name, val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if attr not in attrs:
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
# remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace("\ufffd", "")
try:
uri = urlparse.urlparse(val_unescaped)
except ValueError:
uri = None
del attrs[attr]
if uri and uri.scheme:
if uri.scheme not in self.allowed_protocols:
del attrs[attr]
if uri.scheme == 'data':
m = content_type_rgx.match(uri.path)
if not m:
del attrs[attr]
elif m.group('content_type') not in self.allowed_content_types:
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if 'style' in attrs:
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name, val] for name, val in list(attrs.items())]
return token
def disallowed_token(self, token, token_type):
if token_type == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
token["data"] = "<%s%s>" % (token["name"], attrs)
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"] = token["data"][:-1] + "/>"
if token["type"] in list(tokenTypes.keys()):
token["type"] = "Characters"
else:
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
def sanitize_css(self, style):
# disallow urls
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
return ''
clean = []
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
if not value:
continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
'padding']:
for keyword in value.split():
if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
# Change case matching defaults as we only output lowercase html anyway
# This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
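The tokenizer-level HTMLSanitizer above is removed by this update; sanitization now happens as a tree-walker filter. A minimal sketch of the replacement usage, assuming the filters.sanitizer module and the serializer's sanitize option shown further down:

    import html5lib

    frag = html5lib.parseFragment('<p onmouseover="evil()">hi<script>bad()</script></p>')
    clean = html5lib.serialize(frag, sanitize=True)
    # the onmouseover attribute is dropped and the script element is escaped
    # rather than emitted as markup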
View file
@ -1,79 +1,87 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type from six import text_type
try: import re
from functools import reduce
except ImportError:
pass
from ..constants import voidElements, booleanAttributes, spaceCharacters from codecs import register_error, xmlcharrefreplace_errors
from ..constants import rcdataElements, entities, xmlEntities
from .. import utils from .constants import voidElements, booleanAttributes, spaceCharacters
from .constants import rcdataElements, entities, xmlEntities
from . import treewalkers, _utils
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
spaceCharacters = "".join(spaceCharacters) _quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
"\u3000]")
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
encode_entity_map = {} _encode_entity_map = {}
is_ucs4 = len("\U0010FFFF") == 1 _is_ucs4 = len("\U0010FFFF") == 1
for k, v in list(entities.items()): for k, v in list(entities.items()):
# skip multi-character entities # skip multi-character entities
if ((is_ucs4 and len(v) > 1) or if ((_is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)): (not _is_ucs4 and len(v) > 2)):
continue continue
if v != "&": if v != "&":
if len(v) == 2: if len(v) == 2:
v = utils.surrogatePairToCodepoint(v) v = _utils.surrogatePairToCodepoint(v)
else:
v = ord(v)
if v not in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else: else:
return xmlcharrefreplace_errors(exc) v = ord(v)
if v not in _encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
_encode_entity_map[v] = k
register_error(unicode_encode_errors, htmlentityreplace_errors)
del register_error def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = _encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;" % (hex(cp)[2:]))
return ("".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error("htmlentityreplace", htmlentityreplace_errors)
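Once the serializer module has been imported, the handler is registered under the fixed name "htmlentityreplace" and can be used like any other codec error handler. Roughly:

    import html5lib.serializer  # registers the error handler as an import side effect

    "caf\u00e9 \u4e2d".encode("ascii", "htmlentityreplace")
    # -> b'caf&eacute; &#x4e2d;'  (named entity where one exists, numeric reference otherwise)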
def serialize(input, tree="etree", encoding=None, **serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
s = HTMLSerializer(**serializer_opts)
return s.render(walker(input), encoding)
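The new module-level serialize() pairs a tree walker with an HTMLSerializer in one call. A short usage sketch with the default etree tree:

    import html5lib

    frag = html5lib.parseFragment("<p>Hello<br>world")
    html5lib.serialize(frag, omit_optional_tags=False)
    # -> '<p>Hello<br>world</p>' (approximately; output depends on serializer options)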
class HTMLSerializer(object): class HTMLSerializer(object):
# attribute quoting options # attribute quoting options
quote_attr_values = False quote_attr_values = "legacy" # be secure by default
quote_char = '"' quote_char = '"'
use_best_quote_char = True use_best_quote_char = True
@ -109,9 +117,9 @@ class HTMLSerializer(object):
inject_meta_charset=True|False inject_meta_charset=True|False
Whether it insert a meta element to define the character set of the Whether it insert a meta element to define the character set of the
document. document.
quote_attr_values=True|False quote_attr_values="legacy"|"spec"|"always"
Whether to quote attribute values that don't require quoting Whether to quote attribute values that don't require quoting
per HTML5 parsing rules. per legacy browser behaviour, when required by the standard, or always.
quote_char=u'"'|u"'" quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote, use double quote unless attribute value contains a double quote,
@ -147,6 +155,9 @@ class HTMLSerializer(object):
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
""" """
unexpected_args = frozenset(kwargs) - frozenset(self.options)
if len(unexpected_args) > 0:
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
if 'quote_char' in kwargs: if 'quote_char' in kwargs:
self.use_best_quote_char = False self.use_best_quote_char = False
for attr in self.options: for attr in self.options:
@ -157,7 +168,7 @@ class HTMLSerializer(object):
def encode(self, string): def encode(self, string):
assert(isinstance(string, text_type)) assert(isinstance(string, text_type))
if self.encoding: if self.encoding:
return string.encode(self.encoding, unicode_encode_errors) return string.encode(self.encoding, "htmlentityreplace")
else: else:
return string return string
@ -169,28 +180,30 @@ class HTMLSerializer(object):
return string return string
def serialize(self, treewalker, encoding=None): def serialize(self, treewalker, encoding=None):
# pylint:disable=too-many-nested-blocks
self.encoding = encoding self.encoding = encoding
in_cdata = False in_cdata = False
self.errors = [] self.errors = []
if encoding and self.inject_meta_charset: if encoding and self.inject_meta_charset:
from ..filters.inject_meta_charset import Filter from .filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding) treewalker = Filter(treewalker, encoding)
# Alphabetical attributes is here under the assumption that none of
# the later filters add or change order of attributes; it needs to be
# before the sanitizer so escaped elements come out correctly
if self.alphabetical_attributes:
from .filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker)
# WhitespaceFilter should be used before OptionalTagFilter # WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiently of this latter filter # for maximum efficiently of this latter filter
if self.strip_whitespace: if self.strip_whitespace:
from ..filters.whitespace import Filter from .filters.whitespace import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
if self.sanitize: if self.sanitize:
from ..filters.sanitizer import Filter from .filters.sanitizer import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
if self.omit_optional_tags: if self.omit_optional_tags:
from ..filters.optionaltags import Filter from .filters.optionaltags import Filter
treewalker = Filter(treewalker)
# Alphabetical attributes must be last, as other filters
# could add attributes and alter the order
if self.alphabetical_attributes:
from ..filters.alphabeticalattributes import Filter
treewalker = Filter(treewalker) treewalker = Filter(treewalker)
for token in treewalker: for token in treewalker:
@ -229,7 +242,7 @@ class HTMLSerializer(object):
in_cdata = True in_cdata = True
elif in_cdata: elif in_cdata:
self.serializeError("Unexpected child element of a CDATA element") self.serializeError("Unexpected child element of a CDATA element")
for (attr_namespace, attr_name), attr_value in token["data"].items(): for (_, attr_name), attr_value in token["data"].items():
# TODO: Add namespace support here # TODO: Add namespace support here
k = attr_name k = attr_name
v = attr_value v = attr_value
@ -237,14 +250,18 @@ class HTMLSerializer(object):
yield self.encodeStrict(k) yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \ if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) (k not in booleanAttributes.get(name, tuple()) and
and k not in booleanAttributes.get("", tuple())): k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict("=") yield self.encodeStrict("=")
if self.quote_attr_values or not v: if self.quote_attr_values == "always" or len(v) == 0:
quote_attr = True quote_attr = True
elif self.quote_attr_values == "spec":
quote_attr = _quoteAttributeSpec.search(v) is not None
elif self.quote_attr_values == "legacy":
quote_attr = _quoteAttributeLegacy.search(v) is not None
else: else:
quote_attr = reduce(lambda x, y: x or (y in v), raise ValueError("quote_attr_values must be one of: "
spaceCharacters + ">\"'=", False) "'always', 'spec', or 'legacy'")
v = v.replace("&", "&amp;") v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: if self.escape_lt_in_attrs:
v = v.replace("<", "&lt;") v = v.replace("<", "&lt;")
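The effect of the three quote_attr_values modes can be seen with a value such as a/b, which the legacy character set quotes but the spec set does not. A hedged illustration; exact output also depends on other serializer options:

    import html5lib

    frag = html5lib.parseFragment('<img title=a/b>')
    html5lib.serialize(frag, quote_attr_values="spec")    # -> '<img title=a/b>'
    html5lib.serialize(frag, quote_attr_values="legacy")  # -> '<img title="a/b">'
    html5lib.serialize(frag, quote_attr_values="always")  # -> '<img title="a/b">'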
@ -312,6 +329,6 @@ class HTMLSerializer(object):
raise SerializeError raise SerializeError
def SerializeError(Exception): class SerializeError(Exception):
"""Error in serialized tree""" """Error in serialized tree"""
pass pass
View file
@ -1,16 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from .. import treewalkers
from .htmlserializer import HTMLSerializer
def serialize(input, tree="etree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
else:
raise ValueError("type must be html")
return s.render(walker(input), encoding)
View file
@ -5,7 +5,7 @@ from . import sax
__all__ = ["sax"] __all__ = ["sax"]
try: try:
from . import genshi # flake8: noqa from . import genshi # noqa
except ImportError: except ImportError:
pass pass
else: else:
View file
@ -28,7 +28,7 @@ to the format used in the unittests
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree from .._utils import default_etree
treeBuilderCache = {} treeBuilderCache = {}
View file
@ -126,6 +126,7 @@ class TreeBuilder(object):
commentClass - the class to use for comments commentClass - the class to use for comments
doctypeClass - the class to use for doctypes doctypeClass - the class to use for doctypes
""" """
# pylint:disable=not-callable
# Document class # Document class
documentClass = None documentClass = None
@ -166,12 +167,17 @@ class TreeBuilder(object):
# If we pass a node in we match that. if we pass a string # If we pass a node in we match that. if we pass a string
# match any node with that name # match any node with that name
exactNode = hasattr(target, "nameTuple") exactNode = hasattr(target, "nameTuple")
if not exactNode:
if isinstance(target, text_type):
target = (namespaces["html"], target)
assert isinstance(target, tuple)
listElements, invert = listElementsMap[variant] listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements): for node in reversed(self.openElements):
if (node.name == target and not exactNode or if exactNode and node == target:
node == target and exactNode): return True
elif not exactNode and node.nameTuple == target:
return True return True
elif (invert ^ (node.nameTuple in listElements)): elif (invert ^ (node.nameTuple in listElements)):
return False return False
@ -353,8 +359,8 @@ class TreeBuilder(object):
def generateImpliedEndTags(self, exclude=None): def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name name = self.openElements[-1].name
# XXX td, th and tr are not actually needed # XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
and name != exclude): name != exclude):
self.openElements.pop() self.openElements.pop()
# XXX This is not entirely what the specification says. We should # XXX This is not entirely what the specification says. We should
# investigate it more closely. # investigate it more closely.
View file
@ -1,54 +1,62 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from collections import MutableMapping
from xml.dom import minidom, Node from xml.dom import minidom, Node
import weakref import weakref
from . import _base from . import base
from .. import constants from .. import constants
from ..constants import namespaces from ..constants import namespaces
from ..utils import moduleFactoryFactory from .._utils import moduleFactoryFactory
def getDomBuilder(DomImplementation): def getDomBuilder(DomImplementation):
Dom = DomImplementation Dom = DomImplementation
class AttrList(object): class AttrList(MutableMapping):
def __init__(self, element): def __init__(self, element):
self.element = element self.element = element
def __iter__(self): def __iter__(self):
return list(self.element.attributes.items()).__iter__() return iter(self.element.attributes.keys())
def __setitem__(self, name, value): def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(list(self.element.attributes.items()))
def items(self):
return [(item[0], item[1]) for item in
list(self.element.attributes.items())]
def keys(self):
return list(self.element.attributes.keys())
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple): if isinstance(name, tuple):
raise NotImplementedError raise NotImplementedError
else: else:
return self.element.hasAttribute(name) attr = self.element.ownerDocument.createAttribute(name)
attr.value = value
self.element.attributes[name] = attr
class NodeBuilder(_base.Node): def __len__(self):
return len(self.element.attributes)
def items(self):
return list(self.element.attributes.items())
def values(self):
return list(self.element.attributes.values())
def __getitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.attributes[name].value
def __delitem__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
del self.element.attributes[name]
class NodeBuilder(base.Node):
def __init__(self, element): def __init__(self, element):
_base.Node.__init__(self, element.nodeName) base.Node.__init__(self, element.nodeName)
self.element = element self.element = element
namespace = property(lambda self: hasattr(self.element, "namespaceURI") namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
and self.element.namespaceURI or None) self.element.namespaceURI or None)
def appendChild(self, node): def appendChild(self, node):
node.parent = self node.parent = self
@ -109,7 +117,7 @@ def getDomBuilder(DomImplementation):
nameTuple = property(getNameTuple) nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
def documentClass(self): def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None, None, None) self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
return weakref.proxy(self) return weakref.proxy(self)
@ -149,15 +157,16 @@ def getDomBuilder(DomImplementation):
return self.dom return self.dom
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self).element return base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None): def insertText(self, data, parent=None):
data = data data = data
if parent != self: if parent != self:
_base.TreeBuilder.insertText(self, data, parent) base.TreeBuilder.insertText(self, data, parent)
else: else:
# HACK: allow text nodes as children of the document node # HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'): if hasattr(self.dom, '_child_node_types'):
# pylint:disable=protected-access
if Node.TEXT_NODE not in self.dom._child_node_types: if Node.TEXT_NODE not in self.dom._child_node_types:
self.dom._child_node_types = list(self.dom._child_node_types) self.dom._child_node_types = list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE) self.dom._child_node_types.append(Node.TEXT_NODE)
View file
@ -1,13 +1,15 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
from six import text_type from six import text_type
import re import re
from . import _base from . import base
from .. import ihatexml from .. import _ihatexml
from .. import constants from .. import constants
from ..constants import namespaces from ..constants import namespaces
from ..utils import moduleFactoryFactory from .._utils import moduleFactoryFactory
tag_regexp = re.compile("{([^}]*)}(.*)") tag_regexp = re.compile("{([^}]*)}(.*)")
@ -16,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag ElementTreeCommentType = ElementTree.Comment("asd").tag
class Element(_base.Node): class Element(base.Node):
def __init__(self, name, namespace=None): def __init__(self, name, namespace=None):
self._name = name self._name = name
self._namespace = namespace self._namespace = namespace
@ -98,6 +100,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
node.parent = self node.parent = self
def removeChild(self, node): def removeChild(self, node):
self._childNodes.remove(node)
self._element.remove(node._element) self._element.remove(node._element)
node.parent = None node.parent = None
@ -139,7 +142,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if self._element.text is not None: if self._element.text is not None:
newParent._element.text += self._element.text newParent._element.text += self._element.text
self._element.text = "" self._element.text = ""
_base.Node.reparentChildren(self, newParent) base.Node.reparentChildren(self, newParent)
class Comment(Element): class Comment(Element):
def __init__(self, data): def __init__(self, data):
@ -253,10 +256,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "\n".join(rv) return "\n".join(rv)
def tostring(element): def tostring(element): # pylint:disable=unused-variable
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
filter = ihatexml.InfosetFilter() filter = _ihatexml.InfosetFilter()
def serializeElement(element): def serializeElement(element):
if isinstance(element, ElementTree.ElementTree): if isinstance(element, ElementTree.ElementTree):
@ -307,7 +310,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return "".join(rv) return "".join(rv)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType
elementClass = Element elementClass = Element
@ -329,7 +332,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self.document._element.find("html") return self.document._element.find("html")
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element return base.TreeBuilder.getFragment(self)._element
return locals() return locals()
View file
@ -10,16 +10,17 @@ When any of these things occur, we emit a DataLossWarning
""" """
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
# pylint:disable=protected-access
import warnings import warnings
import re import re
import sys import sys
from . import _base from . import base
from ..constants import DataLossWarning from ..constants import DataLossWarning
from .. import constants from .. import constants
from . import etree as etree_builders from . import etree as etree_builders
from .. import ihatexml from .. import _ihatexml
import lxml.etree as etree import lxml.etree as etree
@ -53,8 +54,7 @@ class Document(object):
def testSerializer(element): def testSerializer(element):
rv = [] rv = []
finalText = None infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
@ -128,16 +128,12 @@ def testSerializer(element):
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
serializeElement(element, 0) serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
return "\n".join(rv) return "\n".join(rv)
def tostring(element): def tostring(element):
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
finalText = None
def serializeElement(element): def serializeElement(element):
if not hasattr(element, "tag"): if not hasattr(element, "tag"):
@ -173,13 +169,10 @@ def tostring(element):
serializeElement(element) serializeElement(element)
if finalText is not None:
rv.append("%s\"" % (' ' * 2, finalText))
return "".join(rv) return "".join(rv)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(base.TreeBuilder):
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType
elementClass = None elementClass = None
@ -189,13 +182,15 @@ class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, fullTree=False): def __init__(self, namespaceHTMLElements, fullTree=False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree) builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True) infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict): class Attributes(dict):
def __init__(self, element, value={}): def __init__(self, element, value=None):
if value is None:
value = {}
self._element = element self._element = element
dict.__init__(self, value) dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items(): for key, value in self.items():
if isinstance(key, tuple): if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
@ -259,10 +254,10 @@ class TreeBuilder(_base.TreeBuilder):
self.elementClass = Element self.elementClass = Element
self.commentClass = Comment self.commentClass = Comment
# self.fragmentClass = builder.DocumentFragment # self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements) base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self): def reset(self):
_base.TreeBuilder.reset(self) base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial self.insertComment = self.insertCommentInitial
self.initial_comments = [] self.initial_comments = []
self.doctype = None self.doctype = None
@ -303,12 +298,14 @@ class TreeBuilder(_base.TreeBuilder):
self.doctype = doctype self.doctype = doctype
def insertCommentInitial(self, data, parent=None): def insertCommentInitial(self, data, parent=None):
assert parent is None or parent is self.document
assert self.document._elementTree is None
self.initial_comments.append(data) self.initial_comments.append(data)
def insertCommentMain(self, data, parent=None): def insertCommentMain(self, data, parent=None):
if (parent == self.document and if (parent == self.document and
self.document._elementTree.getroot()[-1].tag == comment_type): self.document._elementTree.getroot()[-1].tag == comment_type):
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning) warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
super(TreeBuilder, self).insertComment(data, parent) super(TreeBuilder, self).insertComment(data, parent)
def insertRoot(self, token): def insertRoot(self, token):
View file
@ -10,10 +10,10 @@ returning an iterator generating tokens.
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
from .. import constants from .. import constants
from ..utils import default_etree from .._utils import default_etree
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
treeWalkerCache = {} treeWalkerCache = {}
@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
from . import dom from . import dom
treeWalkerCache[treeType] = dom.TreeWalker treeWalkerCache[treeType] = dom.TreeWalker
elif treeType == "genshi": elif treeType == "genshi":
from . import genshistream from . import genshi
treeWalkerCache[treeType] = genshistream.TreeWalker treeWalkerCache[treeType] = genshi.TreeWalker
elif treeType == "lxml": elif treeType == "lxml":
from . import lxmletree from . import etree_lxml
treeWalkerCache[treeType] = lxmletree.TreeWalker treeWalkerCache[treeType] = etree_lxml.TreeWalker
elif treeType == "etree": elif treeType == "etree":
from . import etree from . import etree
if implementation is None: if implementation is None:
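Only the backing module names change here (genshistream becomes genshi, lxmletree becomes etree_lxml); the public tree-type names passed to getTreeWalker stay the same:

    import html5lib

    walker = html5lib.getTreeWalker("etree")
    doc = html5lib.parse("<p>hi")
    tokens = list(walker(doc))   # dict tokens: StartTag, Characters, EndTag, ...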
View file
@ -1,11 +1,11 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type, string_types
from xml.dom import Node
from ..constants import namespaces, voidElements, spaceCharacters
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
"TreeWalker", "NonRecursiveTreeWalker"] "TreeWalker", "NonRecursiveTreeWalker"]
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE TEXT = Node.TEXT_NODE
@ -14,28 +14,9 @@ COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>" UNKNOWN = "<#UNKNOWN#>"
from ..constants import voidElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters) spaceCharacters = "".join(spaceCharacters)
def to_text(s, blank_if_none=True):
"""Wrapper around six.text_type to convert None to empty string"""
if s is None:
if blank_if_none:
return ""
else:
return None
elif isinstance(s, text_type):
return s
else:
return text_type(s)
def is_text_or_none(string):
"""Wrapper around isinstance(string_types) or is None"""
return string is None or isinstance(string, string_types)
class TreeWalker(object): class TreeWalker(object):
     def __init__(self, tree):
         self.tree = tree
@@ -47,47 +28,25 @@ class TreeWalker(object):
         return {"type": "SerializeError", "data": msg}

     def emptyTag(self, namespace, name, attrs, hasChildren=False):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(name)
-        assert all((namespace is None or isinstance(namespace, string_types)) and
-                   isinstance(name, string_types) and
-                   isinstance(value, string_types)
-                   for (namespace, name), value in attrs.items())
-        yield {"type": "EmptyTag", "name": to_text(name, False),
-               "namespace": to_text(namespace),
+        yield {"type": "EmptyTag", "name": name,
+               "namespace": namespace,
                "data": attrs}
         if hasChildren:
             yield self.error("Void element has children")

     def startTag(self, namespace, name, attrs):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(name)
-        assert all((namespace is None or isinstance(namespace, string_types)) and
-                   isinstance(name, string_types) and
-                   isinstance(value, string_types)
-                   for (namespace, name), value in attrs.items())
         return {"type": "StartTag",
-                "name": text_type(name),
-                "namespace": to_text(namespace),
-                "data": dict(((to_text(namespace, False), to_text(name)),
-                              to_text(value, False))
-                             for (namespace, name), value in attrs.items())}
+                "name": name,
+                "namespace": namespace,
+                "data": attrs}

     def endTag(self, namespace, name):
-        assert namespace is None or isinstance(namespace, string_types), type(namespace)
-        assert isinstance(name, string_types), type(namespace)
         return {"type": "EndTag",
-                "name": to_text(name, False),
-                "namespace": to_text(namespace),
-                "data": {}}
+                "name": name,
+                "namespace": namespace}

     def text(self, data):
-        assert isinstance(data, string_types), type(data)
-        data = to_text(data)
+        data = data
         middle = data.lstrip(spaceCharacters)
         left = data[:len(data) - len(middle)]
         if left:
@@ -101,25 +60,16 @@ class TreeWalker(object):
         yield {"type": "SpaceCharacters", "data": right}

     def comment(self, data):
-        assert isinstance(data, string_types), type(data)
-        return {"type": "Comment", "data": text_type(data)}
+        return {"type": "Comment", "data": data}

-    def doctype(self, name, publicId=None, systemId=None, correct=True):
-        assert is_text_or_none(name), type(name)
-        assert is_text_or_none(publicId), type(publicId)
-        assert is_text_or_none(systemId), type(systemId)
+    def doctype(self, name, publicId=None, systemId=None):
         return {"type": "Doctype",
-                "name": to_text(name),
-                "publicId": to_text(publicId),
-                "systemId": to_text(systemId),
-                "correct": to_text(correct)}
+                "name": name,
+                "publicId": publicId,
+                "systemId": systemId}

     def entity(self, name):
-        assert isinstance(name, string_types), type(name)
-        return {"type": "Entity", "name": text_type(name)}
+        return {"type": "Entity", "name": name}

     def unknown(self, nodeType):
         return self.error("Unknown node type: " + nodeType)
@@ -154,7 +104,7 @@ class NonRecursiveTreeWalker(TreeWalker):

             elif type == ELEMENT:
                 namespace, name, attributes, hasChildren = details
-                if name in voidElements:
+                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     for token in self.emptyTag(namespace, name, attributes,
                                                hasChildren):
                         yield token
@@ -187,7 +137,7 @@ class NonRecursiveTreeWalker(TreeWalker):
                     type, details = details[0], details[1:]
                     if type == ELEMENT:
                         namespace, name, attributes, hasChildren = details
-                        if name not in voidElements:
+                        if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                             yield self.endTag(namespace, name)
                     if self.tree is currentNode:
                         currentNode = None
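
The hunks above drop the per-call assertions and to_text()/text_type() coercions, so walkers now emit the caller's names, namespaces and attribute dicts unchanged, and an element is only reported as an EmptyTag when it is a void element in the HTML namespace. A minimal sketch of the resulting token stream through html5lib's public API (the sample markup and the commented output shapes are illustrative, not taken from this commit):

# Illustrative sketch (not part of this commit): iterating a walker to see the
# plain token dicts the base TreeWalker methods above now emit.
import html5lib

frag = html5lib.parseFragment("<p>hi<br></p>")   # default "etree" tree builder
TreeWalker = html5lib.getTreeWalker("etree")     # getTreeWalker returns the walker class

for token in TreeWalker(frag):
    print(token)
# Expected shape (values abbreviated):
#   {'type': 'StartTag', 'name': 'p', 'namespace': 'http://www.w3.org/1999/xhtml', 'data': ...}
#   {'type': 'Characters', 'data': 'hi'}
#   {'type': 'EmptyTag', 'name': 'br', ...}   # void only because it sits in the HTML namespace
#   {'type': 'EndTag', 'name': 'p', ...}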


@@ -2,16 +2,16 @@ from __future__ import absolute_import, division, unicode_literals

 from xml.dom import Node

-from . import _base
+from . import base


-class TreeWalker(_base.NonRecursiveTreeWalker):
+class TreeWalker(base.NonRecursiveTreeWalker):
     def getNodeDetails(self, node):
         if node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            return _base.DOCTYPE, node.name, node.publicId, node.systemId
+            return base.DOCTYPE, node.name, node.publicId, node.systemId

         elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
-            return _base.TEXT, node.nodeValue
+            return base.TEXT, node.nodeValue

         elif node.nodeType == Node.ELEMENT_NODE:
             attrs = {}
@@ -21,17 +21,17 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
                     attrs[(attr.namespaceURI, attr.localName)] = attr.value
                 else:
                     attrs[(None, attr.name)] = attr.value
-            return (_base.ELEMENT, node.namespaceURI, node.nodeName,
+            return (base.ELEMENT, node.namespaceURI, node.nodeName,
                     attrs, node.hasChildNodes())

         elif node.nodeType == Node.COMMENT_NODE:
-            return _base.COMMENT, node.nodeValue
+            return base.COMMENT, node.nodeValue

         elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
-            return (_base.DOCUMENT,)
+            return (base.DOCUMENT,)

         else:
-            return _base.UNKNOWN, node.nodeType
+            return base.UNKNOWN, node.nodeType

     def getFirstChild(self, node):
         return node.firstChild
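
The DOM walker hunks only track the _base to base module rename; behaviour is unchanged. For orientation, a small sketch of feeding it a minidom tree built with the "dom" tree builder (the sample markup is an assumption, not from the commit):

# Illustrative sketch (not part of this commit): the same token stream from a
# minidom document built with the "dom" tree builder.
import html5lib

doc = html5lib.parse("<!DOCTYPE html><title>x</title>", treebuilder="dom")
TreeWalker = html5lib.getTreeWalker("dom")
tokens = list(TreeWalker(doc))
print(tokens[0]["type"], tokens[1]["type"])   # expected roughly: Doctype StartTag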


@@ -12,8 +12,8 @@ import re

 from six import string_types

-from . import _base
-from ..utils import moduleFactoryFactory
+from . import base
+from .._utils import moduleFactoryFactory

 tag_regexp = re.compile("{([^}]*)}(.*)")
@@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
     ElementTree = ElementTreeImplementation
     ElementTreeCommentType = ElementTree.Comment("asd").tag

-    class TreeWalker(_base.NonRecursiveTreeWalker):
+    class TreeWalker(base.NonRecursiveTreeWalker):  # pylint:disable=unused-variable
         """Given the particular ElementTree representation, this implementation,
         to avoid using recursion, returns "nodes" as tuples with the following
         content:
@@ -38,9 +38,9 @@ def getETreeBuilder(ElementTreeImplementation):
         """
         def getNodeDetails(self, node):
             if isinstance(node, tuple):  # It might be the root Element
-                elt, key, parents, flag = node
+                elt, _, _, flag = node
                 if flag in ("text", "tail"):
-                    return _base.TEXT, getattr(elt, flag)
+                    return base.TEXT, getattr(elt, flag)
                 else:
                     node = elt
@@ -48,14 +48,14 @@ def getETreeBuilder(ElementTreeImplementation):
                 node = node.getroot()

             if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
-                return (_base.DOCUMENT,)
+                return (base.DOCUMENT,)
             elif node.tag == "<!DOCTYPE>":
-                return (_base.DOCTYPE, node.text,
+                return (base.DOCTYPE, node.text,
                         node.get("publicId"), node.get("systemId"))
             elif node.tag == ElementTreeCommentType:
-                return _base.COMMENT, node.text
+                return base.COMMENT, node.text
             else:
                 assert isinstance(node.tag, string_types), type(node.tag)
@@ -73,7 +73,7 @@ def getETreeBuilder(ElementTreeImplementation):
                         attrs[(match.group(1), match.group(2))] = value
                     else:
                         attrs[(None, name)] = value
-                return (_base.ELEMENT, namespace, tag,
+                return (base.ELEMENT, namespace, tag,
                         attrs, len(node) or node.text)

         def getFirstChild(self, node):
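
In the ElementTree walker the changes are likewise mechanical: the base/_utils renames, a pylint pragma, and unused unpacked names replaced with _. A hedged sketch of obtaining that walker for an explicit ElementTree implementation (the implementation keyword of getTreeWalker is assumed here; it is not shown in this diff):

# Illustrative sketch (not part of this commit): getETreeBuilder() is wrapped by
# moduleFactoryFactory, so a walker can be requested for a specific
# ElementTree implementation.
import xml.etree.ElementTree as ElementTree
import html5lib

frag = html5lib.parseFragment("<span id='a'>text</span>")
TreeWalker = html5lib.getTreeWalker("etree", implementation=ElementTree)
for token in TreeWalker(frag):
    print(token["type"], token.get("name", ""))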


@@ -4,9 +4,9 @@ from six import text_type

 from lxml import etree
 from ..treebuilders.etree import tag_regexp

-from . import _base
-from .. import ihatexml
+from . import base
+from .. import _ihatexml


 def ensure_str(s):
@@ -15,20 +15,27 @@ def ensure_str(s):
     elif isinstance(s, text_type):
         return s
     else:
-        return s.decode("utf-8", "strict")
+        return s.decode("ascii", "strict")


 class Root(object):
     def __init__(self, et):
         self.elementtree = et
         self.children = []
-        if et.docinfo.internalDTD:
-            self.children.append(Doctype(self,
-                                         ensure_str(et.docinfo.root_name),
-                                         ensure_str(et.docinfo.public_id),
-                                         ensure_str(et.docinfo.system_url)))
-        root = et.getroot()
-        node = root
+
+        try:
+            if et.docinfo.internalDTD:
+                self.children.append(Doctype(self,
+                                             ensure_str(et.docinfo.root_name),
+                                             ensure_str(et.docinfo.public_id),
+                                             ensure_str(et.docinfo.system_url)))
+        except AttributeError:
+            pass
+
+        try:
+            node = et.getroot()
+        except AttributeError:
+            node = et

         while node.getprevious() is not None:
             node = node.getprevious()
@@ -115,35 +122,38 @@ class FragmentWrapper(object):
         return len(self.obj)


-class TreeWalker(_base.NonRecursiveTreeWalker):
+class TreeWalker(base.NonRecursiveTreeWalker):
     def __init__(self, tree):
-        if hasattr(tree, "getroot"):
-            tree = Root(tree)
-        elif isinstance(tree, list):
+        # pylint:disable=redefined-variable-type
+        if isinstance(tree, list):
+            self.fragmentChildren = set(tree)
             tree = FragmentRoot(tree)
-        _base.NonRecursiveTreeWalker.__init__(self, tree)
-        self.filter = ihatexml.InfosetFilter()
+        else:
+            self.fragmentChildren = set()
+            tree = Root(tree)
+        base.NonRecursiveTreeWalker.__init__(self, tree)
+        self.filter = _ihatexml.InfosetFilter()

     def getNodeDetails(self, node):
         if isinstance(node, tuple):  # Text node
             node, key = node
             assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
-            return _base.TEXT, ensure_str(getattr(node, key))
+            return base.TEXT, ensure_str(getattr(node, key))

         elif isinstance(node, Root):
-            return (_base.DOCUMENT,)
+            return (base.DOCUMENT,)

         elif isinstance(node, Doctype):
-            return _base.DOCTYPE, node.name, node.public_id, node.system_id
+            return base.DOCTYPE, node.name, node.public_id, node.system_id

         elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
-            return _base.TEXT, node.obj
+            return base.TEXT, ensure_str(node.obj)

         elif node.tag == etree.Comment:
-            return _base.COMMENT, ensure_str(node.text)
+            return base.COMMENT, ensure_str(node.text)

         elif node.tag == etree.Entity:
-            return _base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;
+            return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

         else:
             # This is assumed to be an ordinary element
@@ -162,7 +172,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
                     attrs[(match.group(1), match.group(2))] = value
                 else:
                     attrs[(None, name)] = value
-            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
+            return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                     attrs, len(node) > 0 or node.text)

     def getFirstChild(self, node):
@@ -197,5 +207,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
             if key == "text":
                 return node
             # else: fallback to "normal" processing
+        elif node in self.fragmentChildren:
+            return None

         return node.getparent()
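
The lxml walker now guards the docinfo/getroot() lookups, remembers the fragment's top-level nodes in fragmentChildren so getParentNode() cannot escape the fragment, and coerces wrapped fragment text through ensure_str(). A rough sketch of exercising that fragment path (the markup and the commented expectation are illustrative assumptions):

# Illustrative sketch (not part of this commit): with the lxml tree builder a
# fragment is a list of top-level nodes; leading text arrives as a plain string
# handled by FragmentWrapper, and the walker stops at fragmentChildren.
import html5lib

frag = html5lib.parseFragment("a<b>c</b>", treebuilder="lxml")
TreeWalker = html5lib.getTreeWalker("lxml")
for token in TreeWalker(frag):
    print(token)
# expected roughly: Characters 'a', StartTag 'b', Characters 'c', EndTag 'b'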


@@ -4,12 +4,12 @@ from genshi.core import QName
 from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
 from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT

-from . import _base
+from . import base

 from ..constants import voidElements, namespaces


-class TreeWalker(_base.TreeWalker):
+class TreeWalker(base.TreeWalker):
     def __iter__(self):
         # Buffer the events so we can pass in the following one
         previous = None
@@ -25,7 +25,7 @@ class TreeWalker(_base.TreeWalker):
             yield token

     def tokens(self, event, next):
-        kind, data, pos = event
+        kind, data, _ = event
         if kind == START:
             tag, attribs = data
             name = tag.localname
@@ -39,8 +39,8 @@ class TreeWalker(_base.TreeWalker):
             if namespace == namespaces["html"] and name in voidElements:
                 for token in self.emptyTag(namespace, name, converted_attribs,
-                                           not next or next[0] != END
-                                           or next[1] != tag):
+                                           not next or next[0] != END or
+                                           next[1] != tag):
                     yield token
             else:
                 yield self.startTag(namespace, name, converted_attribs)
@@ -48,7 +48,7 @@ class TreeWalker(_base.TreeWalker):
         elif kind == END:
             name = data.localname
             namespace = data.namespace
-            if name not in voidElements:
+            if namespace != namespaces["html"] or name not in voidElements:
                 yield self.endTag(namespace, name)

         elif kind == COMMENT: