mirror of
https://github.com/SickGear/SickGear.git
synced 2024-12-01 00:43:37 +00:00
Merge pull request #872 from JackDandy/feature/UpdateHtml5lib
Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72).
This commit is contained in:
commit
25566998ca
34 changed files with 1684 additions and 1283 deletions
|
@ -13,6 +13,7 @@
|
|||
* Update cachecontrol library 0.11.5 to 0.11.7 (3b3b776)
|
||||
* Update Certifi 2015.11.20.1 (385476b) to 2017.01.23 (9f9dc30)
|
||||
* Update feedparser library 5.2.0 (8c62940) to 5.2.1 (f1dd1bb)
|
||||
* Update html5lib 0.99999999/1.0b9 (46dae3d) to (1a28d72)
|
||||
|
||||
|
||||
[develop changelog]
|
||||
|
|
|
@ -22,4 +22,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
|
|||
"getTreeWalker", "serialize"]
|
||||
|
||||
# this has to be at the top level, see how setup.py parses this
|
||||
__version__ = "0.99999999-dev"
|
||||
__version__ = "0.9999999999-dev"
|
||||
|
|
|
@ -175,9 +175,9 @@ def escapeRegexp(string):
|
|||
return string
|
||||
|
||||
# output from the above
|
||||
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||
|
||||
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa
|
||||
|
||||
# Simpler things
|
||||
nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
|
||||
|
@ -186,7 +186,7 @@ nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
|
|||
class InfosetFilter(object):
|
||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||
|
||||
def __init__(self, replaceChars=None,
|
||||
def __init__(self,
|
||||
dropXmlnsLocalName=False,
|
||||
dropXmlnsAttrNs=False,
|
||||
preventDoubleDashComments=False,
|
||||
|
@ -217,7 +217,7 @@ class InfosetFilter(object):
|
|||
else:
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceElement(self, name, namespace=None):
|
||||
def coerceElement(self, name):
|
||||
return self.toXmlName(name)
|
||||
|
||||
def coerceComment(self, data):
|
||||
|
@ -232,7 +232,7 @@ class InfosetFilter(object):
|
|||
|
||||
def coerceCharacters(self, data):
|
||||
if self.replaceFormFeedCharacters:
|
||||
for i in range(data.count("\x0C")):
|
||||
for _ in range(data.count("\x0C")):
|
||||
warnings.warn("Text cannot contain U+000C", DataLossWarning)
|
||||
data = data.replace("\x0C", " ")
|
||||
# Other non-xml characters
|
|
@ -1,14 +1,16 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from six import text_type
|
||||
from six import text_type, binary_type
|
||||
from six.moves import http_client, urllib
|
||||
|
||||
import codecs
|
||||
import re
|
||||
|
||||
import webencodings
|
||||
|
||||
from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from .constants import encodings, ReparseException
|
||||
from . import utils
|
||||
from .constants import ReparseException
|
||||
from . import _utils
|
||||
|
||||
from io import StringIO
|
||||
|
||||
|
@ -17,12 +19,6 @@ try:
|
|||
except ImportError:
|
||||
BytesIO = StringIO
|
||||
|
||||
try:
|
||||
from io import BufferedIOBase
|
||||
except ImportError:
|
||||
class BufferedIOBase(object):
|
||||
pass
|
||||
|
||||
# Non-unicode versions of constants for use in the pre-parser
|
||||
spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
|
||||
asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
|
||||
|
@ -30,15 +26,17 @@ asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase
|
|||
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
|
||||
|
||||
|
||||
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"
|
||||
invalid_unicode_no_surrogate = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]" # noqa
|
||||
|
||||
if utils.supports_lone_surrogates:
|
||||
if _utils.supports_lone_surrogates:
|
||||
# Use one extra step of indirection and create surrogates with
|
||||
# unichr. Not using this indirection would introduce an illegal
|
||||
# eval. Not using this indirection would introduce an illegal
|
||||
# unicode literal on platforms not supporting such lone
|
||||
# surrogates.
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate +
|
||||
eval('"\\uD800-\\uDFFF"'))
|
||||
assert invalid_unicode_no_surrogate[-1] == "]" and invalid_unicode_no_surrogate.count("]") == 1
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate[:-1] +
|
||||
eval('"\\uD800-\\uDFFF"') + # pylint:disable=eval-used
|
||||
"]")
|
||||
else:
|
||||
invalid_unicode_re = re.compile(invalid_unicode_no_surrogate)
|
||||
|
||||
|
@ -130,7 +128,7 @@ class BufferedStream(object):
|
|||
return b"".join(rv)
|
||||
|
||||
|
||||
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
|
||||
def HTMLInputStream(source, **kwargs):
|
||||
# Work around Python bug #20007: read(0) closes the connection.
|
||||
# http://bugs.python.org/issue20007
|
||||
if (isinstance(source, http_client.HTTPResponse) or
|
||||
|
@ -144,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
|
|||
isUnicode = isinstance(source, text_type)
|
||||
|
||||
if isUnicode:
|
||||
if encoding is not None:
|
||||
raise TypeError("Cannot explicitly set an encoding with a unicode string")
|
||||
encodings = [x for x in kwargs if x.endswith("_encoding")]
|
||||
if encodings:
|
||||
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
|
||||
|
||||
return HTMLUnicodeInputStream(source)
|
||||
return HTMLUnicodeInputStream(source, **kwargs)
|
||||
else:
|
||||
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
|
||||
return HTMLBinaryInputStream(source, **kwargs)
|
||||
|
||||
|
||||
class HTMLUnicodeInputStream(object):
|
||||
|
@ -175,27 +174,21 @@ class HTMLUnicodeInputStream(object):
|
|||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
parseMeta - Look for a <meta> element containing encoding information
|
||||
|
||||
"""
|
||||
|
||||
if not utils.supports_lone_surrogates:
|
||||
if not _utils.supports_lone_surrogates:
|
||||
# Such platforms will have already checked for such
|
||||
# surrogate errors, so no need to do this checking.
|
||||
self.reportCharacterErrors = None
|
||||
self.replaceCharactersRegexp = None
|
||||
elif len("\U0010FFFF") == 1:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS4
|
||||
self.replaceCharactersRegexp = re.compile(eval('"[\\uD800-\\uDFFF]"'))
|
||||
else:
|
||||
self.reportCharacterErrors = self.characterErrorsUCS2
|
||||
self.replaceCharactersRegexp = re.compile(
|
||||
eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))
|
||||
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
||||
self.charEncoding = ("utf-8", "certain")
|
||||
self.charEncoding = (lookupEncoding("utf-8"), "certain")
|
||||
self.dataStream = self.openStream(source)
|
||||
|
||||
self.reset()
|
||||
|
@ -288,10 +281,7 @@ class HTMLUnicodeInputStream(object):
|
|||
if self.reportCharacterErrors:
|
||||
self.reportCharacterErrors(data)
|
||||
|
||||
# Replace invalid characters
|
||||
# Note U+0000 is dealt with in the tokenizer
|
||||
data = self.replaceCharactersRegexp.sub("\ufffd", data)
|
||||
|
||||
# Replace invalid characters
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
|
||||
|
@ -301,7 +291,7 @@ class HTMLUnicodeInputStream(object):
|
|||
return True
|
||||
|
||||
def characterErrorsUCS4(self, data):
|
||||
for i in range(len(invalid_unicode_re.findall(data))):
|
||||
for _ in range(len(invalid_unicode_re.findall(data))):
|
||||
self.errors.append("invalid-codepoint")
|
||||
|
||||
def characterErrorsUCS2(self, data):
|
||||
|
@ -314,9 +304,9 @@ class HTMLUnicodeInputStream(object):
|
|||
codepoint = ord(match.group())
|
||||
pos = match.start()
|
||||
# Pretty sure there should be endianness issues here
|
||||
if utils.isSurrogatePair(data[pos:pos + 2]):
|
||||
if _utils.isSurrogatePair(data[pos:pos + 2]):
|
||||
# We have a surrogate pair!
|
||||
char_val = utils.surrogatePairToCodepoint(data[pos:pos + 2])
|
||||
char_val = _utils.surrogatePairToCodepoint(data[pos:pos + 2])
|
||||
if char_val in non_bmp_invalid_codepoints:
|
||||
self.errors.append("invalid-codepoint")
|
||||
skip = True
|
||||
|
@ -399,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
|
||||
def __init__(self, source, override_encoding=None, transport_encoding=None,
|
||||
same_origin_parent_encoding=None, likely_encoding=None,
|
||||
default_encoding="windows-1252", useChardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
|
@ -412,8 +404,6 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
parseMeta - Look for a <meta> element containing encoding information
|
||||
|
||||
"""
|
||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||
# self.charEncoding as appropriate
|
||||
|
@ -421,27 +411,28 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
|
||||
HTMLUnicodeInputStream.__init__(self, self.rawStream)
|
||||
|
||||
self.charEncoding = (codecName(encoding), "certain")
|
||||
|
||||
# Encoding Information
|
||||
# Number of bytes to use when looking for a meta element with
|
||||
# encoding information
|
||||
self.numBytesMeta = 512
|
||||
self.numBytesMeta = 1024
|
||||
# Number of bytes to use when using detecting encoding using chardet
|
||||
self.numBytesChardet = 100
|
||||
# Encoding to use if no other information can be found
|
||||
self.defaultEncoding = "windows-1252"
|
||||
# Things from args
|
||||
self.override_encoding = override_encoding
|
||||
self.transport_encoding = transport_encoding
|
||||
self.same_origin_parent_encoding = same_origin_parent_encoding
|
||||
self.likely_encoding = likely_encoding
|
||||
self.default_encoding = default_encoding
|
||||
|
||||
# Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if (self.charEncoding[0] is None):
|
||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||
# Determine encoding
|
||||
self.charEncoding = self.determineEncoding(useChardet)
|
||||
assert self.charEncoding[0] is not None
|
||||
|
||||
# Call superclass
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
|
||||
'replace')
|
||||
self.dataStream = self.charEncoding[0].codec_info.streamreader(self.rawStream, 'replace')
|
||||
HTMLUnicodeInputStream.reset(self)
|
||||
|
||||
def openStream(self, source):
|
||||
|
@ -458,29 +449,50 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
|
||||
try:
|
||||
stream.seek(stream.tell())
|
||||
except:
|
||||
except: # pylint:disable=bare-except
|
||||
stream = BufferedStream(stream)
|
||||
|
||||
return stream
|
||||
|
||||
def detectEncoding(self, parseMeta=True, chardet=True):
|
||||
# First look for a BOM
|
||||
def determineEncoding(self, chardet=True):
|
||||
# BOMs take precedence over everything
|
||||
# This will also read past the BOM if present
|
||||
encoding = self.detectBOM()
|
||||
confidence = "certain"
|
||||
# If there is no BOM need to look for meta elements with encoding
|
||||
# information
|
||||
if encoding is None and parseMeta:
|
||||
encoding = self.detectEncodingMeta()
|
||||
confidence = "tentative"
|
||||
charEncoding = self.detectBOM(), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# If we've been overriden, we've been overriden
|
||||
charEncoding = lookupEncoding(self.override_encoding), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Now check the transport layer
|
||||
charEncoding = lookupEncoding(self.transport_encoding), "certain"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Look for meta elements with encoding information
|
||||
charEncoding = self.detectEncodingMeta(), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Parent document encoding
|
||||
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
|
||||
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
|
||||
return charEncoding
|
||||
|
||||
# "likely" encoding
|
||||
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
# Guess with chardet, if available
|
||||
if encoding is None and chardet:
|
||||
confidence = "tentative"
|
||||
if chardet:
|
||||
try:
|
||||
try:
|
||||
from charade.universaldetector import UniversalDetector
|
||||
except ImportError:
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
from chardet.universaldetector import UniversalDetector
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
buffers = []
|
||||
detector = UniversalDetector()
|
||||
while not detector.done:
|
||||
|
@ -491,36 +503,33 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
buffers.append(buffer)
|
||||
detector.feed(buffer)
|
||||
detector.close()
|
||||
encoding = detector.result['encoding']
|
||||
encoding = lookupEncoding(detector.result['encoding'])
|
||||
self.rawStream.seek(0)
|
||||
except ImportError:
|
||||
pass
|
||||
# If all else fails use the default encoding
|
||||
if encoding is None:
|
||||
confidence = "tentative"
|
||||
encoding = self.defaultEncoding
|
||||
if encoding is not None:
|
||||
return encoding, "tentative"
|
||||
|
||||
# Substitute for equivalent encodings:
|
||||
encodingSub = {"iso-8859-1": "windows-1252"}
|
||||
# Try the default encoding
|
||||
charEncoding = lookupEncoding(self.default_encoding), "tentative"
|
||||
if charEncoding[0] is not None:
|
||||
return charEncoding
|
||||
|
||||
if encoding.lower() in encodingSub:
|
||||
encoding = encodingSub[encoding.lower()]
|
||||
|
||||
return encoding, confidence
|
||||
# Fallback to html5lib's default if even that hasn't worked
|
||||
return lookupEncoding("windows-1252"), "tentative"
|
||||
|
||||
def changeEncoding(self, newEncoding):
|
||||
assert self.charEncoding[1] != "certain"
|
||||
newEncoding = codecName(newEncoding)
|
||||
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||
newEncoding = "utf-8"
|
||||
newEncoding = lookupEncoding(newEncoding)
|
||||
if newEncoding is None:
|
||||
return
|
||||
if newEncoding.name in ("utf-16be", "utf-16le"):
|
||||
newEncoding = lookupEncoding("utf-8")
|
||||
assert newEncoding is not None
|
||||
elif newEncoding == self.charEncoding[0]:
|
||||
self.charEncoding = (self.charEncoding[0], "certain")
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
self.reset()
|
||||
self.charEncoding = (newEncoding, "certain")
|
||||
self.reset()
|
||||
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
|
||||
|
||||
def detectBOM(self):
|
||||
|
@ -529,8 +538,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
encoding otherwise return None"""
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||
codecs.BOM_UTF16_LE: 'utf-16le', codecs.BOM_UTF16_BE: 'utf-16be',
|
||||
codecs.BOM_UTF32_LE: 'utf-32le', codecs.BOM_UTF32_BE: 'utf-32be'
|
||||
}
|
||||
|
||||
# Go to beginning of file and read in 4 bytes
|
||||
|
@ -550,9 +559,12 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
if encoding:
|
||||
self.rawStream.seek(seek)
|
||||
return lookupEncoding(encoding)
|
||||
else:
|
||||
self.rawStream.seek(0)
|
||||
return None
|
||||
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
|
@ -563,8 +575,8 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
|
|||
self.rawStream.seek(0)
|
||||
encoding = parser.getEncoding()
|
||||
|
||||
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
|
||||
encoding = "utf-8"
|
||||
if encoding is not None and encoding.name in ("utf-16be", "utf-16le"):
|
||||
encoding = lookupEncoding("utf-8")
|
||||
|
||||
return encoding
|
||||
|
||||
|
@ -578,6 +590,7 @@ class EncodingBytes(bytes):
|
|||
return bytes.__new__(self, value.lower())
|
||||
|
||||
def __init__(self, value):
|
||||
# pylint:disable=unused-argument
|
||||
self._position = -1
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -688,7 +701,7 @@ class EncodingParser(object):
|
|||
(b"<!", self.handleOther),
|
||||
(b"<?", self.handleOther),
|
||||
(b"<", self.handlePossibleStartTag))
|
||||
for byte in self.data:
|
||||
for _ in self.data:
|
||||
keepParsing = True
|
||||
for key, method in methodDispatch:
|
||||
if self.data.matchBytes(key):
|
||||
|
@ -727,7 +740,7 @@ class EncodingParser(object):
|
|||
return False
|
||||
elif attr[0] == b"charset":
|
||||
tentativeEncoding = attr[1]
|
||||
codec = codecName(tentativeEncoding)
|
||||
codec = lookupEncoding(tentativeEncoding)
|
||||
if codec is not None:
|
||||
self.encoding = codec
|
||||
return False
|
||||
|
@ -735,7 +748,7 @@ class EncodingParser(object):
|
|||
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
|
||||
tentativeEncoding = contentParser.parse()
|
||||
if tentativeEncoding is not None:
|
||||
codec = codecName(tentativeEncoding)
|
||||
codec = lookupEncoding(tentativeEncoding)
|
||||
if codec is not None:
|
||||
if hasPragma:
|
||||
self.encoding = codec
|
||||
|
@ -892,16 +905,19 @@ class ContentAttrParser(object):
|
|||
return None
|
||||
|
||||
|
||||
def codecName(encoding):
|
||||
def lookupEncoding(encoding):
|
||||
"""Return the python codec name corresponding to an encoding or None if the
|
||||
string doesn't correspond to a valid encoding."""
|
||||
if isinstance(encoding, bytes):
|
||||
if isinstance(encoding, binary_type):
|
||||
try:
|
||||
encoding = encoding.decode("ascii")
|
||||
except UnicodeDecodeError:
|
||||
return None
|
||||
if encoding:
|
||||
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
|
||||
return encodings.get(canonicalName, None)
|
||||
|
||||
if encoding is not None:
|
||||
try:
|
||||
return webencodings.lookup(encoding)
|
||||
except AttributeError:
|
||||
return None
|
||||
else:
|
||||
return None
|
|
@ -1,9 +1,6 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
try:
|
||||
chr = unichr # flake8: noqa
|
||||
except NameError:
|
||||
pass
|
||||
from six import unichr as chr
|
||||
|
||||
from collections import deque
|
||||
|
||||
|
@ -14,9 +11,9 @@ from .constants import digits, hexDigits, EOF
|
|||
from .constants import tokenTypes, tagTokenTypes
|
||||
from .constants import replacementCharacters
|
||||
|
||||
from .inputstream import HTMLInputStream
|
||||
from ._inputstream import HTMLInputStream
|
||||
|
||||
from .trie import Trie
|
||||
from ._trie import Trie
|
||||
|
||||
entitiesTrie = Trie(entities)
|
||||
|
||||
|
@ -34,16 +31,11 @@ class HTMLTokenizer(object):
|
|||
Points to HTMLInputStream object.
|
||||
"""
|
||||
|
||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||
lowercaseElementName=True, lowercaseAttrName=True, parser=None):
|
||||
def __init__(self, stream, parser=None, **kwargs):
|
||||
|
||||
self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
|
||||
self.stream = HTMLInputStream(stream, **kwargs)
|
||||
self.parser = parser
|
||||
|
||||
# Perform case conversions?
|
||||
self.lowercaseElementName = lowercaseElementName
|
||||
self.lowercaseAttrName = lowercaseAttrName
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
self.escapeFlag = False
|
||||
self.lastFourChars = []
|
||||
|
@ -147,8 +139,8 @@ class HTMLTokenizer(object):
|
|||
output = "&"
|
||||
|
||||
charStack = [self.stream.char()]
|
||||
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
|
||||
or (allowedChar is not None and allowedChar == charStack[0])):
|
||||
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
|
||||
(allowedChar is not None and allowedChar == charStack[0])):
|
||||
self.stream.unget(charStack[0])
|
||||
|
||||
elif charStack[0] == "#":
|
||||
|
@ -235,8 +227,7 @@ class HTMLTokenizer(object):
|
|||
token = self.currentToken
|
||||
# Add token to the queue to be yielded
|
||||
if (token["type"] in tagTokenTypes):
|
||||
if self.lowercaseElementName:
|
||||
token["name"] = token["name"].translate(asciiUpper2Lower)
|
||||
token["name"] = token["name"].translate(asciiUpper2Lower)
|
||||
if token["type"] == tokenTypes["EndTag"]:
|
||||
if token["data"]:
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
||||
|
@ -921,10 +912,9 @@ class HTMLTokenizer(object):
|
|||
# Attributes are not dropped at this stage. That happens when the
|
||||
# start tag token is emitted so values can still be safely appended
|
||||
# to attributes, but we do want to report the parse error in time.
|
||||
if self.lowercaseAttrName:
|
||||
self.currentToken["data"][-1][0] = (
|
||||
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
|
||||
for name, value in self.currentToken["data"][:-1]:
|
||||
self.currentToken["data"][-1][0] = (
|
||||
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
|
||||
for name, _ in self.currentToken["data"][:-1]:
|
||||
if self.currentToken["data"][-1][0] == name:
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"duplicate-attribute"})
|
||||
|
@ -1716,11 +1706,11 @@ class HTMLTokenizer(object):
|
|||
else:
|
||||
data.append(char)
|
||||
|
||||
data = "".join(data)
|
||||
data = "".join(data) # pylint:disable=redefined-variable-type
|
||||
# Deal with null here rather than in the parser
|
||||
nullCount = data.count("\u0000")
|
||||
if nullCount > 0:
|
||||
for i in range(nullCount):
|
||||
for _ in range(nullCount):
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
||||
"data": "invalid-codepoint"})
|
||||
data = data.replace("\u0000", "\uFFFD")
|
|
@ -4,9 +4,11 @@ from .py import Trie as PyTrie
|
|||
|
||||
Trie = PyTrie
|
||||
|
||||
# pylint:disable=wrong-import-position
|
||||
try:
|
||||
from .datrie import Trie as DATrie
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
Trie = DATrie
|
||||
# pylint:enable=wrong-import-position
|
|
@ -7,7 +7,8 @@ class Trie(Mapping):
|
|||
"""Abstract base class for tries"""
|
||||
|
||||
def keys(self, prefix=None):
|
||||
keys = super().keys()
|
||||
# pylint:disable=arguments-differ
|
||||
keys = super(Trie, self).keys()
|
||||
|
||||
if prefix is None:
|
||||
return set(keys)
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import sys
|
||||
from types import ModuleType
|
||||
|
||||
from six import text_type
|
||||
|
@ -12,9 +13,11 @@ except ImportError:
|
|||
|
||||
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
||||
"surrogatePairToCodepoint", "moduleFactoryFactory",
|
||||
"supports_lone_surrogates"]
|
||||
"supports_lone_surrogates", "PY27"]
|
||||
|
||||
|
||||
PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7
|
||||
|
||||
# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
|
||||
# caught by the below test. In general this would be any platform
|
||||
# using UTF-16 as its encoding of unicode strings, such as
|
||||
|
@ -22,12 +25,12 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
|
|||
# surrogates, and there is no mechanism to further escape such
|
||||
# escapes.
|
||||
try:
|
||||
_x = eval('"\\uD800"')
|
||||
_x = eval('"\\uD800"') # pylint:disable=eval-used
|
||||
if not isinstance(_x, text_type):
|
||||
# We need this with u"" because of http://bugs.jython.org/issue2039
|
||||
_x = eval('u"\\uD800"')
|
||||
_x = eval('u"\\uD800"') # pylint:disable=eval-used
|
||||
assert isinstance(_x, text_type)
|
||||
except:
|
||||
except: # pylint:disable=bare-except
|
||||
supports_lone_surrogates = False
|
||||
else:
|
||||
supports_lone_surrogates = True
|
||||
|
@ -52,12 +55,13 @@ class MethodDispatcher(dict):
|
|||
# anything here.
|
||||
_dictEntries = []
|
||||
for name, value in items:
|
||||
if type(name) in (list, tuple, frozenset, set):
|
||||
if isinstance(name, (list, tuple, frozenset, set)):
|
||||
for item in name:
|
||||
_dictEntries.append((item, value))
|
||||
else:
|
||||
_dictEntries.append((name, value))
|
||||
dict.__init__(self, _dictEntries)
|
||||
assert len(self) == len(_dictEntries)
|
||||
self.default = None
|
||||
|
||||
def __getitem__(self, key):
|
||||
|
@ -109,3 +113,15 @@ def moduleFactoryFactory(factory):
|
|||
return mod
|
||||
|
||||
return moduleFactory
|
||||
|
||||
|
||||
def memoize(func):
|
||||
cache = {}
|
||||
|
||||
def wrapped(*args, **kwargs):
|
||||
key = (tuple(args), tuple(kwargs.items()))
|
||||
if key not in cache:
|
||||
cache[key] = func(*args, **kwargs)
|
||||
return cache[key]
|
||||
|
||||
return wrapped
|
|
@ -283,6 +283,12 @@ E = {
|
|||
"Element %(name)s not allowed in a non-html context",
|
||||
"unexpected-end-tag-before-html":
|
||||
"Unexpected end tag (%(name)s) before html.",
|
||||
"unexpected-inhead-noscript-tag":
|
||||
"Element %(name)s not allowed in a inhead-noscript context",
|
||||
"eof-in-head-noscript":
|
||||
"Unexpected end of file. Expected inhead-noscript content",
|
||||
"char-in-head-noscript":
|
||||
"Unexpected non-space character. Expected inhead-noscript content",
|
||||
"XXX-undefined-error":
|
||||
"Undefined error (this sucks and should be fixed)",
|
||||
}
|
||||
|
@ -431,6 +437,73 @@ mathmlTextIntegrationPointElements = frozenset([
|
|||
(namespaces["mathml"], "mtext")
|
||||
])
|
||||
|
||||
adjustSVGAttributes = {
|
||||
"attributename": "attributeName",
|
||||
"attributetype": "attributeType",
|
||||
"basefrequency": "baseFrequency",
|
||||
"baseprofile": "baseProfile",
|
||||
"calcmode": "calcMode",
|
||||
"clippathunits": "clipPathUnits",
|
||||
"contentscripttype": "contentScriptType",
|
||||
"contentstyletype": "contentStyleType",
|
||||
"diffuseconstant": "diffuseConstant",
|
||||
"edgemode": "edgeMode",
|
||||
"externalresourcesrequired": "externalResourcesRequired",
|
||||
"filterres": "filterRes",
|
||||
"filterunits": "filterUnits",
|
||||
"glyphref": "glyphRef",
|
||||
"gradienttransform": "gradientTransform",
|
||||
"gradientunits": "gradientUnits",
|
||||
"kernelmatrix": "kernelMatrix",
|
||||
"kernelunitlength": "kernelUnitLength",
|
||||
"keypoints": "keyPoints",
|
||||
"keysplines": "keySplines",
|
||||
"keytimes": "keyTimes",
|
||||
"lengthadjust": "lengthAdjust",
|
||||
"limitingconeangle": "limitingConeAngle",
|
||||
"markerheight": "markerHeight",
|
||||
"markerunits": "markerUnits",
|
||||
"markerwidth": "markerWidth",
|
||||
"maskcontentunits": "maskContentUnits",
|
||||
"maskunits": "maskUnits",
|
||||
"numoctaves": "numOctaves",
|
||||
"pathlength": "pathLength",
|
||||
"patterncontentunits": "patternContentUnits",
|
||||
"patterntransform": "patternTransform",
|
||||
"patternunits": "patternUnits",
|
||||
"pointsatx": "pointsAtX",
|
||||
"pointsaty": "pointsAtY",
|
||||
"pointsatz": "pointsAtZ",
|
||||
"preservealpha": "preserveAlpha",
|
||||
"preserveaspectratio": "preserveAspectRatio",
|
||||
"primitiveunits": "primitiveUnits",
|
||||
"refx": "refX",
|
||||
"refy": "refY",
|
||||
"repeatcount": "repeatCount",
|
||||
"repeatdur": "repeatDur",
|
||||
"requiredextensions": "requiredExtensions",
|
||||
"requiredfeatures": "requiredFeatures",
|
||||
"specularconstant": "specularConstant",
|
||||
"specularexponent": "specularExponent",
|
||||
"spreadmethod": "spreadMethod",
|
||||
"startoffset": "startOffset",
|
||||
"stddeviation": "stdDeviation",
|
||||
"stitchtiles": "stitchTiles",
|
||||
"surfacescale": "surfaceScale",
|
||||
"systemlanguage": "systemLanguage",
|
||||
"tablevalues": "tableValues",
|
||||
"targetx": "targetX",
|
||||
"targety": "targetY",
|
||||
"textlength": "textLength",
|
||||
"viewbox": "viewBox",
|
||||
"viewtarget": "viewTarget",
|
||||
"xchannelselector": "xChannelSelector",
|
||||
"ychannelselector": "yChannelSelector",
|
||||
"zoomandpan": "zoomAndPan"
|
||||
}
|
||||
|
||||
adjustMathMLAttributes = {"definitionurl": "definitionURL"}
|
||||
|
||||
adjustForeignAttributes = {
|
||||
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
|
||||
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
|
||||
|
@ -2813,7 +2886,6 @@ replacementCharacters = {
|
|||
0x0d: "\u000D",
|
||||
0x80: "\u20AC",
|
||||
0x81: "\u0081",
|
||||
0x81: "\u0081",
|
||||
0x82: "\u201A",
|
||||
0x83: "\u0192",
|
||||
0x84: "\u201E",
|
||||
|
@ -2846,235 +2918,6 @@ replacementCharacters = {
|
|||
0x9F: "\u0178",
|
||||
}
|
||||
|
||||
encodings = {
|
||||
'437': 'cp437',
|
||||
'850': 'cp850',
|
||||
'852': 'cp852',
|
||||
'855': 'cp855',
|
||||
'857': 'cp857',
|
||||
'860': 'cp860',
|
||||
'861': 'cp861',
|
||||
'862': 'cp862',
|
||||
'863': 'cp863',
|
||||
'865': 'cp865',
|
||||
'866': 'cp866',
|
||||
'869': 'cp869',
|
||||
'ansix341968': 'ascii',
|
||||
'ansix341986': 'ascii',
|
||||
'arabic': 'iso8859-6',
|
||||
'ascii': 'ascii',
|
||||
'asmo708': 'iso8859-6',
|
||||
'big5': 'big5',
|
||||
'big5hkscs': 'big5hkscs',
|
||||
'chinese': 'gbk',
|
||||
'cp037': 'cp037',
|
||||
'cp1026': 'cp1026',
|
||||
'cp154': 'ptcp154',
|
||||
'cp367': 'ascii',
|
||||
'cp424': 'cp424',
|
||||
'cp437': 'cp437',
|
||||
'cp500': 'cp500',
|
||||
'cp775': 'cp775',
|
||||
'cp819': 'windows-1252',
|
||||
'cp850': 'cp850',
|
||||
'cp852': 'cp852',
|
||||
'cp855': 'cp855',
|
||||
'cp857': 'cp857',
|
||||
'cp860': 'cp860',
|
||||
'cp861': 'cp861',
|
||||
'cp862': 'cp862',
|
||||
'cp863': 'cp863',
|
||||
'cp864': 'cp864',
|
||||
'cp865': 'cp865',
|
||||
'cp866': 'cp866',
|
||||
'cp869': 'cp869',
|
||||
'cp936': 'gbk',
|
||||
'cpgr': 'cp869',
|
||||
'cpis': 'cp861',
|
||||
'csascii': 'ascii',
|
||||
'csbig5': 'big5',
|
||||
'cseuckr': 'cp949',
|
||||
'cseucpkdfmtjapanese': 'euc_jp',
|
||||
'csgb2312': 'gbk',
|
||||
'cshproman8': 'hp-roman8',
|
||||
'csibm037': 'cp037',
|
||||
'csibm1026': 'cp1026',
|
||||
'csibm424': 'cp424',
|
||||
'csibm500': 'cp500',
|
||||
'csibm855': 'cp855',
|
||||
'csibm857': 'cp857',
|
||||
'csibm860': 'cp860',
|
||||
'csibm861': 'cp861',
|
||||
'csibm863': 'cp863',
|
||||
'csibm864': 'cp864',
|
||||
'csibm865': 'cp865',
|
||||
'csibm866': 'cp866',
|
||||
'csibm869': 'cp869',
|
||||
'csiso2022jp': 'iso2022_jp',
|
||||
'csiso2022jp2': 'iso2022_jp_2',
|
||||
'csiso2022kr': 'iso2022_kr',
|
||||
'csiso58gb231280': 'gbk',
|
||||
'csisolatin1': 'windows-1252',
|
||||
'csisolatin2': 'iso8859-2',
|
||||
'csisolatin3': 'iso8859-3',
|
||||
'csisolatin4': 'iso8859-4',
|
||||
'csisolatin5': 'windows-1254',
|
||||
'csisolatin6': 'iso8859-10',
|
||||
'csisolatinarabic': 'iso8859-6',
|
||||
'csisolatincyrillic': 'iso8859-5',
|
||||
'csisolatingreek': 'iso8859-7',
|
||||
'csisolatinhebrew': 'iso8859-8',
|
||||
'cskoi8r': 'koi8-r',
|
||||
'csksc56011987': 'cp949',
|
||||
'cspc775baltic': 'cp775',
|
||||
'cspc850multilingual': 'cp850',
|
||||
'cspc862latinhebrew': 'cp862',
|
||||
'cspc8codepage437': 'cp437',
|
||||
'cspcp852': 'cp852',
|
||||
'csptcp154': 'ptcp154',
|
||||
'csshiftjis': 'shift_jis',
|
||||
'csunicode11utf7': 'utf-7',
|
||||
'cyrillic': 'iso8859-5',
|
||||
'cyrillicasian': 'ptcp154',
|
||||
'ebcdiccpbe': 'cp500',
|
||||
'ebcdiccpca': 'cp037',
|
||||
'ebcdiccpch': 'cp500',
|
||||
'ebcdiccphe': 'cp424',
|
||||
'ebcdiccpnl': 'cp037',
|
||||
'ebcdiccpus': 'cp037',
|
||||
'ebcdiccpwt': 'cp037',
|
||||
'ecma114': 'iso8859-6',
|
||||
'ecma118': 'iso8859-7',
|
||||
'elot928': 'iso8859-7',
|
||||
'eucjp': 'euc_jp',
|
||||
'euckr': 'cp949',
|
||||
'extendedunixcodepackedformatforjapanese': 'euc_jp',
|
||||
'gb18030': 'gb18030',
|
||||
'gb2312': 'gbk',
|
||||
'gb231280': 'gbk',
|
||||
'gbk': 'gbk',
|
||||
'greek': 'iso8859-7',
|
||||
'greek8': 'iso8859-7',
|
||||
'hebrew': 'iso8859-8',
|
||||
'hproman8': 'hp-roman8',
|
||||
'hzgb2312': 'hz',
|
||||
'ibm037': 'cp037',
|
||||
'ibm1026': 'cp1026',
|
||||
'ibm367': 'ascii',
|
||||
'ibm424': 'cp424',
|
||||
'ibm437': 'cp437',
|
||||
'ibm500': 'cp500',
|
||||
'ibm775': 'cp775',
|
||||
'ibm819': 'windows-1252',
|
||||
'ibm850': 'cp850',
|
||||
'ibm852': 'cp852',
|
||||
'ibm855': 'cp855',
|
||||
'ibm857': 'cp857',
|
||||
'ibm860': 'cp860',
|
||||
'ibm861': 'cp861',
|
||||
'ibm862': 'cp862',
|
||||
'ibm863': 'cp863',
|
||||
'ibm864': 'cp864',
|
||||
'ibm865': 'cp865',
|
||||
'ibm866': 'cp866',
|
||||
'ibm869': 'cp869',
|
||||
'iso2022jp': 'iso2022_jp',
|
||||
'iso2022jp2': 'iso2022_jp_2',
|
||||
'iso2022kr': 'iso2022_kr',
|
||||
'iso646irv1991': 'ascii',
|
||||
'iso646us': 'ascii',
|
||||
'iso88591': 'windows-1252',
|
||||
'iso885910': 'iso8859-10',
|
||||
'iso8859101992': 'iso8859-10',
|
||||
'iso885911987': 'windows-1252',
|
||||
'iso885913': 'iso8859-13',
|
||||
'iso885914': 'iso8859-14',
|
||||
'iso8859141998': 'iso8859-14',
|
||||
'iso885915': 'iso8859-15',
|
||||
'iso885916': 'iso8859-16',
|
||||
'iso8859162001': 'iso8859-16',
|
||||
'iso88592': 'iso8859-2',
|
||||
'iso885921987': 'iso8859-2',
|
||||
'iso88593': 'iso8859-3',
|
||||
'iso885931988': 'iso8859-3',
|
||||
'iso88594': 'iso8859-4',
|
||||
'iso885941988': 'iso8859-4',
|
||||
'iso88595': 'iso8859-5',
|
||||
'iso885951988': 'iso8859-5',
|
||||
'iso88596': 'iso8859-6',
|
||||
'iso885961987': 'iso8859-6',
|
||||
'iso88597': 'iso8859-7',
|
||||
'iso885971987': 'iso8859-7',
|
||||
'iso88598': 'iso8859-8',
|
||||
'iso885981988': 'iso8859-8',
|
||||
'iso88599': 'windows-1254',
|
||||
'iso885991989': 'windows-1254',
|
||||
'isoceltic': 'iso8859-14',
|
||||
'isoir100': 'windows-1252',
|
||||
'isoir101': 'iso8859-2',
|
||||
'isoir109': 'iso8859-3',
|
||||
'isoir110': 'iso8859-4',
|
||||
'isoir126': 'iso8859-7',
|
||||
'isoir127': 'iso8859-6',
|
||||
'isoir138': 'iso8859-8',
|
||||
'isoir144': 'iso8859-5',
|
||||
'isoir148': 'windows-1254',
|
||||
'isoir149': 'cp949',
|
||||
'isoir157': 'iso8859-10',
|
||||
'isoir199': 'iso8859-14',
|
||||
'isoir226': 'iso8859-16',
|
||||
'isoir58': 'gbk',
|
||||
'isoir6': 'ascii',
|
||||
'koi8r': 'koi8-r',
|
||||
'koi8u': 'koi8-u',
|
||||
'korean': 'cp949',
|
||||
'ksc5601': 'cp949',
|
||||
'ksc56011987': 'cp949',
|
||||
'ksc56011989': 'cp949',
|
||||
'l1': 'windows-1252',
|
||||
'l10': 'iso8859-16',
|
||||
'l2': 'iso8859-2',
|
||||
'l3': 'iso8859-3',
|
||||
'l4': 'iso8859-4',
|
||||
'l5': 'windows-1254',
|
||||
'l6': 'iso8859-10',
|
||||
'l8': 'iso8859-14',
|
||||
'latin1': 'windows-1252',
|
||||
'latin10': 'iso8859-16',
|
||||
'latin2': 'iso8859-2',
|
||||
'latin3': 'iso8859-3',
|
||||
'latin4': 'iso8859-4',
|
||||
'latin5': 'windows-1254',
|
||||
'latin6': 'iso8859-10',
|
||||
'latin8': 'iso8859-14',
|
||||
'latin9': 'iso8859-15',
|
||||
'ms936': 'gbk',
|
||||
'mskanji': 'shift_jis',
|
||||
'pt154': 'ptcp154',
|
||||
'ptcp154': 'ptcp154',
|
||||
'r8': 'hp-roman8',
|
||||
'roman8': 'hp-roman8',
|
||||
'shiftjis': 'shift_jis',
|
||||
'tis620': 'cp874',
|
||||
'unicode11utf7': 'utf-7',
|
||||
'us': 'ascii',
|
||||
'usascii': 'ascii',
|
||||
'utf16': 'utf-16',
|
||||
'utf16be': 'utf-16-be',
|
||||
'utf16le': 'utf-16-le',
|
||||
'utf8': 'utf-8',
|
||||
'windows1250': 'cp1250',
|
||||
'windows1251': 'cp1251',
|
||||
'windows1252': 'cp1252',
|
||||
'windows1253': 'cp1253',
|
||||
'windows1254': 'cp1254',
|
||||
'windows1255': 'cp1255',
|
||||
'windows1256': 'cp1256',
|
||||
'windows1257': 'cp1257',
|
||||
'windows1258': 'cp1258',
|
||||
'windows936': 'gbk',
|
||||
'x-x-big5': 'big5'}
|
||||
|
||||
tokenTypes = {
|
||||
"Doctype": 0,
|
||||
"Characters": 1,
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
|
||||
try:
|
||||
from collections import OrderedDict
|
||||
|
@ -8,9 +8,9 @@ except ImportError:
|
|||
from ordereddict import OrderedDict
|
||||
|
||||
|
||||
class Filter(_base.Filter):
|
||||
class Filter(base.Filter):
|
||||
def __iter__(self):
|
||||
for token in _base.Filter.__iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
if token["type"] in ("StartTag", "EmptyTag"):
|
||||
attrs = OrderedDict()
|
||||
for name, value in sorted(token["data"].items(),
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
|
||||
|
||||
class Filter(_base.Filter):
|
||||
class Filter(base.Filter):
|
||||
def __init__(self, source, encoding):
|
||||
_base.Filter.__init__(self, source)
|
||||
base.Filter.__init__(self, source)
|
||||
self.encoding = encoding
|
||||
|
||||
def __iter__(self):
|
||||
|
@ -13,7 +13,7 @@ class Filter(_base.Filter):
|
|||
meta_found = (self.encoding is None)
|
||||
pending = []
|
||||
|
||||
for token in _base.Filter.__iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag":
|
||||
if token["name"].lower() == "head":
|
||||
|
|
|
@ -1,90 +1,81 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import _base
|
||||
from ..constants import cdataElements, rcdataElements, voidElements
|
||||
from six import text_type
|
||||
|
||||
from . import base
|
||||
from ..constants import namespaces, voidElements
|
||||
|
||||
from ..constants import spaceCharacters
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
|
||||
|
||||
class LintError(Exception):
|
||||
pass
|
||||
class Filter(base.Filter):
|
||||
def __init__(self, source, require_matching_tags=True):
|
||||
super(Filter, self).__init__(source)
|
||||
self.require_matching_tags = require_matching_tags
|
||||
|
||||
|
||||
class Filter(_base.Filter):
|
||||
def __iter__(self):
|
||||
open_elements = []
|
||||
contentModelFlag = "PCDATA"
|
||||
for token in _base.Filter.__iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type in ("StartTag", "EmptyTag"):
|
||||
namespace = token["namespace"]
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError("StartTag not in PCDATA content model flag: %(tag)s" % {"tag": name})
|
||||
if not isinstance(name, str):
|
||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||
if not name:
|
||||
raise LintError("Empty tag name")
|
||||
if type == "StartTag" and name in voidElements:
|
||||
raise LintError("Void element reported as StartTag token: %(tag)s" % {"tag": name})
|
||||
elif type == "EmptyTag" and name not in voidElements:
|
||||
raise LintError("Non-void element reported as EmptyTag token: %(tag)s" % {"tag": token["name"]})
|
||||
if type == "StartTag":
|
||||
open_elements.append(name)
|
||||
for name, value in token["data"]:
|
||||
if not isinstance(name, str):
|
||||
raise LintError("Attribute name is not a string: %(name)r" % {"name": name})
|
||||
if not name:
|
||||
raise LintError("Empty attribute name")
|
||||
if not isinstance(value, str):
|
||||
raise LintError("Attribute value is not a string: %(value)r" % {"value": value})
|
||||
if name in cdataElements:
|
||||
contentModelFlag = "CDATA"
|
||||
elif name in rcdataElements:
|
||||
contentModelFlag = "RCDATA"
|
||||
elif name == "plaintext":
|
||||
contentModelFlag = "PLAINTEXT"
|
||||
assert namespace is None or isinstance(namespace, text_type)
|
||||
assert namespace != ""
|
||||
assert isinstance(name, text_type)
|
||||
assert name != ""
|
||||
assert isinstance(token["data"], dict)
|
||||
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
|
||||
assert type == "EmptyTag"
|
||||
else:
|
||||
assert type == "StartTag"
|
||||
if type == "StartTag" and self.require_matching_tags:
|
||||
open_elements.append((namespace, name))
|
||||
for (namespace, name), value in token["data"].items():
|
||||
assert namespace is None or isinstance(namespace, text_type)
|
||||
assert namespace != ""
|
||||
assert isinstance(name, text_type)
|
||||
assert name != ""
|
||||
assert isinstance(value, text_type)
|
||||
|
||||
elif type == "EndTag":
|
||||
namespace = token["namespace"]
|
||||
name = token["name"]
|
||||
if not isinstance(name, str):
|
||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||
if not name:
|
||||
raise LintError("Empty tag name")
|
||||
if name in voidElements:
|
||||
raise LintError("Void element reported as EndTag token: %(tag)s" % {"tag": name})
|
||||
start_name = open_elements.pop()
|
||||
if start_name != name:
|
||||
raise LintError("EndTag (%(end)s) does not match StartTag (%(start)s)" % {"end": name, "start": start_name})
|
||||
contentModelFlag = "PCDATA"
|
||||
assert namespace is None or isinstance(namespace, text_type)
|
||||
assert namespace != ""
|
||||
assert isinstance(name, text_type)
|
||||
assert name != ""
|
||||
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
|
||||
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
|
||||
elif self.require_matching_tags:
|
||||
start = open_elements.pop()
|
||||
assert start == (namespace, name)
|
||||
|
||||
elif type == "Comment":
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError("Comment not in PCDATA content model flag")
|
||||
data = token["data"]
|
||||
assert isinstance(data, text_type)
|
||||
|
||||
elif type in ("Characters", "SpaceCharacters"):
|
||||
data = token["data"]
|
||||
if not isinstance(data, str):
|
||||
raise LintError("Attribute name is not a string: %(name)r" % {"name": data})
|
||||
if not data:
|
||||
raise LintError("%(type)s token with empty data" % {"type": type})
|
||||
assert isinstance(data, text_type)
|
||||
assert data != ""
|
||||
if type == "SpaceCharacters":
|
||||
data = data.strip(spaceCharacters)
|
||||
if data:
|
||||
raise LintError("Non-space character(s) found in SpaceCharacters token: %(token)r" % {"token": data})
|
||||
assert data.strip(spaceCharacters) == ""
|
||||
|
||||
elif type == "Doctype":
|
||||
name = token["name"]
|
||||
if contentModelFlag != "PCDATA":
|
||||
raise LintError("Doctype not in PCDATA content model flag: %(name)s" % {"name": name})
|
||||
if not isinstance(name, str):
|
||||
raise LintError("Tag name is not a string: %(tag)r" % {"tag": name})
|
||||
# XXX: what to do with token["data"] ?
|
||||
assert name is None or isinstance(name, text_type)
|
||||
assert token["publicId"] is None or isinstance(name, text_type)
|
||||
assert token["systemId"] is None or isinstance(name, text_type)
|
||||
|
||||
elif type in ("ParseError", "SerializeError"):
|
||||
pass
|
||||
elif type == "Entity":
|
||||
assert isinstance(token["name"], text_type)
|
||||
|
||||
elif type == "SerializerError":
|
||||
assert isinstance(token["data"], text_type)
|
||||
|
||||
else:
|
||||
raise LintError("Unknown token type: %(type)s" % {"type": type})
|
||||
assert False, "Unknown token type: %(type)s" % {"type": type}
|
||||
|
||||
yield token
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
|
||||
|
||||
class Filter(_base.Filter):
|
||||
class Filter(base.Filter):
|
||||
def slider(self):
|
||||
previous1 = previous2 = None
|
||||
for token in self.source:
|
||||
|
@ -11,7 +11,8 @@ class Filter(_base.Filter):
|
|||
yield previous2, previous1, token
|
||||
previous2 = previous1
|
||||
previous1 = token
|
||||
yield previous2, previous1, None
|
||||
if previous1 is not None:
|
||||
yield previous2, previous1, None
|
||||
|
||||
def __iter__(self):
|
||||
for previous, token, next in self.slider():
|
||||
|
|
|
@ -1,12 +1,865 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from . import _base
|
||||
from ..sanitizer import HTMLSanitizerMixin
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
from six.moves import urllib_parse as urlparse
|
||||
|
||||
from . import base
|
||||
from ..constants import namespaces, prefixes
|
||||
|
||||
__all__ = ["Filter"]
|
||||
|
||||
|
||||
class Filter(_base.Filter, HTMLSanitizerMixin):
|
||||
allowed_elements = frozenset((
|
||||
(namespaces['html'], 'a'),
|
||||
(namespaces['html'], 'abbr'),
|
||||
(namespaces['html'], 'acronym'),
|
||||
(namespaces['html'], 'address'),
|
||||
(namespaces['html'], 'area'),
|
||||
(namespaces['html'], 'article'),
|
||||
(namespaces['html'], 'aside'),
|
||||
(namespaces['html'], 'audio'),
|
||||
(namespaces['html'], 'b'),
|
||||
(namespaces['html'], 'big'),
|
||||
(namespaces['html'], 'blockquote'),
|
||||
(namespaces['html'], 'br'),
|
||||
(namespaces['html'], 'button'),
|
||||
(namespaces['html'], 'canvas'),
|
||||
(namespaces['html'], 'caption'),
|
||||
(namespaces['html'], 'center'),
|
||||
(namespaces['html'], 'cite'),
|
||||
(namespaces['html'], 'code'),
|
||||
(namespaces['html'], 'col'),
|
||||
(namespaces['html'], 'colgroup'),
|
||||
(namespaces['html'], 'command'),
|
||||
(namespaces['html'], 'datagrid'),
|
||||
(namespaces['html'], 'datalist'),
|
||||
(namespaces['html'], 'dd'),
|
||||
(namespaces['html'], 'del'),
|
||||
(namespaces['html'], 'details'),
|
||||
(namespaces['html'], 'dfn'),
|
||||
(namespaces['html'], 'dialog'),
|
||||
(namespaces['html'], 'dir'),
|
||||
(namespaces['html'], 'div'),
|
||||
(namespaces['html'], 'dl'),
|
||||
(namespaces['html'], 'dt'),
|
||||
(namespaces['html'], 'em'),
|
||||
(namespaces['html'], 'event-source'),
|
||||
(namespaces['html'], 'fieldset'),
|
||||
(namespaces['html'], 'figcaption'),
|
||||
(namespaces['html'], 'figure'),
|
||||
(namespaces['html'], 'footer'),
|
||||
(namespaces['html'], 'font'),
|
||||
(namespaces['html'], 'form'),
|
||||
(namespaces['html'], 'header'),
|
||||
(namespaces['html'], 'h1'),
|
||||
(namespaces['html'], 'h2'),
|
||||
(namespaces['html'], 'h3'),
|
||||
(namespaces['html'], 'h4'),
|
||||
(namespaces['html'], 'h5'),
|
||||
(namespaces['html'], 'h6'),
|
||||
(namespaces['html'], 'hr'),
|
||||
(namespaces['html'], 'i'),
|
||||
(namespaces['html'], 'img'),
|
||||
(namespaces['html'], 'input'),
|
||||
(namespaces['html'], 'ins'),
|
||||
(namespaces['html'], 'keygen'),
|
||||
(namespaces['html'], 'kbd'),
|
||||
(namespaces['html'], 'label'),
|
||||
(namespaces['html'], 'legend'),
|
||||
(namespaces['html'], 'li'),
|
||||
(namespaces['html'], 'm'),
|
||||
(namespaces['html'], 'map'),
|
||||
(namespaces['html'], 'menu'),
|
||||
(namespaces['html'], 'meter'),
|
||||
(namespaces['html'], 'multicol'),
|
||||
(namespaces['html'], 'nav'),
|
||||
(namespaces['html'], 'nextid'),
|
||||
(namespaces['html'], 'ol'),
|
||||
(namespaces['html'], 'output'),
|
||||
(namespaces['html'], 'optgroup'),
|
||||
(namespaces['html'], 'option'),
|
||||
(namespaces['html'], 'p'),
|
||||
(namespaces['html'], 'pre'),
|
||||
(namespaces['html'], 'progress'),
|
||||
(namespaces['html'], 'q'),
|
||||
(namespaces['html'], 's'),
|
||||
(namespaces['html'], 'samp'),
|
||||
(namespaces['html'], 'section'),
|
||||
(namespaces['html'], 'select'),
|
||||
(namespaces['html'], 'small'),
|
||||
(namespaces['html'], 'sound'),
|
||||
(namespaces['html'], 'source'),
|
||||
(namespaces['html'], 'spacer'),
|
||||
(namespaces['html'], 'span'),
|
||||
(namespaces['html'], 'strike'),
|
||||
(namespaces['html'], 'strong'),
|
||||
(namespaces['html'], 'sub'),
|
||||
(namespaces['html'], 'sup'),
|
||||
(namespaces['html'], 'table'),
|
||||
(namespaces['html'], 'tbody'),
|
||||
(namespaces['html'], 'td'),
|
||||
(namespaces['html'], 'textarea'),
|
||||
(namespaces['html'], 'time'),
|
||||
(namespaces['html'], 'tfoot'),
|
||||
(namespaces['html'], 'th'),
|
||||
(namespaces['html'], 'thead'),
|
||||
(namespaces['html'], 'tr'),
|
||||
(namespaces['html'], 'tt'),
|
||||
(namespaces['html'], 'u'),
|
||||
(namespaces['html'], 'ul'),
|
||||
(namespaces['html'], 'var'),
|
||||
(namespaces['html'], 'video'),
|
||||
(namespaces['mathml'], 'maction'),
|
||||
(namespaces['mathml'], 'math'),
|
||||
(namespaces['mathml'], 'merror'),
|
||||
(namespaces['mathml'], 'mfrac'),
|
||||
(namespaces['mathml'], 'mi'),
|
||||
(namespaces['mathml'], 'mmultiscripts'),
|
||||
(namespaces['mathml'], 'mn'),
|
||||
(namespaces['mathml'], 'mo'),
|
||||
(namespaces['mathml'], 'mover'),
|
||||
(namespaces['mathml'], 'mpadded'),
|
||||
(namespaces['mathml'], 'mphantom'),
|
||||
(namespaces['mathml'], 'mprescripts'),
|
||||
(namespaces['mathml'], 'mroot'),
|
||||
(namespaces['mathml'], 'mrow'),
|
||||
(namespaces['mathml'], 'mspace'),
|
||||
(namespaces['mathml'], 'msqrt'),
|
||||
(namespaces['mathml'], 'mstyle'),
|
||||
(namespaces['mathml'], 'msub'),
|
||||
(namespaces['mathml'], 'msubsup'),
|
||||
(namespaces['mathml'], 'msup'),
|
||||
(namespaces['mathml'], 'mtable'),
|
||||
(namespaces['mathml'], 'mtd'),
|
||||
(namespaces['mathml'], 'mtext'),
|
||||
(namespaces['mathml'], 'mtr'),
|
||||
(namespaces['mathml'], 'munder'),
|
||||
(namespaces['mathml'], 'munderover'),
|
||||
(namespaces['mathml'], 'none'),
|
||||
(namespaces['svg'], 'a'),
|
||||
(namespaces['svg'], 'animate'),
|
||||
(namespaces['svg'], 'animateColor'),
|
||||
(namespaces['svg'], 'animateMotion'),
|
||||
(namespaces['svg'], 'animateTransform'),
|
||||
(namespaces['svg'], 'clipPath'),
|
||||
(namespaces['svg'], 'circle'),
|
||||
(namespaces['svg'], 'defs'),
|
||||
(namespaces['svg'], 'desc'),
|
||||
(namespaces['svg'], 'ellipse'),
|
||||
(namespaces['svg'], 'font-face'),
|
||||
(namespaces['svg'], 'font-face-name'),
|
||||
(namespaces['svg'], 'font-face-src'),
|
||||
(namespaces['svg'], 'g'),
|
||||
(namespaces['svg'], 'glyph'),
|
||||
(namespaces['svg'], 'hkern'),
|
||||
(namespaces['svg'], 'linearGradient'),
|
||||
(namespaces['svg'], 'line'),
|
||||
(namespaces['svg'], 'marker'),
|
||||
(namespaces['svg'], 'metadata'),
|
||||
(namespaces['svg'], 'missing-glyph'),
|
||||
(namespaces['svg'], 'mpath'),
|
||||
(namespaces['svg'], 'path'),
|
||||
(namespaces['svg'], 'polygon'),
|
||||
(namespaces['svg'], 'polyline'),
|
||||
(namespaces['svg'], 'radialGradient'),
|
||||
(namespaces['svg'], 'rect'),
|
||||
(namespaces['svg'], 'set'),
|
||||
(namespaces['svg'], 'stop'),
|
||||
(namespaces['svg'], 'svg'),
|
||||
(namespaces['svg'], 'switch'),
|
||||
(namespaces['svg'], 'text'),
|
||||
(namespaces['svg'], 'title'),
|
||||
(namespaces['svg'], 'tspan'),
|
||||
(namespaces['svg'], 'use'),
|
||||
))
|
||||
|
||||
allowed_attributes = frozenset((
|
||||
# HTML attributes
|
||||
(None, 'abbr'),
|
||||
(None, 'accept'),
|
||||
(None, 'accept-charset'),
|
||||
(None, 'accesskey'),
|
||||
(None, 'action'),
|
||||
(None, 'align'),
|
||||
(None, 'alt'),
|
||||
(None, 'autocomplete'),
|
||||
(None, 'autofocus'),
|
||||
(None, 'axis'),
|
||||
(None, 'background'),
|
||||
(None, 'balance'),
|
||||
(None, 'bgcolor'),
|
||||
(None, 'bgproperties'),
|
||||
(None, 'border'),
|
||||
(None, 'bordercolor'),
|
||||
(None, 'bordercolordark'),
|
||||
(None, 'bordercolorlight'),
|
||||
(None, 'bottompadding'),
|
||||
(None, 'cellpadding'),
|
||||
(None, 'cellspacing'),
|
||||
(None, 'ch'),
|
||||
(None, 'challenge'),
|
||||
(None, 'char'),
|
||||
(None, 'charoff'),
|
||||
(None, 'choff'),
|
||||
(None, 'charset'),
|
||||
(None, 'checked'),
|
||||
(None, 'cite'),
|
||||
(None, 'class'),
|
||||
(None, 'clear'),
|
||||
(None, 'color'),
|
||||
(None, 'cols'),
|
||||
(None, 'colspan'),
|
||||
(None, 'compact'),
|
||||
(None, 'contenteditable'),
|
||||
(None, 'controls'),
|
||||
(None, 'coords'),
|
||||
(None, 'data'),
|
||||
(None, 'datafld'),
|
||||
(None, 'datapagesize'),
|
||||
(None, 'datasrc'),
|
||||
(None, 'datetime'),
|
||||
(None, 'default'),
|
||||
(None, 'delay'),
|
||||
(None, 'dir'),
|
||||
(None, 'disabled'),
|
||||
(None, 'draggable'),
|
||||
(None, 'dynsrc'),
|
||||
(None, 'enctype'),
|
||||
(None, 'end'),
|
||||
(None, 'face'),
|
||||
(None, 'for'),
|
||||
(None, 'form'),
|
||||
(None, 'frame'),
|
||||
(None, 'galleryimg'),
|
||||
(None, 'gutter'),
|
||||
(None, 'headers'),
|
||||
(None, 'height'),
|
||||
(None, 'hidefocus'),
|
||||
(None, 'hidden'),
|
||||
(None, 'high'),
|
||||
(None, 'href'),
|
||||
(None, 'hreflang'),
|
||||
(None, 'hspace'),
|
||||
(None, 'icon'),
|
||||
(None, 'id'),
|
||||
(None, 'inputmode'),
|
||||
(None, 'ismap'),
|
||||
(None, 'keytype'),
|
||||
(None, 'label'),
|
||||
(None, 'leftspacing'),
|
||||
(None, 'lang'),
|
||||
(None, 'list'),
|
||||
(None, 'longdesc'),
|
||||
(None, 'loop'),
|
||||
(None, 'loopcount'),
|
||||
(None, 'loopend'),
|
||||
(None, 'loopstart'),
|
||||
(None, 'low'),
|
||||
(None, 'lowsrc'),
|
||||
(None, 'max'),
|
||||
(None, 'maxlength'),
|
||||
(None, 'media'),
|
||||
(None, 'method'),
|
||||
(None, 'min'),
|
||||
(None, 'multiple'),
|
||||
(None, 'name'),
|
||||
(None, 'nohref'),
|
||||
(None, 'noshade'),
|
||||
(None, 'nowrap'),
|
||||
(None, 'open'),
|
||||
(None, 'optimum'),
|
||||
(None, 'pattern'),
|
||||
(None, 'ping'),
|
||||
(None, 'point-size'),
|
||||
(None, 'poster'),
|
||||
(None, 'pqg'),
|
||||
(None, 'preload'),
|
||||
(None, 'prompt'),
|
||||
(None, 'radiogroup'),
|
||||
(None, 'readonly'),
|
||||
(None, 'rel'),
|
||||
(None, 'repeat-max'),
|
||||
(None, 'repeat-min'),
|
||||
(None, 'replace'),
|
||||
(None, 'required'),
|
||||
(None, 'rev'),
|
||||
(None, 'rightspacing'),
|
||||
(None, 'rows'),
|
||||
(None, 'rowspan'),
|
||||
(None, 'rules'),
|
||||
(None, 'scope'),
|
||||
(None, 'selected'),
|
||||
(None, 'shape'),
|
||||
(None, 'size'),
|
||||
(None, 'span'),
|
||||
(None, 'src'),
|
||||
(None, 'start'),
|
||||
(None, 'step'),
|
||||
(None, 'style'),
|
||||
(None, 'summary'),
|
||||
(None, 'suppress'),
|
||||
(None, 'tabindex'),
|
||||
(None, 'target'),
|
||||
(None, 'template'),
|
||||
(None, 'title'),
|
||||
(None, 'toppadding'),
|
||||
(None, 'type'),
|
||||
(None, 'unselectable'),
|
||||
(None, 'usemap'),
|
||||
(None, 'urn'),
|
||||
(None, 'valign'),
|
||||
(None, 'value'),
|
||||
(None, 'variable'),
|
||||
(None, 'volume'),
|
||||
(None, 'vspace'),
|
||||
(None, 'vrml'),
|
||||
(None, 'width'),
|
||||
(None, 'wrap'),
|
||||
(namespaces['xml'], 'lang'),
|
||||
# MathML attributes
|
||||
(None, 'actiontype'),
|
||||
(None, 'align'),
|
||||
(None, 'columnalign'),
|
||||
(None, 'columnalign'),
|
||||
(None, 'columnalign'),
|
||||
(None, 'columnlines'),
|
||||
(None, 'columnspacing'),
|
||||
(None, 'columnspan'),
|
||||
(None, 'depth'),
|
||||
(None, 'display'),
|
||||
(None, 'displaystyle'),
|
||||
(None, 'equalcolumns'),
|
||||
(None, 'equalrows'),
|
||||
(None, 'fence'),
|
||||
(None, 'fontstyle'),
|
||||
(None, 'fontweight'),
|
||||
(None, 'frame'),
|
||||
(None, 'height'),
|
||||
(None, 'linethickness'),
|
||||
(None, 'lspace'),
|
||||
(None, 'mathbackground'),
|
||||
(None, 'mathcolor'),
|
||||
(None, 'mathvariant'),
|
||||
(None, 'mathvariant'),
|
||||
(None, 'maxsize'),
|
||||
(None, 'minsize'),
|
||||
(None, 'other'),
|
||||
(None, 'rowalign'),
|
||||
(None, 'rowalign'),
|
||||
(None, 'rowalign'),
|
||||
(None, 'rowlines'),
|
||||
(None, 'rowspacing'),
|
||||
(None, 'rowspan'),
|
||||
(None, 'rspace'),
|
||||
(None, 'scriptlevel'),
|
||||
(None, 'selection'),
|
||||
(None, 'separator'),
|
||||
(None, 'stretchy'),
|
||||
(None, 'width'),
|
||||
(None, 'width'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xlink'], 'show'),
|
||||
(namespaces['xlink'], 'type'),
|
||||
# SVG attributes
|
||||
(None, 'accent-height'),
|
||||
(None, 'accumulate'),
|
||||
(None, 'additive'),
|
||||
(None, 'alphabetic'),
|
||||
(None, 'arabic-form'),
|
||||
(None, 'ascent'),
|
||||
(None, 'attributeName'),
|
||||
(None, 'attributeType'),
|
||||
(None, 'baseProfile'),
|
||||
(None, 'bbox'),
|
||||
(None, 'begin'),
|
||||
(None, 'by'),
|
||||
(None, 'calcMode'),
|
||||
(None, 'cap-height'),
|
||||
(None, 'class'),
|
||||
(None, 'clip-path'),
|
||||
(None, 'color'),
|
||||
(None, 'color-rendering'),
|
||||
(None, 'content'),
|
||||
(None, 'cx'),
|
||||
(None, 'cy'),
|
||||
(None, 'd'),
|
||||
(None, 'dx'),
|
||||
(None, 'dy'),
|
||||
(None, 'descent'),
|
||||
(None, 'display'),
|
||||
(None, 'dur'),
|
||||
(None, 'end'),
|
||||
(None, 'fill'),
|
||||
(None, 'fill-opacity'),
|
||||
(None, 'fill-rule'),
|
||||
(None, 'font-family'),
|
||||
(None, 'font-size'),
|
||||
(None, 'font-stretch'),
|
||||
(None, 'font-style'),
|
||||
(None, 'font-variant'),
|
||||
(None, 'font-weight'),
|
||||
(None, 'from'),
|
||||
(None, 'fx'),
|
||||
(None, 'fy'),
|
||||
(None, 'g1'),
|
||||
(None, 'g2'),
|
||||
(None, 'glyph-name'),
|
||||
(None, 'gradientUnits'),
|
||||
(None, 'hanging'),
|
||||
(None, 'height'),
|
||||
(None, 'horiz-adv-x'),
|
||||
(None, 'horiz-origin-x'),
|
||||
(None, 'id'),
|
||||
(None, 'ideographic'),
|
||||
(None, 'k'),
|
||||
(None, 'keyPoints'),
|
||||
(None, 'keySplines'),
|
||||
(None, 'keyTimes'),
|
||||
(None, 'lang'),
|
||||
(None, 'marker-end'),
|
||||
(None, 'marker-mid'),
|
||||
(None, 'marker-start'),
|
||||
(None, 'markerHeight'),
|
||||
(None, 'markerUnits'),
|
||||
(None, 'markerWidth'),
|
||||
(None, 'mathematical'),
|
||||
(None, 'max'),
|
||||
(None, 'min'),
|
||||
(None, 'name'),
|
||||
(None, 'offset'),
|
||||
(None, 'opacity'),
|
||||
(None, 'orient'),
|
||||
(None, 'origin'),
|
||||
(None, 'overline-position'),
|
||||
(None, 'overline-thickness'),
|
||||
(None, 'panose-1'),
|
||||
(None, 'path'),
|
||||
(None, 'pathLength'),
|
||||
(None, 'points'),
|
||||
(None, 'preserveAspectRatio'),
|
||||
(None, 'r'),
|
||||
(None, 'refX'),
|
||||
(None, 'refY'),
|
||||
(None, 'repeatCount'),
|
||||
(None, 'repeatDur'),
|
||||
(None, 'requiredExtensions'),
|
||||
(None, 'requiredFeatures'),
|
||||
(None, 'restart'),
|
||||
(None, 'rotate'),
|
||||
(None, 'rx'),
|
||||
(None, 'ry'),
|
||||
(None, 'slope'),
|
||||
(None, 'stemh'),
|
||||
(None, 'stemv'),
|
||||
(None, 'stop-color'),
|
||||
(None, 'stop-opacity'),
|
||||
(None, 'strikethrough-position'),
|
||||
(None, 'strikethrough-thickness'),
|
||||
(None, 'stroke'),
|
||||
(None, 'stroke-dasharray'),
|
||||
(None, 'stroke-dashoffset'),
|
||||
(None, 'stroke-linecap'),
|
||||
(None, 'stroke-linejoin'),
|
||||
(None, 'stroke-miterlimit'),
|
||||
(None, 'stroke-opacity'),
|
||||
(None, 'stroke-width'),
|
||||
(None, 'systemLanguage'),
|
||||
(None, 'target'),
|
||||
(None, 'text-anchor'),
|
||||
(None, 'to'),
|
||||
(None, 'transform'),
|
||||
(None, 'type'),
|
||||
(None, 'u1'),
|
||||
(None, 'u2'),
|
||||
(None, 'underline-position'),
|
||||
(None, 'underline-thickness'),
|
||||
(None, 'unicode'),
|
||||
(None, 'unicode-range'),
|
||||
(None, 'units-per-em'),
|
||||
(None, 'values'),
|
||||
(None, 'version'),
|
||||
(None, 'viewBox'),
|
||||
(None, 'visibility'),
|
||||
(None, 'width'),
|
||||
(None, 'widths'),
|
||||
(None, 'x'),
|
||||
(None, 'x-height'),
|
||||
(None, 'x1'),
|
||||
(None, 'x2'),
|
||||
(namespaces['xlink'], 'actuate'),
|
||||
(namespaces['xlink'], 'arcrole'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xlink'], 'role'),
|
||||
(namespaces['xlink'], 'show'),
|
||||
(namespaces['xlink'], 'title'),
|
||||
(namespaces['xlink'], 'type'),
|
||||
(namespaces['xml'], 'base'),
|
||||
(namespaces['xml'], 'lang'),
|
||||
(namespaces['xml'], 'space'),
|
||||
(None, 'y'),
|
||||
(None, 'y1'),
|
||||
(None, 'y2'),
|
||||
(None, 'zoomAndPan'),
|
||||
))
|
||||
|
||||
attr_val_is_uri = frozenset((
|
||||
(None, 'href'),
|
||||
(None, 'src'),
|
||||
(None, 'cite'),
|
||||
(None, 'action'),
|
||||
(None, 'longdesc'),
|
||||
(None, 'poster'),
|
||||
(None, 'background'),
|
||||
(None, 'datasrc'),
|
||||
(None, 'dynsrc'),
|
||||
(None, 'lowsrc'),
|
||||
(None, 'ping'),
|
||||
(namespaces['xlink'], 'href'),
|
||||
(namespaces['xml'], 'base'),
|
||||
))
|
||||
|
||||
svg_attr_val_allows_ref = frozenset((
|
||||
(None, 'clip-path'),
|
||||
(None, 'color-profile'),
|
||||
(None, 'cursor'),
|
||||
(None, 'fill'),
|
||||
(None, 'filter'),
|
||||
(None, 'marker'),
|
||||
(None, 'marker-start'),
|
||||
(None, 'marker-mid'),
|
||||
(None, 'marker-end'),
|
||||
(None, 'mask'),
|
||||
(None, 'stroke'),
|
||||
))
|
||||
|
||||
svg_allow_local_href = frozenset((
|
||||
(None, 'altGlyph'),
|
||||
(None, 'animate'),
|
||||
(None, 'animateColor'),
|
||||
(None, 'animateMotion'),
|
||||
(None, 'animateTransform'),
|
||||
(None, 'cursor'),
|
||||
(None, 'feImage'),
|
||||
(None, 'filter'),
|
||||
(None, 'linearGradient'),
|
||||
(None, 'pattern'),
|
||||
(None, 'radialGradient'),
|
||||
(None, 'textpath'),
|
||||
(None, 'tref'),
|
||||
(None, 'set'),
|
||||
(None, 'use')
|
||||
))
|
||||
|
||||
allowed_css_properties = frozenset((
|
||||
'azimuth',
|
||||
'background-color',
|
||||
'border-bottom-color',
|
||||
'border-collapse',
|
||||
'border-color',
|
||||
'border-left-color',
|
||||
'border-right-color',
|
||||
'border-top-color',
|
||||
'clear',
|
||||
'color',
|
||||
'cursor',
|
||||
'direction',
|
||||
'display',
|
||||
'elevation',
|
||||
'float',
|
||||
'font',
|
||||
'font-family',
|
||||
'font-size',
|
||||
'font-style',
|
||||
'font-variant',
|
||||
'font-weight',
|
||||
'height',
|
||||
'letter-spacing',
|
||||
'line-height',
|
||||
'overflow',
|
||||
'pause',
|
||||
'pause-after',
|
||||
'pause-before',
|
||||
'pitch',
|
||||
'pitch-range',
|
||||
'richness',
|
||||
'speak',
|
||||
'speak-header',
|
||||
'speak-numeral',
|
||||
'speak-punctuation',
|
||||
'speech-rate',
|
||||
'stress',
|
||||
'text-align',
|
||||
'text-decoration',
|
||||
'text-indent',
|
||||
'unicode-bidi',
|
||||
'vertical-align',
|
||||
'voice-family',
|
||||
'volume',
|
||||
'white-space',
|
||||
'width',
|
||||
))
|
||||
|
||||
allowed_css_keywords = frozenset((
|
||||
'auto',
|
||||
'aqua',
|
||||
'black',
|
||||
'block',
|
||||
'blue',
|
||||
'bold',
|
||||
'both',
|
||||
'bottom',
|
||||
'brown',
|
||||
'center',
|
||||
'collapse',
|
||||
'dashed',
|
||||
'dotted',
|
||||
'fuchsia',
|
||||
'gray',
|
||||
'green',
|
||||
'!important',
|
||||
'italic',
|
||||
'left',
|
||||
'lime',
|
||||
'maroon',
|
||||
'medium',
|
||||
'none',
|
||||
'navy',
|
||||
'normal',
|
||||
'nowrap',
|
||||
'olive',
|
||||
'pointer',
|
||||
'purple',
|
||||
'red',
|
||||
'right',
|
||||
'solid',
|
||||
'silver',
|
||||
'teal',
|
||||
'top',
|
||||
'transparent',
|
||||
'underline',
|
||||
'white',
|
||||
'yellow',
|
||||
))
|
||||
|
||||
allowed_svg_properties = frozenset((
|
||||
'fill',
|
||||
'fill-opacity',
|
||||
'fill-rule',
|
||||
'stroke',
|
||||
'stroke-width',
|
||||
'stroke-linecap',
|
||||
'stroke-linejoin',
|
||||
'stroke-opacity',
|
||||
))
|
||||
|
||||
allowed_protocols = frozenset((
|
||||
'ed2k',
|
||||
'ftp',
|
||||
'http',
|
||||
'https',
|
||||
'irc',
|
||||
'mailto',
|
||||
'news',
|
||||
'gopher',
|
||||
'nntp',
|
||||
'telnet',
|
||||
'webcal',
|
||||
'xmpp',
|
||||
'callto',
|
||||
'feed',
|
||||
'urn',
|
||||
'aim',
|
||||
'rsync',
|
||||
'tag',
|
||||
'ssh',
|
||||
'sftp',
|
||||
'rtsp',
|
||||
'afs',
|
||||
'data',
|
||||
))
|
||||
|
||||
allowed_content_types = frozenset((
|
||||
'image/png',
|
||||
'image/jpeg',
|
||||
'image/gif',
|
||||
'image/webp',
|
||||
'image/bmp',
|
||||
'text/plain',
|
||||
))
|
||||
|
||||
|
||||
data_content_type = re.compile(r'''
|
||||
^
|
||||
# Match a content type <application>/<type>
|
||||
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
|
||||
# Match any character set and encoding
|
||||
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|
||||
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
|
||||
# Assume the rest is data
|
||||
,.*
|
||||
$
|
||||
''',
|
||||
re.VERBOSE)
|
||||
|
||||
|
||||
class Filter(base.Filter):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
def __init__(self,
|
||||
source,
|
||||
allowed_elements=allowed_elements,
|
||||
allowed_attributes=allowed_attributes,
|
||||
allowed_css_properties=allowed_css_properties,
|
||||
allowed_css_keywords=allowed_css_keywords,
|
||||
allowed_svg_properties=allowed_svg_properties,
|
||||
allowed_protocols=allowed_protocols,
|
||||
allowed_content_types=allowed_content_types,
|
||||
attr_val_is_uri=attr_val_is_uri,
|
||||
svg_attr_val_allows_ref=svg_attr_val_allows_ref,
|
||||
svg_allow_local_href=svg_allow_local_href):
|
||||
super(Filter, self).__init__(source)
|
||||
self.allowed_elements = allowed_elements
|
||||
self.allowed_attributes = allowed_attributes
|
||||
self.allowed_css_properties = allowed_css_properties
|
||||
self.allowed_css_keywords = allowed_css_keywords
|
||||
self.allowed_svg_properties = allowed_svg_properties
|
||||
self.allowed_protocols = allowed_protocols
|
||||
self.allowed_content_types = allowed_content_types
|
||||
self.attr_val_is_uri = attr_val_is_uri
|
||||
self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
|
||||
self.svg_allow_local_href = svg_allow_local_href
|
||||
|
||||
def __iter__(self):
|
||||
for token in _base.Filter.__iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token:
|
||||
yield token
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, # specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_token(self, token):
|
||||
|
||||
# accommodate filters which use token_type differently
|
||||
token_type = token["type"]
|
||||
if token_type in ("StartTag", "EndTag", "EmptyTag"):
|
||||
name = token["name"]
|
||||
namespace = token["namespace"]
|
||||
if ((namespace, name) in self.allowed_elements or
|
||||
(namespace is None and
|
||||
(namespaces["html"], name) in self.allowed_elements)):
|
||||
return self.allowed_token(token)
|
||||
else:
|
||||
return self.disallowed_token(token)
|
||||
elif token_type == "Comment":
|
||||
pass
|
||||
else:
|
||||
return token
|
||||
|
||||
def allowed_token(self, token):
|
||||
if "data" in token:
|
||||
attrs = token["data"]
|
||||
attr_names = set(attrs.keys())
|
||||
|
||||
# Remove forbidden attributes
|
||||
for to_remove in (attr_names - self.allowed_attributes):
|
||||
del token["data"][to_remove]
|
||||
attr_names.remove(to_remove)
|
||||
|
||||
# Remove attributes with disallowed URL values
|
||||
for attr in (attr_names & self.attr_val_is_uri):
|
||||
assert attr in attrs
|
||||
# I don't have a clue where this regexp comes from or why it matches those
|
||||
# characters, nor why we call unescape. I just know it's always been here.
|
||||
# Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
|
||||
# this will do is remove *more* than it otherwise would.
|
||||
val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
# remove replacement characters from unescaped characters
|
||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||
try:
|
||||
uri = urlparse.urlparse(val_unescaped)
|
||||
except ValueError:
|
||||
uri = None
|
||||
del attrs[attr]
|
||||
if uri and uri.scheme:
|
||||
if uri.scheme not in self.allowed_protocols:
|
||||
del attrs[attr]
|
||||
if uri.scheme == 'data':
|
||||
m = data_content_type.match(uri.path)
|
||||
if not m:
|
||||
del attrs[attr]
|
||||
elif m.group('content_type') not in self.allowed_content_types:
|
||||
del attrs[attr]
|
||||
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
' ',
|
||||
unescape(attrs[attr]))
|
||||
if (token["name"] in self.svg_allow_local_href and
|
||||
(namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*',
|
||||
attrs[(namespaces['xlink'], 'href')])):
|
||||
del attrs[(namespaces['xlink'], 'href')]
|
||||
if (None, 'style') in attrs:
|
||||
attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
|
||||
token["data"] = attrs
|
||||
return token
|
||||
|
||||
def disallowed_token(self, token):
|
||||
token_type = token["type"]
|
||||
if token_type == "EndTag":
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
assert token_type in ("StartTag", "EmptyTag")
|
||||
attrs = []
|
||||
for (ns, name), v in token["data"].items():
|
||||
attrs.append(' %s="%s"' % (name if ns is None else "%s:%s" % (prefixes[ns], name), escape(v)))
|
||||
token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token.get("selfClosing"):
|
||||
token["data"] = token["data"][:-1] + "/>"
|
||||
|
||||
token["type"] = "Characters"
|
||||
|
||||
del token["name"]
|
||||
return token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
# disallow urls
|
||||
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
||||
|
||||
# gauntlet
|
||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
|
||||
return ''
|
||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||
return ''
|
||||
|
||||
clean = []
|
||||
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
|
||||
if not value:
|
||||
continue
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
||||
'padding']:
|
||||
for keyword in value.split():
|
||||
if keyword not in self.allowed_css_keywords and \
|
||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa
|
||||
break
|
||||
else:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.lower() in self.allowed_svg_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
|
||||
return ' '.join(clean)
|
||||
|
|
|
@ -2,20 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
|
|||
|
||||
import re
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
from ..constants import rcdataElements, spaceCharacters
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
|
||||
SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
|
||||
|
||||
|
||||
class Filter(_base.Filter):
|
||||
class Filter(base.Filter):
|
||||
|
||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||
|
||||
def __iter__(self):
|
||||
preserve = 0
|
||||
for token in _base.Filter.__iter__(self):
|
||||
for token in base.Filter.__iter__(self):
|
||||
type = token["type"]
|
||||
if type == "StartTag" \
|
||||
and (preserve or token["name"] in self.spacePreserveElements):
|
||||
|
|
|
@ -1,39 +1,44 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import with_metaclass
|
||||
from six import with_metaclass, viewkeys, PY3
|
||||
|
||||
import types
|
||||
|
||||
from . import inputstream
|
||||
from . import tokenizer
|
||||
try:
|
||||
from collections import OrderedDict
|
||||
except ImportError:
|
||||
from ordereddict import OrderedDict
|
||||
|
||||
from . import _inputstream
|
||||
from . import _tokenizer
|
||||
|
||||
from . import treebuilders
|
||||
from .treebuilders._base import Marker
|
||||
from .treebuilders.base import Marker
|
||||
|
||||
from . import utils
|
||||
from . import constants
|
||||
from .constants import spaceCharacters, asciiUpper2Lower
|
||||
from .constants import specialElements
|
||||
from .constants import headingElements
|
||||
from .constants import cdataElements, rcdataElements
|
||||
from .constants import tokenTypes, ReparseException, namespaces
|
||||
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
|
||||
from .constants import adjustForeignAttributes as adjustForeignAttributesMap
|
||||
from .constants import E
|
||||
from . import _utils
|
||||
from .constants import (
|
||||
spaceCharacters, asciiUpper2Lower,
|
||||
specialElements, headingElements, cdataElements, rcdataElements,
|
||||
tokenTypes, tagTokenTypes,
|
||||
namespaces,
|
||||
htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
|
||||
adjustForeignAttributes as adjustForeignAttributesMap,
|
||||
adjustMathMLAttributes, adjustSVGAttributes,
|
||||
E,
|
||||
ReparseException
|
||||
)
|
||||
|
||||
|
||||
def parse(doc, treebuilder="etree", encoding=None,
|
||||
namespaceHTMLElements=True):
|
||||
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
|
||||
"""Parse a string or file-like object into a tree"""
|
||||
tb = treebuilders.getTreeBuilder(treebuilder)
|
||||
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
|
||||
return p.parse(doc, encoding=encoding)
|
||||
return p.parse(doc, **kwargs)
|
||||
|
||||
|
||||
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
|
||||
namespaceHTMLElements=True):
|
||||
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
|
||||
tb = treebuilders.getTreeBuilder(treebuilder)
|
||||
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
|
||||
return p.parseFragment(doc, container=container, encoding=encoding)
|
||||
return p.parseFragment(doc, container=container, **kwargs)
|
||||
|
||||
|
||||
def method_decorator_metaclass(function):
|
||||
|
@ -52,18 +57,13 @@ class HTMLParser(object):
|
|||
"""HTML parser. Generates a tree structure from a stream of (possibly
|
||||
malformed) HTML"""
|
||||
|
||||
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
|
||||
strict=False, namespaceHTMLElements=True, debug=False):
|
||||
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
|
||||
"""
|
||||
strict - raise an exception when a parse error is encountered
|
||||
|
||||
tree - a treebuilder class controlling the type of tree that will be
|
||||
returned. Built in treebuilders can be accessed through
|
||||
html5lib.treebuilders.getTreeBuilder(treeType)
|
||||
|
||||
tokenizer - a class that provides a stream of tokens to the treebuilder.
|
||||
This may be replaced for e.g. a sanitizer which converts some tags to
|
||||
text
|
||||
"""
|
||||
|
||||
# Raise an exception on the first error encountered
|
||||
|
@ -72,29 +72,24 @@ class HTMLParser(object):
|
|||
if tree is None:
|
||||
tree = treebuilders.getTreeBuilder("etree")
|
||||
self.tree = tree(namespaceHTMLElements)
|
||||
self.tokenizer_class = tokenizer
|
||||
self.errors = []
|
||||
|
||||
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
|
||||
getPhases(debug).items()])
|
||||
|
||||
def _parse(self, stream, innerHTML=False, container="div",
|
||||
encoding=None, parseMeta=True, useChardet=True, **kwargs):
|
||||
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
|
||||
|
||||
self.innerHTMLMode = innerHTML
|
||||
self.container = container
|
||||
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
|
||||
parseMeta=parseMeta,
|
||||
useChardet=useChardet,
|
||||
parser=self, **kwargs)
|
||||
self.scripting = scripting
|
||||
self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
|
||||
self.reset()
|
||||
|
||||
while True:
|
||||
try:
|
||||
self.mainLoop()
|
||||
break
|
||||
except ReparseException:
|
||||
self.reset()
|
||||
try:
|
||||
self.mainLoop()
|
||||
except ReparseException:
|
||||
self.reset()
|
||||
self.mainLoop()
|
||||
|
||||
def reset(self):
|
||||
self.tree.reset()
|
||||
|
@ -121,7 +116,7 @@ class HTMLParser(object):
|
|||
self.phase.insertHtmlElement()
|
||||
self.resetInsertionMode()
|
||||
else:
|
||||
self.innerHTML = False
|
||||
self.innerHTML = False # pylint:disable=redefined-variable-type
|
||||
self.phase = self.phases["initial"]
|
||||
|
||||
self.lastPhase = None
|
||||
|
@ -139,7 +134,7 @@ class HTMLParser(object):
|
|||
"""
|
||||
if not hasattr(self, 'tokenizer'):
|
||||
return None
|
||||
return self.tokenizer.stream.charEncoding[0]
|
||||
return self.tokenizer.stream.charEncoding[0].name
|
||||
|
||||
def isHTMLIntegrationPoint(self, element):
|
||||
if (element.name == "annotation-xml" and
|
||||
|
@ -164,8 +159,10 @@ class HTMLParser(object):
|
|||
ParseErrorToken = tokenTypes["ParseError"]
|
||||
|
||||
for token in self.normalizedTokens():
|
||||
prev_token = None
|
||||
new_token = token
|
||||
while new_token is not None:
|
||||
prev_token = new_token
|
||||
currentNode = self.tree.openElements[-1] if self.tree.openElements else None
|
||||
currentNodeNamespace = currentNode.namespace if currentNode else None
|
||||
currentNodeName = currentNode.name if currentNode else None
|
||||
|
@ -184,6 +181,7 @@ class HTMLParser(object):
|
|||
type in (CharactersToken, SpaceCharactersToken))) or
|
||||
(currentNodeNamespace == namespaces["mathml"] and
|
||||
currentNodeName == "annotation-xml" and
|
||||
type == StartTagToken and
|
||||
token["name"] == "svg") or
|
||||
(self.isHTMLIntegrationPoint(currentNode) and
|
||||
type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
|
||||
|
@ -204,10 +202,10 @@ class HTMLParser(object):
|
|||
elif type == DoctypeToken:
|
||||
new_token = phase.processDoctype(new_token)
|
||||
|
||||
if (type == StartTagToken and token["selfClosing"]
|
||||
and not token["selfClosingAcknowledged"]):
|
||||
if (type == StartTagToken and prev_token["selfClosing"] and
|
||||
not prev_token["selfClosingAcknowledged"]):
|
||||
self.parseError("non-void-element-with-trailing-solidus",
|
||||
{"name": token["name"]})
|
||||
{"name": prev_token["name"]})
|
||||
|
||||
# When the loop finishes it's EOF
|
||||
reprocess = True
|
||||
|
@ -222,7 +220,7 @@ class HTMLParser(object):
|
|||
for token in self.tokenizer:
|
||||
yield self.normalizeToken(token)
|
||||
|
||||
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
|
||||
def parse(self, stream, *args, **kwargs):
|
||||
"""Parse a HTML document into a well-formed tree
|
||||
|
||||
stream - a filelike object or string containing the HTML to be parsed
|
||||
|
@ -231,13 +229,13 @@ class HTMLParser(object):
|
|||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
scripting - treat noscript elements as if javascript was turned on
|
||||
"""
|
||||
self._parse(stream, innerHTML=False, encoding=encoding,
|
||||
parseMeta=parseMeta, useChardet=useChardet)
|
||||
self._parse(stream, False, None, *args, **kwargs)
|
||||
return self.tree.getDocument()
|
||||
|
||||
def parseFragment(self, stream, container="div", encoding=None,
|
||||
parseMeta=False, useChardet=True):
|
||||
def parseFragment(self, stream, *args, **kwargs):
|
||||
"""Parse a HTML fragment into a well-formed tree fragment
|
||||
|
||||
container - name of the element we're setting the innerHTML property
|
||||
|
@ -249,12 +247,16 @@ class HTMLParser(object):
|
|||
the encoding. If specified, that encoding will be used,
|
||||
regardless of any BOM or later declaration (such as in a meta
|
||||
element)
|
||||
|
||||
scripting - treat noscript elements as if javascript was turned on
|
||||
"""
|
||||
self._parse(stream, True, container=container, encoding=encoding)
|
||||
self._parse(stream, True, *args, **kwargs)
|
||||
return self.tree.getFragment()
|
||||
|
||||
def parseError(self, errorcode="XXX-undefined-error", datavars={}):
|
||||
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
|
||||
# XXX The idea is to make errorcode mandatory.
|
||||
if datavars is None:
|
||||
datavars = {}
|
||||
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
|
||||
if self.strict:
|
||||
raise ParseError(E[errorcode] % datavars)
|
||||
|
@ -263,98 +265,25 @@ class HTMLParser(object):
|
|||
""" HTML5 specific normalizations to the token stream """
|
||||
|
||||
if token["type"] == tokenTypes["StartTag"]:
|
||||
token["data"] = dict(token["data"][::-1])
|
||||
raw = token["data"]
|
||||
token["data"] = OrderedDict(raw)
|
||||
if len(raw) > len(token["data"]):
|
||||
# we had some duplicated attribute, fix so first wins
|
||||
token["data"].update(raw[::-1])
|
||||
|
||||
return token
|
||||
|
||||
def adjustMathMLAttributes(self, token):
|
||||
replacements = {"definitionurl": "definitionURL"}
|
||||
for k, v in replacements.items():
|
||||
if k in token["data"]:
|
||||
token["data"][v] = token["data"][k]
|
||||
del token["data"][k]
|
||||
adjust_attributes(token, adjustMathMLAttributes)
|
||||
|
||||
def adjustSVGAttributes(self, token):
|
||||
replacements = {
|
||||
"attributename": "attributeName",
|
||||
"attributetype": "attributeType",
|
||||
"basefrequency": "baseFrequency",
|
||||
"baseprofile": "baseProfile",
|
||||
"calcmode": "calcMode",
|
||||
"clippathunits": "clipPathUnits",
|
||||
"contentscripttype": "contentScriptType",
|
||||
"contentstyletype": "contentStyleType",
|
||||
"diffuseconstant": "diffuseConstant",
|
||||
"edgemode": "edgeMode",
|
||||
"externalresourcesrequired": "externalResourcesRequired",
|
||||
"filterres": "filterRes",
|
||||
"filterunits": "filterUnits",
|
||||
"glyphref": "glyphRef",
|
||||
"gradienttransform": "gradientTransform",
|
||||
"gradientunits": "gradientUnits",
|
||||
"kernelmatrix": "kernelMatrix",
|
||||
"kernelunitlength": "kernelUnitLength",
|
||||
"keypoints": "keyPoints",
|
||||
"keysplines": "keySplines",
|
||||
"keytimes": "keyTimes",
|
||||
"lengthadjust": "lengthAdjust",
|
||||
"limitingconeangle": "limitingConeAngle",
|
||||
"markerheight": "markerHeight",
|
||||
"markerunits": "markerUnits",
|
||||
"markerwidth": "markerWidth",
|
||||
"maskcontentunits": "maskContentUnits",
|
||||
"maskunits": "maskUnits",
|
||||
"numoctaves": "numOctaves",
|
||||
"pathlength": "pathLength",
|
||||
"patterncontentunits": "patternContentUnits",
|
||||
"patterntransform": "patternTransform",
|
||||
"patternunits": "patternUnits",
|
||||
"pointsatx": "pointsAtX",
|
||||
"pointsaty": "pointsAtY",
|
||||
"pointsatz": "pointsAtZ",
|
||||
"preservealpha": "preserveAlpha",
|
||||
"preserveaspectratio": "preserveAspectRatio",
|
||||
"primitiveunits": "primitiveUnits",
|
||||
"refx": "refX",
|
||||
"refy": "refY",
|
||||
"repeatcount": "repeatCount",
|
||||
"repeatdur": "repeatDur",
|
||||
"requiredextensions": "requiredExtensions",
|
||||
"requiredfeatures": "requiredFeatures",
|
||||
"specularconstant": "specularConstant",
|
||||
"specularexponent": "specularExponent",
|
||||
"spreadmethod": "spreadMethod",
|
||||
"startoffset": "startOffset",
|
||||
"stddeviation": "stdDeviation",
|
||||
"stitchtiles": "stitchTiles",
|
||||
"surfacescale": "surfaceScale",
|
||||
"systemlanguage": "systemLanguage",
|
||||
"tablevalues": "tableValues",
|
||||
"targetx": "targetX",
|
||||
"targety": "targetY",
|
||||
"textlength": "textLength",
|
||||
"viewbox": "viewBox",
|
||||
"viewtarget": "viewTarget",
|
||||
"xchannelselector": "xChannelSelector",
|
||||
"ychannelselector": "yChannelSelector",
|
||||
"zoomandpan": "zoomAndPan"
|
||||
}
|
||||
for originalName in list(token["data"].keys()):
|
||||
if originalName in replacements:
|
||||
svgName = replacements[originalName]
|
||||
token["data"][svgName] = token["data"][originalName]
|
||||
del token["data"][originalName]
|
||||
adjust_attributes(token, adjustSVGAttributes)
|
||||
|
||||
def adjustForeignAttributes(self, token):
|
||||
replacements = adjustForeignAttributesMap
|
||||
|
||||
for originalName in token["data"].keys():
|
||||
if originalName in replacements:
|
||||
foreignName = replacements[originalName]
|
||||
token["data"][foreignName] = token["data"][originalName]
|
||||
del token["data"][originalName]
|
||||
adjust_attributes(token, adjustForeignAttributesMap)
|
||||
|
||||
def reparseTokenNormal(self, token):
|
||||
# pylint:disable=unused-argument
|
||||
self.parser.phase()
|
||||
|
||||
def resetInsertionMode(self):
|
||||
|
@ -419,11 +348,12 @@ class HTMLParser(object):
|
|||
self.phase = self.phases["text"]
|
||||
|
||||
|
||||
@_utils.memoize
|
||||
def getPhases(debug):
|
||||
def log(function):
|
||||
"""Logger that records which phase processes each token"""
|
||||
type_names = dict((value, key) for key, value in
|
||||
constants.tokenTypes.items())
|
||||
tokenTypes.items())
|
||||
|
||||
def wrapped(self, *args, **kwargs):
|
||||
if function.__name__.startswith("process") and len(args) > 0:
|
||||
|
@ -432,7 +362,7 @@ def getPhases(debug):
|
|||
info = {"type": type_names[token['type']]}
|
||||
except:
|
||||
raise
|
||||
if token['type'] in constants.tagTokenTypes:
|
||||
if token['type'] in tagTokenTypes:
|
||||
info["name"] = token['name']
|
||||
|
||||
self.parser.log.append((self.parser.tokenizer.state.__name__,
|
||||
|
@ -451,6 +381,7 @@ def getPhases(debug):
|
|||
else:
|
||||
return type
|
||||
|
||||
# pylint:disable=unused-argument
|
||||
class Phase(with_metaclass(getMetaclass(debug, log))):
|
||||
"""Base class for helper object that implements each phase of processing
|
||||
"""
|
||||
|
@ -517,77 +448,76 @@ def getPhases(debug):
|
|||
if publicId != "":
|
||||
publicId = publicId.translate(asciiUpper2Lower)
|
||||
|
||||
if (not correct or token["name"] != "html"
|
||||
or publicId.startswith(
|
||||
("+//silmaril//dtd html pro v0r11 19970101//",
|
||||
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
|
||||
"-//as//dtd html 3.0 aswedit + extensions//",
|
||||
"-//ietf//dtd html 2.0 level 1//",
|
||||
"-//ietf//dtd html 2.0 level 2//",
|
||||
"-//ietf//dtd html 2.0 strict level 1//",
|
||||
"-//ietf//dtd html 2.0 strict level 2//",
|
||||
"-//ietf//dtd html 2.0 strict//",
|
||||
"-//ietf//dtd html 2.0//",
|
||||
"-//ietf//dtd html 2.1e//",
|
||||
"-//ietf//dtd html 3.0//",
|
||||
"-//ietf//dtd html 3.2 final//",
|
||||
"-//ietf//dtd html 3.2//",
|
||||
"-//ietf//dtd html 3//",
|
||||
"-//ietf//dtd html level 0//",
|
||||
"-//ietf//dtd html level 1//",
|
||||
"-//ietf//dtd html level 2//",
|
||||
"-//ietf//dtd html level 3//",
|
||||
"-//ietf//dtd html strict level 0//",
|
||||
"-//ietf//dtd html strict level 1//",
|
||||
"-//ietf//dtd html strict level 2//",
|
||||
"-//ietf//dtd html strict level 3//",
|
||||
"-//ietf//dtd html strict//",
|
||||
"-//ietf//dtd html//",
|
||||
"-//metrius//dtd metrius presentational//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html//",
|
||||
"-//microsoft//dtd internet explorer 2.0 tables//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html//",
|
||||
"-//microsoft//dtd internet explorer 3.0 tables//",
|
||||
"-//netscape comm. corp.//dtd html//",
|
||||
"-//netscape comm. corp.//dtd strict html//",
|
||||
"-//o'reilly and associates//dtd html 2.0//",
|
||||
"-//o'reilly and associates//dtd html extended 1.0//",
|
||||
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
|
||||
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
|
||||
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
|
||||
"-//spyglass//dtd html 2.0 extended//",
|
||||
"-//sq//dtd html 2.0 hotmetal + extensions//",
|
||||
"-//sun microsystems corp.//dtd hotjava html//",
|
||||
"-//sun microsystems corp.//dtd hotjava strict html//",
|
||||
"-//w3c//dtd html 3 1995-03-24//",
|
||||
"-//w3c//dtd html 3.2 draft//",
|
||||
"-//w3c//dtd html 3.2 final//",
|
||||
"-//w3c//dtd html 3.2//",
|
||||
"-//w3c//dtd html 3.2s draft//",
|
||||
"-//w3c//dtd html 4.0 frameset//",
|
||||
"-//w3c//dtd html 4.0 transitional//",
|
||||
"-//w3c//dtd html experimental 19960712//",
|
||||
"-//w3c//dtd html experimental 970421//",
|
||||
"-//w3c//dtd w3 html//",
|
||||
"-//w3o//dtd w3 html 3.0//",
|
||||
"-//webtechs//dtd mozilla html 2.0//",
|
||||
"-//webtechs//dtd mozilla html//"))
|
||||
or publicId in
|
||||
("-//w3o//dtd w3 html strict 3.0//en//",
|
||||
"-/w3c/dtd html 4.0 transitional/en",
|
||||
"html")
|
||||
or publicId.startswith(
|
||||
("-//w3c//dtd html 4.01 frameset//",
|
||||
"-//w3c//dtd html 4.01 transitional//")) and
|
||||
systemId is None
|
||||
or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
|
||||
if (not correct or token["name"] != "html" or
|
||||
publicId.startswith(
|
||||
("+//silmaril//dtd html pro v0r11 19970101//",
|
||||
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
|
||||
"-//as//dtd html 3.0 aswedit + extensions//",
|
||||
"-//ietf//dtd html 2.0 level 1//",
|
||||
"-//ietf//dtd html 2.0 level 2//",
|
||||
"-//ietf//dtd html 2.0 strict level 1//",
|
||||
"-//ietf//dtd html 2.0 strict level 2//",
|
||||
"-//ietf//dtd html 2.0 strict//",
|
||||
"-//ietf//dtd html 2.0//",
|
||||
"-//ietf//dtd html 2.1e//",
|
||||
"-//ietf//dtd html 3.0//",
|
||||
"-//ietf//dtd html 3.2 final//",
|
||||
"-//ietf//dtd html 3.2//",
|
||||
"-//ietf//dtd html 3//",
|
||||
"-//ietf//dtd html level 0//",
|
||||
"-//ietf//dtd html level 1//",
|
||||
"-//ietf//dtd html level 2//",
|
||||
"-//ietf//dtd html level 3//",
|
||||
"-//ietf//dtd html strict level 0//",
|
||||
"-//ietf//dtd html strict level 1//",
|
||||
"-//ietf//dtd html strict level 2//",
|
||||
"-//ietf//dtd html strict level 3//",
|
||||
"-//ietf//dtd html strict//",
|
||||
"-//ietf//dtd html//",
|
||||
"-//metrius//dtd metrius presentational//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 2.0 html//",
|
||||
"-//microsoft//dtd internet explorer 2.0 tables//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html strict//",
|
||||
"-//microsoft//dtd internet explorer 3.0 html//",
|
||||
"-//microsoft//dtd internet explorer 3.0 tables//",
|
||||
"-//netscape comm. corp.//dtd html//",
|
||||
"-//netscape comm. corp.//dtd strict html//",
|
||||
"-//o'reilly and associates//dtd html 2.0//",
|
||||
"-//o'reilly and associates//dtd html extended 1.0//",
|
||||
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
|
||||
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
|
||||
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
|
||||
"-//spyglass//dtd html 2.0 extended//",
|
||||
"-//sq//dtd html 2.0 hotmetal + extensions//",
|
||||
"-//sun microsystems corp.//dtd hotjava html//",
|
||||
"-//sun microsystems corp.//dtd hotjava strict html//",
|
||||
"-//w3c//dtd html 3 1995-03-24//",
|
||||
"-//w3c//dtd html 3.2 draft//",
|
||||
"-//w3c//dtd html 3.2 final//",
|
||||
"-//w3c//dtd html 3.2//",
|
||||
"-//w3c//dtd html 3.2s draft//",
|
||||
"-//w3c//dtd html 4.0 frameset//",
|
||||
"-//w3c//dtd html 4.0 transitional//",
|
||||
"-//w3c//dtd html experimental 19960712//",
|
||||
"-//w3c//dtd html experimental 970421//",
|
||||
"-//w3c//dtd w3 html//",
|
||||
"-//w3o//dtd w3 html 3.0//",
|
||||
"-//webtechs//dtd mozilla html 2.0//",
|
||||
"-//webtechs//dtd mozilla html//")) or
|
||||
publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
|
||||
"-/w3c/dtd html 4.0 transitional/en",
|
||||
"html") or
|
||||
publicId.startswith(
|
||||
("-//w3c//dtd html 4.01 frameset//",
|
||||
"-//w3c//dtd html 4.01 transitional//")) and
|
||||
systemId is None or
|
||||
systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
|
||||
self.parser.compatMode = "quirks"
|
||||
elif (publicId.startswith(
|
||||
("-//w3c//dtd xhtml 1.0 frameset//",
|
||||
"-//w3c//dtd xhtml 1.0 transitional//"))
|
||||
or publicId.startswith(
|
||||
"-//w3c//dtd xhtml 1.0 transitional//")) or
|
||||
publicId.startswith(
|
||||
("-//w3c//dtd html 4.01 frameset//",
|
||||
"-//w3c//dtd html 4.01 transitional//")) and
|
||||
systemId is not None):
|
||||
|
@ -660,13 +590,13 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("head", self.startTagHead)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
(("head", "body", "html", "br"), self.endTagImplyHead)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
@ -706,10 +636,11 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("title", self.startTagTitle),
|
||||
(("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
|
||||
(("noframes", "style"), self.startTagNoFramesStyle),
|
||||
("noscript", self.startTagNoscript),
|
||||
("script", self.startTagScript),
|
||||
(("base", "basefont", "bgsound", "command", "link"),
|
||||
self.startTagBaseLinkCommand),
|
||||
|
@ -718,7 +649,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self. endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("head", self.endTagHead),
|
||||
(("br", "html", "body"), self.endTagHtmlBodyBr)
|
||||
])
|
||||
|
@ -760,18 +691,25 @@ def getPhases(debug):
|
|||
# the abstract Unicode string, and just use the
|
||||
# ContentAttrParser on that, but using UTF-8 allows all chars
|
||||
# to be encoded and as a ASCII-superset works.
|
||||
data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
|
||||
parser = inputstream.ContentAttrParser(data)
|
||||
data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
|
||||
parser = _inputstream.ContentAttrParser(data)
|
||||
codec = parser.parse()
|
||||
self.parser.tokenizer.stream.changeEncoding(codec)
|
||||
|
||||
def startTagTitle(self, token):
|
||||
self.parser.parseRCDataRawtext(token, "RCDATA")
|
||||
|
||||
def startTagNoScriptNoFramesStyle(self, token):
|
||||
def startTagNoFramesStyle(self, token):
|
||||
# Need to decide whether to implement the scripting-disabled case
|
||||
self.parser.parseRCDataRawtext(token, "RAWTEXT")
|
||||
|
||||
def startTagNoscript(self, token):
|
||||
if self.parser.scripting:
|
||||
self.parser.parseRCDataRawtext(token, "RAWTEXT")
|
||||
else:
|
||||
self.tree.insertElement(token)
|
||||
self.parser.phase = self.parser.phases["inHeadNoscript"]
|
||||
|
||||
def startTagScript(self, token):
|
||||
self.tree.insertElement(token)
|
||||
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
|
||||
|
@ -797,15 +735,75 @@ def getPhases(debug):
|
|||
def anythingElse(self):
|
||||
self.endTagHead(impliedTagToken("head"))
|
||||
|
||||
# XXX If we implement a parser for which scripting is disabled we need to
|
||||
# implement this phase.
|
||||
#
|
||||
# class InHeadNoScriptPhase(Phase):
|
||||
class InHeadNoscriptPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
|
||||
(("head", "noscript"), self.startTagHeadNoscript),
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("noscript", self.endTagNoscript),
|
||||
("br", self.endTagBr),
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def processEOF(self):
|
||||
self.parser.parseError("eof-in-head-noscript")
|
||||
self.anythingElse()
|
||||
return True
|
||||
|
||||
def processComment(self, token):
|
||||
return self.parser.phases["inHead"].processComment(token)
|
||||
|
||||
def processCharacters(self, token):
|
||||
self.parser.parseError("char-in-head-noscript")
|
||||
self.anythingElse()
|
||||
return token
|
||||
|
||||
def processSpaceCharacters(self, token):
|
||||
return self.parser.phases["inHead"].processSpaceCharacters(token)
|
||||
|
||||
def startTagHtml(self, token):
|
||||
return self.parser.phases["inBody"].processStartTag(token)
|
||||
|
||||
def startTagBaseLinkCommand(self, token):
|
||||
return self.parser.phases["inHead"].processStartTag(token)
|
||||
|
||||
def startTagHeadNoscript(self, token):
|
||||
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
|
||||
|
||||
def startTagOther(self, token):
|
||||
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
|
||||
self.anythingElse()
|
||||
return token
|
||||
|
||||
def endTagNoscript(self, token):
|
||||
node = self.parser.tree.openElements.pop()
|
||||
assert node.name == "noscript", "Expected noscript got %s" % node.name
|
||||
self.parser.phase = self.parser.phases["inHead"]
|
||||
|
||||
def endTagBr(self, token):
|
||||
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
|
||||
self.anythingElse()
|
||||
return token
|
||||
|
||||
def endTagOther(self, token):
|
||||
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
|
||||
|
||||
def anythingElse(self):
|
||||
# Caller must raise parse error first!
|
||||
self.endTagNoscript(impliedTagToken("noscript"))
|
||||
|
||||
class AfterHeadPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("body", self.startTagBody),
|
||||
("frameset", self.startTagFrameset),
|
||||
|
@ -815,8 +813,8 @@ def getPhases(debug):
|
|||
("head", self.startTagHead)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"),
|
||||
self.endTagHtmlBodyBr)])
|
||||
self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
|
||||
self.endTagHtmlBodyBr)])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def processEOF(self):
|
||||
|
@ -874,10 +872,10 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
# Keep a ref to this for special handling of whitespace in <pre>
|
||||
self.processSpaceCharactersNonPre = self.processSpaceCharacters
|
||||
# Set this to the default handler
|
||||
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("base", "basefont", "bgsound", "command", "link", "meta",
|
||||
"script", "style", "title"),
|
||||
|
@ -885,7 +883,7 @@ def getPhases(debug):
|
|||
("body", self.startTagBody),
|
||||
("frameset", self.startTagFrameset),
|
||||
(("address", "article", "aside", "blockquote", "center", "details",
|
||||
"details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
|
||||
"dir", "div", "dl", "fieldset", "figcaption", "figure",
|
||||
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
|
||||
"section", "summary", "ul"),
|
||||
self.startTagCloseP),
|
||||
|
@ -911,7 +909,8 @@ def getPhases(debug):
|
|||
("isindex", self.startTagIsIndex),
|
||||
("textarea", self.startTagTextarea),
|
||||
("iframe", self.startTagIFrame),
|
||||
(("noembed", "noframes", "noscript"), self.startTagRawtext),
|
||||
("noscript", self.startTagNoscript),
|
||||
(("noembed", "noframes"), self.startTagRawtext),
|
||||
("select", self.startTagSelect),
|
||||
(("rp", "rt"), self.startTagRpRt),
|
||||
(("option", "optgroup"), self.startTagOpt),
|
||||
|
@ -923,7 +922,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("body", self.endTagBody),
|
||||
("html", self.endTagHtml),
|
||||
(("address", "article", "aside", "blockquote", "button", "center",
|
||||
|
@ -942,17 +941,9 @@ def getPhases(debug):
|
|||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def isMatchingFormattingElement(self, node1, node2):
|
||||
if node1.name != node2.name or node1.namespace != node2.namespace:
|
||||
return False
|
||||
elif len(node1.attributes) != len(node2.attributes):
|
||||
return False
|
||||
else:
|
||||
attributes1 = sorted(node1.attributes.items())
|
||||
attributes2 = sorted(node2.attributes.items())
|
||||
for attr1, attr2 in zip(attributes1, attributes2):
|
||||
if attr1 != attr2:
|
||||
return False
|
||||
return True
|
||||
return (node1.name == node2.name and
|
||||
node1.namespace == node2.namespace and
|
||||
node1.attributes == node2.attributes)
|
||||
|
||||
# helper
|
||||
def addFormattingElement(self, token):
|
||||
|
@ -988,8 +979,8 @@ def getPhases(debug):
|
|||
data = token["data"]
|
||||
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
||||
if (data.startswith("\n") and
|
||||
self.tree.openElements[-1].name in ("pre", "listing", "textarea")
|
||||
and not self.tree.openElements[-1].hasContent()):
|
||||
self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
|
||||
not self.tree.openElements[-1].hasContent()):
|
||||
data = data[1:]
|
||||
if data:
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
|
@ -1007,7 +998,7 @@ def getPhases(debug):
|
|||
for char in token["data"]])):
|
||||
self.parser.framesetOK = False
|
||||
|
||||
def processSpaceCharacters(self, token):
|
||||
def processSpaceCharactersNonPre(self, token):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertText(token["data"])
|
||||
|
||||
|
@ -1016,8 +1007,8 @@ def getPhases(debug):
|
|||
|
||||
def startTagBody(self, token):
|
||||
self.parser.parseError("unexpected-start-tag", {"name": "body"})
|
||||
if (len(self.tree.openElements) == 1
|
||||
or self.tree.openElements[1].name != "body"):
|
||||
if (len(self.tree.openElements) == 1 or
|
||||
self.tree.openElements[1].name != "body"):
|
||||
assert self.parser.innerHTML
|
||||
else:
|
||||
self.parser.framesetOK = False
|
||||
|
@ -1232,6 +1223,12 @@ def getPhases(debug):
|
|||
self.parser.framesetOK = False
|
||||
self.startTagRawtext(token)
|
||||
|
||||
def startTagNoscript(self, token):
|
||||
if self.parser.scripting:
|
||||
self.startTagRawtext(token)
|
||||
else:
|
||||
self.startTagOther(token)
|
||||
|
||||
def startTagRawtext(self, token):
|
||||
"""iframe, noembed noframes, noscript(if scripting enabled)"""
|
||||
self.parser.parseRCDataRawtext(token, "RAWTEXT")
|
||||
|
@ -1595,9 +1592,9 @@ def getPhases(debug):
|
|||
class TextPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
self.startTagHandler = utils.MethodDispatcher([])
|
||||
self.startTagHandler = _utils.MethodDispatcher([])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("script", self.endTagScript)])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
|
@ -1629,7 +1626,7 @@ def getPhases(debug):
|
|||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("caption", self.startTagCaption),
|
||||
("colgroup", self.startTagColgroup),
|
||||
|
@ -1643,7 +1640,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("table", self.endTagTable),
|
||||
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
|
||||
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
|
||||
|
@ -1820,14 +1817,14 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
|
||||
"thead", "tr"), self.startTagTableElement)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("caption", self.endTagCaption),
|
||||
("table", self.endTagTable),
|
||||
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
|
||||
|
@ -1892,13 +1889,13 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("col", self.startTagCol)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("colgroup", self.endTagColgroup),
|
||||
("col", self.endTagCol)
|
||||
])
|
||||
|
@ -1926,6 +1923,7 @@ def getPhases(debug):
|
|||
def startTagCol(self, token):
|
||||
self.tree.insertElement(token)
|
||||
self.tree.openElements.pop()
|
||||
token["selfClosingAcknowledged"] = True
|
||||
|
||||
def startTagOther(self, token):
|
||||
ignoreEndTag = self.ignoreEndTagColgroup()
|
||||
|
@ -1955,7 +1953,7 @@ def getPhases(debug):
|
|||
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("tr", self.startTagTr),
|
||||
(("td", "th"), self.startTagTableCell),
|
||||
|
@ -1964,7 +1962,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
|
||||
("table", self.endTagTable),
|
||||
(("body", "caption", "col", "colgroup", "html", "td", "th",
|
||||
|
@ -2053,7 +2051,7 @@ def getPhases(debug):
|
|||
# http://www.whatwg.org/specs/web-apps/current-work/#in-row
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("td", "th"), self.startTagTableCell),
|
||||
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
|
||||
|
@ -2061,7 +2059,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("tr", self.endTagTr),
|
||||
("table", self.endTagTable),
|
||||
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
|
||||
|
@ -2142,14 +2140,14 @@ def getPhases(debug):
|
|||
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
|
||||
"thead", "tr"), self.startTagTableOther)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
(("td", "th"), self.endTagTableCell),
|
||||
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
|
||||
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
|
||||
|
@ -2218,7 +2216,7 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("option", self.startTagOption),
|
||||
("optgroup", self.startTagOptgroup),
|
||||
|
@ -2228,7 +2226,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("option", self.endTagOption),
|
||||
("optgroup", self.endTagOptgroup),
|
||||
("select", self.endTagSelect)
|
||||
|
@ -2318,13 +2316,13 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
|
||||
self.startTagTable)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
|
||||
self.endTagTable)
|
||||
])
|
||||
|
@ -2445,7 +2443,7 @@ def getPhases(debug):
|
|||
def processEndTag(self, token):
|
||||
nodeIndex = len(self.tree.openElements) - 1
|
||||
node = self.tree.openElements[-1]
|
||||
if node.name != token["name"]:
|
||||
if node.name.translate(asciiUpper2Lower) != token["name"]:
|
||||
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
|
||||
|
||||
while True:
|
||||
|
@ -2472,12 +2470,12 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
|
||||
self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def processEOF(self):
|
||||
|
@ -2520,7 +2518,7 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("frameset", self.startTagFrameset),
|
||||
("frame", self.startTagFrame),
|
||||
|
@ -2528,7 +2526,7 @@ def getPhases(debug):
|
|||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("frameset", self.endTagFrameset)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
@ -2577,13 +2575,13 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("noframes", self.startTagNoframes)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
self.endTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.endTagHtml)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
@ -2613,7 +2611,7 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml)
|
||||
])
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
@ -2651,7 +2649,7 @@ def getPhases(debug):
|
|||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
self.startTagHandler = _utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("noframes", self.startTagNoFrames)
|
||||
])
|
||||
|
@ -2682,13 +2680,14 @@ def getPhases(debug):
|
|||
def processEndTag(self, token):
|
||||
self.parser.parseError("expected-eof-but-got-end-tag",
|
||||
{"name": token["name"]})
|
||||
# pylint:enable=unused-argument
|
||||
|
||||
return {
|
||||
"initial": InitialPhase,
|
||||
"beforeHtml": BeforeHtmlPhase,
|
||||
"beforeHead": BeforeHeadPhase,
|
||||
"inHead": InHeadPhase,
|
||||
# XXX "inHeadNoscript": InHeadNoScriptPhase,
|
||||
"inHeadNoscript": InHeadNoscriptPhase,
|
||||
"afterHead": AfterHeadPhase,
|
||||
"inBody": InBodyPhase,
|
||||
"text": TextPhase,
|
||||
|
@ -2711,6 +2710,16 @@ def getPhases(debug):
|
|||
}
|
||||
|
||||
|
||||
def adjust_attributes(token, replacements):
|
||||
if PY3 or _utils.PY27:
|
||||
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
|
||||
else:
|
||||
needs_adjustment = frozenset(token['data']) & frozenset(replacements)
|
||||
if needs_adjustment:
|
||||
token['data'] = OrderedDict((replacements.get(k, k), v)
|
||||
for k, v in token['data'].items())
|
||||
|
||||
|
||||
def impliedTagToken(name, type="EndTag", attributes=None,
|
||||
selfClosing=False):
|
||||
if attributes is None:
|
||||
|
|
|
@ -1,300 +0,0 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
from six.moves import urllib_parse as urlparse
|
||||
|
||||
from .tokenizer import HTMLTokenizer
|
||||
from .constants import tokenTypes
|
||||
|
||||
|
||||
content_type_rgx = re.compile(r'''
|
||||
^
|
||||
# Match a content type <application>/<type>
|
||||
(?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
|
||||
# Match any character set and encoding
|
||||
(?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
|
||||
|(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
|
||||
# Assume the rest is data
|
||||
,.*
|
||||
$
|
||||
''',
|
||||
re.VERBOSE)
|
||||
|
||||
|
||||
class HTMLSanitizerMixin(object):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
|
||||
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
|
||||
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
|
||||
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
|
||||
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
|
||||
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
|
||||
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
|
||||
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
|
||||
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
|
||||
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
|
||||
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
|
||||
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
|
||||
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
|
||||
|
||||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
|
||||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
|
||||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
|
||||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
|
||||
'munderover', 'none']
|
||||
|
||||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
|
||||
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
|
||||
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
|
||||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
|
||||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
|
||||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
|
||||
|
||||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
|
||||
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
|
||||
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
|
||||
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
|
||||
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
|
||||
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
|
||||
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
|
||||
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
|
||||
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
|
||||
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
|
||||
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
|
||||
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
|
||||
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
|
||||
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
|
||||
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
|
||||
'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
|
||||
'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
|
||||
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
|
||||
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
|
||||
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
|
||||
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
|
||||
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
|
||||
'width', 'wrap', 'xml:lang']
|
||||
|
||||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
|
||||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
|
||||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
|
||||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
|
||||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
|
||||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
|
||||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
|
||||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
|
||||
'xlink:type', 'xmlns', 'xmlns:xlink']
|
||||
|
||||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
|
||||
'arabic-form', 'ascent', 'attributeName', 'attributeType',
|
||||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
|
||||
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
|
||||
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
|
||||
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
|
||||
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
|
||||
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
|
||||
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
|
||||
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
|
||||
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
|
||||
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
|
||||
'opacity', 'orient', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
|
||||
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
|
||||
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
|
||||
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
|
||||
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
|
||||
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
|
||||
'transform', 'type', 'u1', 'u2', 'underline-position',
|
||||
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
|
||||
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
|
||||
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
|
||||
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
|
||||
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
|
||||
'y1', 'y2', 'zoomAndPan']
|
||||
|
||||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster', 'background', 'datasrc',
|
||||
'dynsrc', 'lowsrc', 'ping', 'poster', 'xlink:href', 'xml:base']
|
||||
|
||||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
|
||||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
|
||||
'mask', 'stroke']
|
||||
|
||||
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
|
||||
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
|
||||
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
|
||||
'set', 'use']
|
||||
|
||||
acceptable_css_properties = ['azimuth', 'background-color',
|
||||
'border-bottom-color', 'border-collapse', 'border-color',
|
||||
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
|
||||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
|
||||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
|
||||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
|
||||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
|
||||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
|
||||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
|
||||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
|
||||
'white-space', 'width']
|
||||
|
||||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
|
||||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
|
||||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
|
||||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
|
||||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
|
||||
'transparent', 'underline', 'white', 'yellow']
|
||||
|
||||
acceptable_svg_properties = ['fill', 'fill-opacity', 'fill-rule',
|
||||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-opacity']
|
||||
|
||||
acceptable_protocols = ['ed2k', 'ftp', 'http', 'https', 'irc',
|
||||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
|
||||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
|
||||
'ssh', 'sftp', 'rtsp', 'afs', 'data']
|
||||
|
||||
acceptable_content_types = ['image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp', 'text/plain']
|
||||
|
||||
# subclasses may define their own versions of these constants
|
||||
allowed_elements = acceptable_elements + mathml_elements + svg_elements
|
||||
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
|
||||
allowed_css_properties = acceptable_css_properties
|
||||
allowed_css_keywords = acceptable_css_keywords
|
||||
allowed_svg_properties = acceptable_svg_properties
|
||||
allowed_protocols = acceptable_protocols
|
||||
allowed_content_types = acceptable_content_types
|
||||
|
||||
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
|
||||
# stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
|
||||
# attributes are parsed, and a restricted set, # specified by
|
||||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
|
||||
# attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
|
||||
# in ALLOWED_PROTOCOLS are allowed.
|
||||
#
|
||||
# sanitize_html('<script> do_nasty_stuff() </script>')
|
||||
# => <script> do_nasty_stuff() </script>
|
||||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
|
||||
# => <a>Click here for $100</a>
|
||||
def sanitize_token(self, token):
|
||||
|
||||
# accommodate filters which use token_type differently
|
||||
token_type = token["type"]
|
||||
if token_type in list(tokenTypes.keys()):
|
||||
token_type = tokenTypes[token_type]
|
||||
|
||||
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
|
||||
tokenTypes["EmptyTag"]):
|
||||
if token["name"] in self.allowed_elements:
|
||||
return self.allowed_token(token, token_type)
|
||||
else:
|
||||
return self.disallowed_token(token, token_type)
|
||||
elif token_type == tokenTypes["Comment"]:
|
||||
pass
|
||||
else:
|
||||
return token
|
||||
|
||||
def allowed_token(self, token, token_type):
|
||||
if "data" in token:
|
||||
attrs = dict([(name, val) for name, val in
|
||||
token["data"][::-1]
|
||||
if name in self.allowed_attributes])
|
||||
for attr in self.attr_val_is_uri:
|
||||
if attr not in attrs:
|
||||
continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
# remove replacement characters from unescaped characters
|
||||
val_unescaped = val_unescaped.replace("\ufffd", "")
|
||||
try:
|
||||
uri = urlparse.urlparse(val_unescaped)
|
||||
except ValueError:
|
||||
uri = None
|
||||
del attrs[attr]
|
||||
if uri and uri.scheme:
|
||||
if uri.scheme not in self.allowed_protocols:
|
||||
del attrs[attr]
|
||||
if uri.scheme == 'data':
|
||||
m = content_type_rgx.match(uri.path)
|
||||
if not m:
|
||||
del attrs[attr]
|
||||
elif m.group('content_type') not in self.allowed_content_types:
|
||||
del attrs[attr]
|
||||
|
||||
for attr in self.svg_attr_val_allows_ref:
|
||||
if attr in attrs:
|
||||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
|
||||
' ',
|
||||
unescape(attrs[attr]))
|
||||
if (token["name"] in self.svg_allow_local_href and
|
||||
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
|
||||
attrs['xlink:href'])):
|
||||
del attrs['xlink:href']
|
||||
if 'style' in attrs:
|
||||
attrs['style'] = self.sanitize_css(attrs['style'])
|
||||
token["data"] = [[name, val] for name, val in list(attrs.items())]
|
||||
return token
|
||||
|
||||
def disallowed_token(self, token, token_type):
|
||||
if token_type == tokenTypes["EndTag"]:
|
||||
token["data"] = "</%s>" % token["name"]
|
||||
elif token["data"]:
|
||||
attrs = ''.join([' %s="%s"' % (k, escape(v)) for k, v in token["data"]])
|
||||
token["data"] = "<%s%s>" % (token["name"], attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token.get("selfClosing"):
|
||||
token["data"] = token["data"][:-1] + "/>"
|
||||
|
||||
if token["type"] in list(tokenTypes.keys()):
|
||||
token["type"] = "Characters"
|
||||
else:
|
||||
token["type"] = tokenTypes["Characters"]
|
||||
|
||||
del token["name"]
|
||||
return token
|
||||
|
||||
def sanitize_css(self, style):
|
||||
# disallow urls
|
||||
style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
|
||||
|
||||
# gauntlet
|
||||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
|
||||
return ''
|
||||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
|
||||
return ''
|
||||
|
||||
clean = []
|
||||
for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
|
||||
if not value:
|
||||
continue
|
||||
if prop.lower() in self.allowed_css_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
|
||||
'padding']:
|
||||
for keyword in value.split():
|
||||
if keyword not in self.acceptable_css_keywords and \
|
||||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):
|
||||
break
|
||||
else:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
elif prop.lower() in self.allowed_svg_properties:
|
||||
clean.append(prop + ': ' + value + ';')
|
||||
|
||||
return ' '.join(clean)
|
||||
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
|
||||
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
|
||||
# Change case matching defaults as we only output lowercase html anyway
|
||||
# This solution doesn't seem ideal...
|
||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
|
||||
lowercaseElementName, lowercaseAttrName, parser=parser)
|
||||
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token:
|
||||
yield token
|
|
@ -1,79 +1,87 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type
|
||||
|
||||
try:
|
||||
from functools import reduce
|
||||
except ImportError:
|
||||
pass
|
||||
import re
|
||||
|
||||
from ..constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from ..constants import rcdataElements, entities, xmlEntities
|
||||
from .. import utils
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
|
||||
from .constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from .constants import rcdataElements, entities, xmlEntities
|
||||
from . import treewalkers, _utils
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
|
||||
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")
|
||||
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
|
||||
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
|
||||
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
|
||||
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
|
||||
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
|
||||
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
|
||||
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
|
||||
"\u3000]")
|
||||
|
||||
try:
|
||||
from codecs import register_error, xmlcharrefreplace_errors
|
||||
except ImportError:
|
||||
unicode_encode_errors = "strict"
|
||||
else:
|
||||
unicode_encode_errors = "htmlentityreplace"
|
||||
|
||||
encode_entity_map = {}
|
||||
is_ucs4 = len("\U0010FFFF") == 1
|
||||
for k, v in list(entities.items()):
|
||||
# skip multi-character entities
|
||||
if ((is_ucs4 and len(v) > 1) or
|
||||
(not is_ucs4 and len(v) > 2)):
|
||||
continue
|
||||
if v != "&":
|
||||
if len(v) == 2:
|
||||
v = utils.surrogatePairToCodepoint(v)
|
||||
else:
|
||||
v = ord(v)
|
||||
if v not in encode_entity_map or k.islower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
encode_entity_map[v] = k
|
||||
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
codepoints = []
|
||||
skip = False
|
||||
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
||||
if skip:
|
||||
skip = False
|
||||
continue
|
||||
index = i + exc.start
|
||||
if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
|
||||
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
|
||||
skip = True
|
||||
else:
|
||||
codepoint = ord(c)
|
||||
codepoints.append(codepoint)
|
||||
for cp in codepoints:
|
||||
e = encode_entity_map.get(cp)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append("&#x%s;" % (hex(cp)[2:]))
|
||||
return ("".join(res), exc.end)
|
||||
_encode_entity_map = {}
|
||||
_is_ucs4 = len("\U0010FFFF") == 1
|
||||
for k, v in list(entities.items()):
|
||||
# skip multi-character entities
|
||||
if ((_is_ucs4 and len(v) > 1) or
|
||||
(not _is_ucs4 and len(v) > 2)):
|
||||
continue
|
||||
if v != "&":
|
||||
if len(v) == 2:
|
||||
v = _utils.surrogatePairToCodepoint(v)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
v = ord(v)
|
||||
if v not in _encode_entity_map or k.islower():
|
||||
# prefer < over < and similarly for &, >, etc.
|
||||
_encode_entity_map[v] = k
|
||||
|
||||
register_error(unicode_encode_errors, htmlentityreplace_errors)
|
||||
|
||||
del register_error
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
codepoints = []
|
||||
skip = False
|
||||
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
||||
if skip:
|
||||
skip = False
|
||||
continue
|
||||
index = i + exc.start
|
||||
if _utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
|
||||
codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
|
||||
skip = True
|
||||
else:
|
||||
codepoint = ord(c)
|
||||
codepoints.append(codepoint)
|
||||
for cp in codepoints:
|
||||
e = _encode_entity_map.get(cp)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append("&#x%s;" % (hex(cp)[2:]))
|
||||
return ("".join(res), exc.end)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
|
||||
register_error("htmlentityreplace", htmlentityreplace_errors)
|
||||
|
||||
|
||||
def serialize(input, tree="etree", encoding=None, **serializer_opts):
|
||||
# XXX: Should we cache this?
|
||||
walker = treewalkers.getTreeWalker(tree)
|
||||
s = HTMLSerializer(**serializer_opts)
|
||||
return s.render(walker(input), encoding)
|
||||
|
||||
|
||||
class HTMLSerializer(object):
|
||||
|
||||
# attribute quoting options
|
||||
quote_attr_values = False
|
||||
quote_attr_values = "legacy" # be secure by default
|
||||
quote_char = '"'
|
||||
use_best_quote_char = True
|
||||
|
||||
|
@ -109,9 +117,9 @@ class HTMLSerializer(object):
|
|||
inject_meta_charset=True|False
|
||||
Whether it insert a meta element to define the character set of the
|
||||
document.
|
||||
quote_attr_values=True|False
|
||||
quote_attr_values="legacy"|"spec"|"always"
|
||||
Whether to quote attribute values that don't require quoting
|
||||
per HTML5 parsing rules.
|
||||
per legacy browser behaviour, when required by the standard, or always.
|
||||
quote_char=u'"'|u"'"
|
||||
Use given quote character for attribute quoting. Default is to
|
||||
use double quote unless attribute value contains a double quote,
|
||||
|
@ -147,6 +155,9 @@ class HTMLSerializer(object):
|
|||
|
||||
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
|
||||
"""
|
||||
unexpected_args = frozenset(kwargs) - frozenset(self.options)
|
||||
if len(unexpected_args) > 0:
|
||||
raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
|
||||
if 'quote_char' in kwargs:
|
||||
self.use_best_quote_char = False
|
||||
for attr in self.options:
|
||||
|
@ -157,7 +168,7 @@ class HTMLSerializer(object):
|
|||
def encode(self, string):
|
||||
assert(isinstance(string, text_type))
|
||||
if self.encoding:
|
||||
return string.encode(self.encoding, unicode_encode_errors)
|
||||
return string.encode(self.encoding, "htmlentityreplace")
|
||||
else:
|
||||
return string
|
||||
|
||||
|
@ -169,28 +180,30 @@ class HTMLSerializer(object):
|
|||
return string
|
||||
|
||||
def serialize(self, treewalker, encoding=None):
|
||||
# pylint:disable=too-many-nested-blocks
|
||||
self.encoding = encoding
|
||||
in_cdata = False
|
||||
self.errors = []
|
||||
|
||||
if encoding and self.inject_meta_charset:
|
||||
from ..filters.inject_meta_charset import Filter
|
||||
from .filters.inject_meta_charset import Filter
|
||||
treewalker = Filter(treewalker, encoding)
|
||||
# Alphabetical attributes is here under the assumption that none of
|
||||
# the later filters add or change order of attributes; it needs to be
|
||||
# before the sanitizer so escaped elements come out correctly
|
||||
if self.alphabetical_attributes:
|
||||
from .filters.alphabeticalattributes import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
# WhitespaceFilter should be used before OptionalTagFilter
|
||||
# for maximum efficiently of this latter filter
|
||||
if self.strip_whitespace:
|
||||
from ..filters.whitespace import Filter
|
||||
from .filters.whitespace import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.sanitize:
|
||||
from ..filters.sanitizer import Filter
|
||||
from .filters.sanitizer import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
if self.omit_optional_tags:
|
||||
from ..filters.optionaltags import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
# Alphabetical attributes must be last, as other filters
|
||||
# could add attributes and alter the order
|
||||
if self.alphabetical_attributes:
|
||||
from ..filters.alphabeticalattributes import Filter
|
||||
from .filters.optionaltags import Filter
|
||||
treewalker = Filter(treewalker)
|
||||
|
||||
for token in treewalker:
|
||||
|
@ -229,7 +242,7 @@ class HTMLSerializer(object):
|
|||
in_cdata = True
|
||||
elif in_cdata:
|
||||
self.serializeError("Unexpected child element of a CDATA element")
|
||||
for (attr_namespace, attr_name), attr_value in token["data"].items():
|
||||
for (_, attr_name), attr_value in token["data"].items():
|
||||
# TODO: Add namespace support here
|
||||
k = attr_name
|
||||
v = attr_value
|
||||
|
@ -237,14 +250,18 @@ class HTMLSerializer(object):
|
|||
|
||||
yield self.encodeStrict(k)
|
||||
if not self.minimize_boolean_attributes or \
|
||||
(k not in booleanAttributes.get(name, tuple())
|
||||
and k not in booleanAttributes.get("", tuple())):
|
||||
(k not in booleanAttributes.get(name, tuple()) and
|
||||
k not in booleanAttributes.get("", tuple())):
|
||||
yield self.encodeStrict("=")
|
||||
if self.quote_attr_values or not v:
|
||||
if self.quote_attr_values == "always" or len(v) == 0:
|
||||
quote_attr = True
|
||||
elif self.quote_attr_values == "spec":
|
||||
quote_attr = _quoteAttributeSpec.search(v) is not None
|
||||
elif self.quote_attr_values == "legacy":
|
||||
quote_attr = _quoteAttributeLegacy.search(v) is not None
|
||||
else:
|
||||
quote_attr = reduce(lambda x, y: x or (y in v),
|
||||
spaceCharacters + ">\"'=", False)
|
||||
raise ValueError("quote_attr_values must be one of: "
|
||||
"'always', 'spec', or 'legacy'")
|
||||
v = v.replace("&", "&")
|
||||
if self.escape_lt_in_attrs:
|
||||
v = v.replace("<", "<")
|
||||
|
@ -312,6 +329,6 @@ class HTMLSerializer(object):
|
|||
raise SerializeError
|
||||
|
||||
|
||||
def SerializeError(Exception):
|
||||
class SerializeError(Exception):
|
||||
"""Error in serialized tree"""
|
||||
pass
|
|
@ -1,16 +0,0 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from .. import treewalkers
|
||||
|
||||
from .htmlserializer import HTMLSerializer
|
||||
|
||||
|
||||
def serialize(input, tree="etree", format="html", encoding=None,
|
||||
**serializer_opts):
|
||||
# XXX: Should we cache this?
|
||||
walker = treewalkers.getTreeWalker(tree)
|
||||
if format == "html":
|
||||
s = HTMLSerializer(**serializer_opts)
|
||||
else:
|
||||
raise ValueError("type must be html")
|
||||
return s.render(walker(input), encoding)
|
|
@ -5,7 +5,7 @@ from . import sax
|
|||
__all__ = ["sax"]
|
||||
|
||||
try:
|
||||
from . import genshi # flake8: noqa
|
||||
from . import genshi # noqa
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
|
|
|
@ -28,7 +28,7 @@ to the format used in the unittests
|
|||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
from ..utils import default_etree
|
||||
from .._utils import default_etree
|
||||
|
||||
treeBuilderCache = {}
|
||||
|
||||
|
|
|
@ -126,6 +126,7 @@ class TreeBuilder(object):
|
|||
commentClass - the class to use for comments
|
||||
doctypeClass - the class to use for doctypes
|
||||
"""
|
||||
# pylint:disable=not-callable
|
||||
|
||||
# Document class
|
||||
documentClass = None
|
||||
|
@ -166,12 +167,17 @@ class TreeBuilder(object):
|
|||
# If we pass a node in we match that. if we pass a string
|
||||
# match any node with that name
|
||||
exactNode = hasattr(target, "nameTuple")
|
||||
if not exactNode:
|
||||
if isinstance(target, text_type):
|
||||
target = (namespaces["html"], target)
|
||||
assert isinstance(target, tuple)
|
||||
|
||||
listElements, invert = listElementsMap[variant]
|
||||
|
||||
for node in reversed(self.openElements):
|
||||
if (node.name == target and not exactNode or
|
||||
node == target and exactNode):
|
||||
if exactNode and node == target:
|
||||
return True
|
||||
elif not exactNode and node.nameTuple == target:
|
||||
return True
|
||||
elif (invert ^ (node.nameTuple in listElements)):
|
||||
return False
|
||||
|
@ -353,8 +359,8 @@ class TreeBuilder(object):
|
|||
def generateImpliedEndTags(self, exclude=None):
|
||||
name = self.openElements[-1].name
|
||||
# XXX td, th and tr are not actually needed
|
||||
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
|
||||
and name != exclude):
|
||||
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt")) and
|
||||
name != exclude):
|
||||
self.openElements.pop()
|
||||
# XXX This is not entirely what the specification says. We should
|
||||
# investigate it more closely.
|
|
@ -1,54 +1,62 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
|
||||
from collections import MutableMapping
|
||||
from xml.dom import minidom, Node
|
||||
import weakref
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
from .. import constants
|
||||
from ..constants import namespaces
|
||||
from ..utils import moduleFactoryFactory
|
||||
from .._utils import moduleFactoryFactory
|
||||
|
||||
|
||||
def getDomBuilder(DomImplementation):
|
||||
Dom = DomImplementation
|
||||
|
||||
class AttrList(object):
|
||||
class AttrList(MutableMapping):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
|
||||
def __iter__(self):
|
||||
return list(self.element.attributes.items()).__iter__()
|
||||
return iter(self.element.attributes.keys())
|
||||
|
||||
def __setitem__(self, name, value):
|
||||
self.element.setAttribute(name, value)
|
||||
|
||||
def __len__(self):
|
||||
return len(list(self.element.attributes.items()))
|
||||
|
||||
def items(self):
|
||||
return [(item[0], item[1]) for item in
|
||||
list(self.element.attributes.items())]
|
||||
|
||||
def keys(self):
|
||||
return list(self.element.attributes.keys())
|
||||
|
||||
def __getitem__(self, name):
|
||||
return self.element.getAttribute(name)
|
||||
|
||||
def __contains__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return self.element.hasAttribute(name)
|
||||
attr = self.element.ownerDocument.createAttribute(name)
|
||||
attr.value = value
|
||||
self.element.attributes[name] = attr
|
||||
|
||||
class NodeBuilder(_base.Node):
|
||||
def __len__(self):
|
||||
return len(self.element.attributes)
|
||||
|
||||
def items(self):
|
||||
return list(self.element.attributes.items())
|
||||
|
||||
def values(self):
|
||||
return list(self.element.attributes.values())
|
||||
|
||||
def __getitem__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return self.element.attributes[name].value
|
||||
|
||||
def __delitem__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
del self.element.attributes[name]
|
||||
|
||||
class NodeBuilder(base.Node):
|
||||
def __init__(self, element):
|
||||
_base.Node.__init__(self, element.nodeName)
|
||||
base.Node.__init__(self, element.nodeName)
|
||||
self.element = element
|
||||
|
||||
namespace = property(lambda self: hasattr(self.element, "namespaceURI")
|
||||
and self.element.namespaceURI or None)
|
||||
namespace = property(lambda self: hasattr(self.element, "namespaceURI") and
|
||||
self.element.namespaceURI or None)
|
||||
|
||||
def appendChild(self, node):
|
||||
node.parent = self
|
||||
|
@ -109,7 +117,7 @@ def getDomBuilder(DomImplementation):
|
|||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
|
||||
def documentClass(self):
|
||||
self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
|
||||
return weakref.proxy(self)
|
||||
|
@ -149,15 +157,16 @@ def getDomBuilder(DomImplementation):
|
|||
return self.dom
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
return base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data = data
|
||||
if parent != self:
|
||||
_base.TreeBuilder.insertText(self, data, parent)
|
||||
base.TreeBuilder.insertText(self, data, parent)
|
||||
else:
|
||||
# HACK: allow text nodes as children of the document node
|
||||
if hasattr(self.dom, '_child_node_types'):
|
||||
# pylint:disable=protected-access
|
||||
if Node.TEXT_NODE not in self.dom._child_node_types:
|
||||
self.dom._child_node_types = list(self.dom._child_node_types)
|
||||
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
# pylint:disable=protected-access
|
||||
|
||||
from six import text_type
|
||||
|
||||
import re
|
||||
|
||||
from . import _base
|
||||
from .. import ihatexml
|
||||
from . import base
|
||||
from .. import _ihatexml
|
||||
from .. import constants
|
||||
from ..constants import namespaces
|
||||
from ..utils import moduleFactoryFactory
|
||||
from .._utils import moduleFactoryFactory
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
|
@ -16,7 +18,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||
ElementTree = ElementTreeImplementation
|
||||
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
||||
|
||||
class Element(_base.Node):
|
||||
class Element(base.Node):
|
||||
def __init__(self, name, namespace=None):
|
||||
self._name = name
|
||||
self._namespace = namespace
|
||||
|
@ -98,6 +100,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self._childNodes.remove(node)
|
||||
self._element.remove(node._element)
|
||||
node.parent = None
|
||||
|
||||
|
@ -139,7 +142,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||
if self._element.text is not None:
|
||||
newParent._element.text += self._element.text
|
||||
self._element.text = ""
|
||||
_base.Node.reparentChildren(self, newParent)
|
||||
base.Node.reparentChildren(self, newParent)
|
||||
|
||||
class Comment(Element):
|
||||
def __init__(self, data):
|
||||
|
@ -253,10 +256,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||
|
||||
return "\n".join(rv)
|
||||
|
||||
def tostring(element):
|
||||
def tostring(element): # pylint:disable=unused-variable
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
filter = ihatexml.InfosetFilter()
|
||||
filter = _ihatexml.InfosetFilter()
|
||||
|
||||
def serializeElement(element):
|
||||
if isinstance(element, ElementTree.ElementTree):
|
||||
|
@ -307,7 +310,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||
|
||||
return "".join(rv)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
class TreeBuilder(base.TreeBuilder): # pylint:disable=unused-variable
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = Element
|
||||
|
@ -329,7 +332,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
|||
return self.document._element.find("html")
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
||||
return base.TreeBuilder.getFragment(self)._element
|
||||
|
||||
return locals()
|
||||
|
||||
|
|
|
@ -10,16 +10,17 @@ When any of these things occur, we emit a DataLossWarning
|
|||
"""
|
||||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
# pylint:disable=protected-access
|
||||
|
||||
import warnings
|
||||
import re
|
||||
import sys
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
from ..constants import DataLossWarning
|
||||
from .. import constants
|
||||
from . import etree as etree_builders
|
||||
from .. import ihatexml
|
||||
from .. import _ihatexml
|
||||
|
||||
import lxml.etree as etree
|
||||
|
||||
|
@ -53,8 +54,7 @@ class Document(object):
|
|||
|
||||
def testSerializer(element):
|
||||
rv = []
|
||||
finalText = None
|
||||
infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||
infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||
|
||||
def serializeElement(element, indent=0):
|
||||
if not hasattr(element, "tag"):
|
||||
|
@ -128,16 +128,12 @@ def testSerializer(element):
|
|||
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
|
||||
serializeElement(element, 0)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("|%s\"%s\"" % (' ' * 2, finalText))
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
|
||||
def tostring(element):
|
||||
"""Serialize an element and its child nodes to a string"""
|
||||
rv = []
|
||||
finalText = None
|
||||
|
||||
def serializeElement(element):
|
||||
if not hasattr(element, "tag"):
|
||||
|
@ -173,13 +169,10 @@ def tostring(element):
|
|||
|
||||
serializeElement(element)
|
||||
|
||||
if finalText is not None:
|
||||
rv.append("%s\"" % (' ' * 2, finalText))
|
||||
|
||||
return "".join(rv)
|
||||
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
class TreeBuilder(base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
elementClass = None
|
||||
|
@ -189,13 +182,15 @@ class TreeBuilder(_base.TreeBuilder):
|
|||
|
||||
def __init__(self, namespaceHTMLElements, fullTree=False):
|
||||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
|
||||
infosetFilter = self.infosetFilter = ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
|
||||
self.namespaceHTMLElements = namespaceHTMLElements
|
||||
|
||||
class Attributes(dict):
|
||||
def __init__(self, element, value={}):
|
||||
def __init__(self, element, value=None):
|
||||
if value is None:
|
||||
value = {}
|
||||
self._element = element
|
||||
dict.__init__(self, value)
|
||||
dict.__init__(self, value) # pylint:disable=non-parent-init-called
|
||||
for key, value in self.items():
|
||||
if isinstance(key, tuple):
|
||||
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
|
||||
|
@ -259,10 +254,10 @@ class TreeBuilder(_base.TreeBuilder):
|
|||
self.elementClass = Element
|
||||
self.commentClass = Comment
|
||||
# self.fragmentClass = builder.DocumentFragment
|
||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
|
||||
def reset(self):
|
||||
_base.TreeBuilder.reset(self)
|
||||
base.TreeBuilder.reset(self)
|
||||
self.insertComment = self.insertCommentInitial
|
||||
self.initial_comments = []
|
||||
self.doctype = None
|
||||
|
@ -303,12 +298,14 @@ class TreeBuilder(_base.TreeBuilder):
|
|||
self.doctype = doctype
|
||||
|
||||
def insertCommentInitial(self, data, parent=None):
|
||||
assert parent is None or parent is self.document
|
||||
assert self.document._elementTree is None
|
||||
self.initial_comments.append(data)
|
||||
|
||||
def insertCommentMain(self, data, parent=None):
|
||||
if (parent == self.document and
|
||||
self.document._elementTree.getroot()[-1].tag == comment_type):
|
||||
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
|
||||
warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
|
||||
super(TreeBuilder, self).insertComment(data, parent)
|
||||
|
||||
def insertRoot(self, token):
|
||||
|
|
|
@ -10,10 +10,10 @@ returning an iterator generating tokens.
|
|||
|
||||
from __future__ import absolute_import, division, unicode_literals
|
||||
|
||||
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshistream", "lxmletree"]
|
||||
|
||||
from .. import constants
|
||||
from ..utils import default_etree
|
||||
from .._utils import default_etree
|
||||
|
||||
__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"]
|
||||
|
||||
treeWalkerCache = {}
|
||||
|
||||
|
@ -43,11 +43,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
|
|||
from . import dom
|
||||
treeWalkerCache[treeType] = dom.TreeWalker
|
||||
elif treeType == "genshi":
|
||||
from . import genshistream
|
||||
treeWalkerCache[treeType] = genshistream.TreeWalker
|
||||
from . import genshi
|
||||
treeWalkerCache[treeType] = genshi.TreeWalker
|
||||
elif treeType == "lxml":
|
||||
from . import lxmletree
|
||||
treeWalkerCache[treeType] = lxmletree.TreeWalker
|
||||
from . import etree_lxml
|
||||
treeWalkerCache[treeType] = etree_lxml.TreeWalker
|
||||
elif treeType == "etree":
|
||||
from . import etree
|
||||
if implementation is None:
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
from __future__ import absolute_import, division, unicode_literals
|
||||
from six import text_type, string_types
|
||||
|
||||
from xml.dom import Node
|
||||
from ..constants import namespaces, voidElements, spaceCharacters
|
||||
|
||||
__all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN",
|
||||
"TreeWalker", "NonRecursiveTreeWalker"]
|
||||
|
||||
from xml.dom import Node
|
||||
|
||||
DOCUMENT = Node.DOCUMENT_NODE
|
||||
DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||
TEXT = Node.TEXT_NODE
|
||||
|
@ -14,28 +14,9 @@ COMMENT = Node.COMMENT_NODE
|
|||
ENTITY = Node.ENTITY_NODE
|
||||
UNKNOWN = "<#UNKNOWN#>"
|
||||
|
||||
from ..constants import voidElements, spaceCharacters
|
||||
spaceCharacters = "".join(spaceCharacters)
|
||||
|
||||
|
||||
def to_text(s, blank_if_none=True):
|
||||
"""Wrapper around six.text_type to convert None to empty string"""
|
||||
if s is None:
|
||||
if blank_if_none:
|
||||
return ""
|
||||
else:
|
||||
return None
|
||||
elif isinstance(s, text_type):
|
||||
return s
|
||||
else:
|
||||
return text_type(s)
|
||||
|
||||
|
||||
def is_text_or_none(string):
|
||||
"""Wrapper around isinstance(string_types) or is None"""
|
||||
return string is None or isinstance(string, string_types)
|
||||
|
||||
|
||||
class TreeWalker(object):
|
||||
def __init__(self, tree):
|
||||
self.tree = tree
|
||||
|
@ -47,47 +28,25 @@ class TreeWalker(object):
|
|||
return {"type": "SerializeError", "data": msg}
|
||||
|
||||
def emptyTag(self, namespace, name, attrs, hasChildren=False):
|
||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||
assert isinstance(name, string_types), type(name)
|
||||
assert all((namespace is None or isinstance(namespace, string_types)) and
|
||||
isinstance(name, string_types) and
|
||||
isinstance(value, string_types)
|
||||
for (namespace, name), value in attrs.items())
|
||||
|
||||
yield {"type": "EmptyTag", "name": to_text(name, False),
|
||||
"namespace": to_text(namespace),
|
||||
yield {"type": "EmptyTag", "name": name,
|
||||
"namespace": namespace,
|
||||
"data": attrs}
|
||||
if hasChildren:
|
||||
yield self.error("Void element has children")
|
||||
|
||||
def startTag(self, namespace, name, attrs):
|
||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||
assert isinstance(name, string_types), type(name)
|
||||
assert all((namespace is None or isinstance(namespace, string_types)) and
|
||||
isinstance(name, string_types) and
|
||||
isinstance(value, string_types)
|
||||
for (namespace, name), value in attrs.items())
|
||||
|
||||
return {"type": "StartTag",
|
||||
"name": text_type(name),
|
||||
"namespace": to_text(namespace),
|
||||
"data": dict(((to_text(namespace, False), to_text(name)),
|
||||
to_text(value, False))
|
||||
for (namespace, name), value in attrs.items())}
|
||||
"name": name,
|
||||
"namespace": namespace,
|
||||
"data": attrs}
|
||||
|
||||
def endTag(self, namespace, name):
|
||||
assert namespace is None or isinstance(namespace, string_types), type(namespace)
|
||||
assert isinstance(name, string_types), type(namespace)
|
||||
|
||||
return {"type": "EndTag",
|
||||
"name": to_text(name, False),
|
||||
"namespace": to_text(namespace),
|
||||
"data": {}}
|
||||
"name": name,
|
||||
"namespace": namespace}
|
||||
|
||||
def text(self, data):
|
||||
assert isinstance(data, string_types), type(data)
|
||||
|
||||
data = to_text(data)
|
||||
data = data
|
||||
middle = data.lstrip(spaceCharacters)
|
||||
left = data[:len(data) - len(middle)]
|
||||
if left:
|
||||
|
@ -101,25 +60,16 @@ class TreeWalker(object):
|
|||
yield {"type": "SpaceCharacters", "data": right}
|
||||
|
||||
def comment(self, data):
|
||||
assert isinstance(data, string_types), type(data)
|
||||
|
||||
return {"type": "Comment", "data": text_type(data)}
|
||||
|
||||
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
||||
assert is_text_or_none(name), type(name)
|
||||
assert is_text_or_none(publicId), type(publicId)
|
||||
assert is_text_or_none(systemId), type(systemId)
|
||||
return {"type": "Comment", "data": data}
|
||||
|
||||
def doctype(self, name, publicId=None, systemId=None):
|
||||
return {"type": "Doctype",
|
||||
"name": to_text(name),
|
||||
"publicId": to_text(publicId),
|
||||
"systemId": to_text(systemId),
|
||||
"correct": to_text(correct)}
|
||||
"name": name,
|
||||
"publicId": publicId,
|
||||
"systemId": systemId}
|
||||
|
||||
def entity(self, name):
|
||||
assert isinstance(name, string_types), type(name)
|
||||
|
||||
return {"type": "Entity", "name": text_type(name)}
|
||||
return {"type": "Entity", "name": name}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
return self.error("Unknown node type: " + nodeType)
|
||||
|
@ -154,7 +104,7 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||
|
||||
elif type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name in voidElements:
|
||||
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, attributes,
|
||||
hasChildren):
|
||||
yield token
|
||||
|
@ -187,7 +137,7 @@ class NonRecursiveTreeWalker(TreeWalker):
|
|||
type, details = details[0], details[1:]
|
||||
if type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name not in voidElements:
|
||||
if (namespace and namespace != namespaces["html"]) or name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
if self.tree is currentNode:
|
||||
currentNode = None
|
|
@ -2,16 +2,16 @@ from __future__ import absolute_import, division, unicode_literals
|
|||
|
||||
from xml.dom import Node
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
class TreeWalker(base.NonRecursiveTreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
return _base.DOCTYPE, node.name, node.publicId, node.systemId
|
||||
return base.DOCTYPE, node.name, node.publicId, node.systemId
|
||||
|
||||
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
||||
return _base.TEXT, node.nodeValue
|
||||
return base.TEXT, node.nodeValue
|
||||
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
attrs = {}
|
||||
|
@ -21,17 +21,17 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||
attrs[(attr.namespaceURI, attr.localName)] = attr.value
|
||||
else:
|
||||
attrs[(None, attr.name)] = attr.value
|
||||
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
|
||||
return (base.ELEMENT, node.namespaceURI, node.nodeName,
|
||||
attrs, node.hasChildNodes())
|
||||
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
return _base.COMMENT, node.nodeValue
|
||||
return base.COMMENT, node.nodeValue
|
||||
|
||||
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
|
||||
return (_base.DOCUMENT,)
|
||||
return (base.DOCUMENT,)
|
||||
|
||||
else:
|
||||
return _base.UNKNOWN, node.nodeType
|
||||
return base.UNKNOWN, node.nodeType
|
||||
|
||||
def getFirstChild(self, node):
|
||||
return node.firstChild
|
||||
|
|
|
@ -12,8 +12,8 @@ import re
|
|||
|
||||
from six import string_types
|
||||
|
||||
from . import _base
|
||||
from ..utils import moduleFactoryFactory
|
||||
from . import base
|
||||
from .._utils import moduleFactoryFactory
|
||||
|
||||
tag_regexp = re.compile("{([^}]*)}(.*)")
|
||||
|
||||
|
@ -22,7 +22,7 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||
ElementTree = ElementTreeImplementation
|
||||
ElementTreeCommentType = ElementTree.Comment("asd").tag
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable
|
||||
"""Given the particular ElementTree representation, this implementation,
|
||||
to avoid using recursion, returns "nodes" as tuples with the following
|
||||
content:
|
||||
|
@ -38,9 +38,9 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||
"""
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # It might be the root Element
|
||||
elt, key, parents, flag = node
|
||||
elt, _, _, flag = node
|
||||
if flag in ("text", "tail"):
|
||||
return _base.TEXT, getattr(elt, flag)
|
||||
return base.TEXT, getattr(elt, flag)
|
||||
else:
|
||||
node = elt
|
||||
|
||||
|
@ -48,14 +48,14 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||
node = node.getroot()
|
||||
|
||||
if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"):
|
||||
return (_base.DOCUMENT,)
|
||||
return (base.DOCUMENT,)
|
||||
|
||||
elif node.tag == "<!DOCTYPE>":
|
||||
return (_base.DOCTYPE, node.text,
|
||||
return (base.DOCTYPE, node.text,
|
||||
node.get("publicId"), node.get("systemId"))
|
||||
|
||||
elif node.tag == ElementTreeCommentType:
|
||||
return _base.COMMENT, node.text
|
||||
return base.COMMENT, node.text
|
||||
|
||||
else:
|
||||
assert isinstance(node.tag, string_types), type(node.tag)
|
||||
|
@ -73,7 +73,7 @@ def getETreeBuilder(ElementTreeImplementation):
|
|||
attrs[(match.group(1), match.group(2))] = value
|
||||
else:
|
||||
attrs[(None, name)] = value
|
||||
return (_base.ELEMENT, namespace, tag,
|
||||
return (base.ELEMENT, namespace, tag,
|
||||
attrs, len(node) or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
|
|
|
@ -4,9 +4,9 @@ from six import text_type
|
|||
from lxml import etree
|
||||
from ..treebuilders.etree import tag_regexp
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
|
||||
from .. import ihatexml
|
||||
from .. import _ihatexml
|
||||
|
||||
|
||||
def ensure_str(s):
|
||||
|
@ -15,20 +15,27 @@ def ensure_str(s):
|
|||
elif isinstance(s, text_type):
|
||||
return s
|
||||
else:
|
||||
return s.decode("utf-8", "strict")
|
||||
return s.decode("ascii", "strict")
|
||||
|
||||
|
||||
class Root(object):
|
||||
def __init__(self, et):
|
||||
self.elementtree = et
|
||||
self.children = []
|
||||
if et.docinfo.internalDTD:
|
||||
self.children.append(Doctype(self,
|
||||
ensure_str(et.docinfo.root_name),
|
||||
ensure_str(et.docinfo.public_id),
|
||||
ensure_str(et.docinfo.system_url)))
|
||||
root = et.getroot()
|
||||
node = root
|
||||
|
||||
try:
|
||||
if et.docinfo.internalDTD:
|
||||
self.children.append(Doctype(self,
|
||||
ensure_str(et.docinfo.root_name),
|
||||
ensure_str(et.docinfo.public_id),
|
||||
ensure_str(et.docinfo.system_url)))
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
node = et.getroot()
|
||||
except AttributeError:
|
||||
node = et
|
||||
|
||||
while node.getprevious() is not None:
|
||||
node = node.getprevious()
|
||||
|
@ -115,35 +122,38 @@ class FragmentWrapper(object):
|
|||
return len(self.obj)
|
||||
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
class TreeWalker(base.NonRecursiveTreeWalker):
|
||||
def __init__(self, tree):
|
||||
if hasattr(tree, "getroot"):
|
||||
tree = Root(tree)
|
||||
elif isinstance(tree, list):
|
||||
# pylint:disable=redefined-variable-type
|
||||
if isinstance(tree, list):
|
||||
self.fragmentChildren = set(tree)
|
||||
tree = FragmentRoot(tree)
|
||||
_base.NonRecursiveTreeWalker.__init__(self, tree)
|
||||
self.filter = ihatexml.InfosetFilter()
|
||||
else:
|
||||
self.fragmentChildren = set()
|
||||
tree = Root(tree)
|
||||
base.NonRecursiveTreeWalker.__init__(self, tree)
|
||||
self.filter = _ihatexml.InfosetFilter()
|
||||
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, tuple): # Text node
|
||||
node, key = node
|
||||
assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
|
||||
return _base.TEXT, ensure_str(getattr(node, key))
|
||||
return base.TEXT, ensure_str(getattr(node, key))
|
||||
|
||||
elif isinstance(node, Root):
|
||||
return (_base.DOCUMENT,)
|
||||
return (base.DOCUMENT,)
|
||||
|
||||
elif isinstance(node, Doctype):
|
||||
return _base.DOCTYPE, node.name, node.public_id, node.system_id
|
||||
return base.DOCTYPE, node.name, node.public_id, node.system_id
|
||||
|
||||
elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
|
||||
return _base.TEXT, node.obj
|
||||
return base.TEXT, ensure_str(node.obj)
|
||||
|
||||
elif node.tag == etree.Comment:
|
||||
return _base.COMMENT, ensure_str(node.text)
|
||||
return base.COMMENT, ensure_str(node.text)
|
||||
|
||||
elif node.tag == etree.Entity:
|
||||
return _base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
|
||||
return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
|
||||
|
||||
else:
|
||||
# This is assumed to be an ordinary element
|
||||
|
@ -162,7 +172,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||
attrs[(match.group(1), match.group(2))] = value
|
||||
else:
|
||||
attrs[(None, name)] = value
|
||||
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
||||
return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
|
||||
attrs, len(node) > 0 or node.text)
|
||||
|
||||
def getFirstChild(self, node):
|
||||
|
@ -197,5 +207,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
|||
if key == "text":
|
||||
return node
|
||||
# else: fallback to "normal" processing
|
||||
elif node in self.fragmentChildren:
|
||||
return None
|
||||
|
||||
return node.getparent()
|
|
@ -4,12 +4,12 @@ from genshi.core import QName
|
|||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
||||
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
|
||||
from . import _base
|
||||
from . import base
|
||||
|
||||
from ..constants import voidElements, namespaces
|
||||
|
||||
|
||||
class TreeWalker(_base.TreeWalker):
|
||||
class TreeWalker(base.TreeWalker):
|
||||
def __iter__(self):
|
||||
# Buffer the events so we can pass in the following one
|
||||
previous = None
|
||||
|
@ -25,7 +25,7 @@ class TreeWalker(_base.TreeWalker):
|
|||
yield token
|
||||
|
||||
def tokens(self, event, next):
|
||||
kind, data, pos = event
|
||||
kind, data, _ = event
|
||||
if kind == START:
|
||||
tag, attribs = data
|
||||
name = tag.localname
|
||||
|
@ -39,8 +39,8 @@ class TreeWalker(_base.TreeWalker):
|
|||
|
||||
if namespace == namespaces["html"] and name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, converted_attribs,
|
||||
not next or next[0] != END
|
||||
or next[1] != tag):
|
||||
not next or next[0] != END or
|
||||
next[1] != tag):
|
||||
yield token
|
||||
else:
|
||||
yield self.startTag(namespace, name, converted_attribs)
|
||||
|
@ -48,7 +48,7 @@ class TreeWalker(_base.TreeWalker):
|
|||
elif kind == END:
|
||||
name = data.localname
|
||||
namespace = data.namespace
|
||||
if name not in voidElements:
|
||||
if namespace != namespaces["html"] or name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif kind == COMMENT:
|
Loading…
Reference in a new issue