#!/usr/bin/env python # # Copyright 2009 Facebook # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain # a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations # under the License. """Escaping/unescaping methods for HTML, JSON, URLs, and others. Also includes a few other miscellaneous string manipulation functions that have crept in over time. """ from __future__ import absolute_import, division, print_function, with_statement import re import sys from tornado.util import unicode_type, basestring_type, u try: from urllib.parse import parse_qs as _parse_qs # py3 except ImportError: from urlparse import parse_qs as _parse_qs # Python 2.6+ try: import htmlentitydefs # py2 except ImportError: import html.entities as htmlentitydefs # py3 try: import urllib.parse as urllib_parse # py3 except ImportError: import urllib as urllib_parse # py2 import json try: unichr except NameError: unichr = chr _XHTML_ESCAPE_RE = re.compile('[&<>"\']') _XHTML_ESCAPE_DICT = {'&': '&', '<': '<', '>': '>', '"': '"', '\'': '''} def xhtml_escape(value): """Escapes a string so it is valid within HTML or XML. Escapes the characters ``<``, ``>``, ``"``, ``'``, and ``&``. When used in attribute values the escaped strings must be enclosed in quotes. .. versionchanged:: 3.2 Added the single quote to the list of escaped characters. """ return _XHTML_ESCAPE_RE.sub(lambda match: _XHTML_ESCAPE_DICT[match.group(0)], to_basestring(value)) def xhtml_unescape(value): """Un-escapes an XML-escaped string.""" return re.sub(r"&(#?)(\w+?);", _convert_entity, _unicode(value)) # The fact that json_encode wraps json.dumps is an implementation detail. # Please see https://github.com/tornadoweb/tornado/pull/706 # before sending a pull request that adds **kwargs to this function. def json_encode(value): """JSON-encodes the given Python object.""" # JSON permits but does not require forward slashes to be escaped. # This is useful when json data is emitted in a tags from prematurely terminating # the javascript. Some json libraries do this escaping by default, # although python's standard library does not, so we do it here. # http://stackoverflow.com/questions/1580647/json-why-are-forward-slashes-escaped return json.dumps(value).replace("?@\[\]^`{|}~\s]))|(?:\((?:[^\s&()]|&|")*\)))+)""")) def linkify(text, shorten=False, extra_params="", require_protocol=False, permitted_protocols=["http", "https"]): """Converts plain text into HTML with links. For example: ``linkify("Hello http://tornadoweb.org!")`` would return ``Hello http://tornadoweb.org!`` Parameters: * ``shorten``: Long urls will be shortened for display. * ``extra_params``: Extra text to include in the link tag, or a callable taking the link as an argument and returning the extra text e.g. ``linkify(text, extra_params='rel="nofollow" class="external"')``, or:: def extra_params_cb(url): if url.startswith("http://example.com"): return 'class="internal"' else: return 'class="external" rel="nofollow"' linkify(text, extra_params=extra_params_cb) * ``require_protocol``: Only linkify urls which include a protocol. If this is False, urls such as www.facebook.com will also be linkified. * ``permitted_protocols``: List (or set) of protocols which should be linkified, e.g. ``linkify(text, permitted_protocols=["http", "ftp", "mailto"])``. It is very unsafe to include protocols such as ``javascript``. """ if extra_params and not callable(extra_params): extra_params = " " + extra_params.strip() def make_link(m): url = m.group(1) proto = m.group(2) if require_protocol and not proto: return url # not protocol, no linkify if proto and proto not in permitted_protocols: return url # bad protocol, no linkify href = m.group(1) if not proto: href = "http://" + href # no proto specified, use http if callable(extra_params): params = " " + extra_params(href).strip() else: params = extra_params # clip long urls. max_len is just an approximation max_len = 30 if shorten and len(url) > max_len: before_clip = url if proto: proto_len = len(proto) + 1 + len(m.group(3) or "") # +1 for : else: proto_len = 0 parts = url[proto_len:].split("/") if len(parts) > 1: # Grab the whole host part plus the first bit of the path # The path is usually not that interesting once shortened # (no more slug, etc), so it really just provides a little # extra indication of shortening. url = url[:proto_len] + parts[0] + "/" + \ parts[1][:8].split('?')[0].split('.')[0] if len(url) > max_len * 1.5: # still too long url = url[:max_len] if url != before_clip: amp = url.rfind('&') # avoid splitting html char entities if amp > max_len - 5: url = url[:amp] url += "..." if len(url) >= len(before_clip): url = before_clip else: # full url is visible on mouse-over (for those who don't # have a status bar, such as Safari by default) params += ' title="%s"' % href return u('%s') % (href, params, url) # First HTML-escape so that our strings are all safe. # The regex is modified to avoid character entites other than & so # that we won't pick up ", etc. text = _unicode(xhtml_escape(text)) return _URL_RE.sub(make_link, text) def _convert_entity(m): if m.group(1) == "#": try: return unichr(int(m.group(2))) except ValueError: return "&#%s;" % m.group(2) try: return _HTML_UNICODE_MAP[m.group(2)] except KeyError: return "&%s;" % m.group(2) def _build_unicode_map(): unicode_map = {} for name, value in htmlentitydefs.name2codepoint.items(): unicode_map[name] = unichr(value) return unicode_map _HTML_UNICODE_MAP = _build_unicode_map()