SickGear/lib/feedparser/http.py

# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>
# Copyright 2002-2008 Mark Pilgrim
# All rights reserved.
#
# This file is a part of feedparser.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import base64
import datetime
import gzip
import io
import re
import struct
import urllib.parse
import urllib.request
import zlib

from .datetimes import _parse_date
from .urls import convert_to_idn


# Value of the HTTP "Accept" header sent to servers when downloading feeds.
# Set this to None if no Accept header should be sent at all.
ACCEPT_HEADER = (
    "application/atom+xml,"
    "application/rdf+xml,"
    "application/rss+xml,"
    "application/x-netcdf,"
    "application/xml;q=0.9,"
    "text/xml;q=0.2,"
    "*/*;q=0.1"
)


class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):
    """urllib handler that reports HTTP outcomes on the response object.

    * Error responses are returned (tagged with a ``status`` attribute)
      instead of being raised as ``HTTPError``.
    * Followed redirects are tagged with the original ``status`` code and
      a ``newurl`` attribute holding the final URL.
    * A 401 received after sending an ``Authorization`` header is retried
      through the digest-auth machinery of ``HTTPDigestAuthHandler``.
    """

    def http_error_default(self, req, fp, code, msg, headers):
        """Return the response *fp* itself with ``fp.status`` set to *code*."""
        # The default implementation just raises HTTPError.
        # Forget that.
        fp.status = code
        return fp

    def http_error_301(self, req, fp, code, msg, hdrs):
        """Follow a redirect and annotate the resulting response.

        Returns *fp* unchanged when the parent handler produced no result;
        otherwise returns the redirected response with ``status`` set to the
        redirect *code* and ``newurl`` set to the URL that was finally
        retrieved.
        """
        result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)
        if not result:
            return fp
        result.status = code
        result.newurl = result.geturl()
        return result

    # The default implementations in urllib.request.HTTPRedirectHandler
    # are identical, so hardcoding a http_error_301 call above
    # won't affect anything
    http_error_300 = http_error_301
    http_error_302 = http_error_301
    http_error_303 = http_error_301
    http_error_307 = http_error_301

    def http_error_401(self, req, fp, code, msg, headers):
        """Retry a rejected request using digest authentication.

        If we sent an ``Authorization`` header and the server answered with
        a ``WWW-Authenticate`` header, the username and password are parsed
        out of the Authorization header we sent the first time, the realm
        is parsed out of the server's WWW-Authenticate header, and the
        request is retried with the appropriate digest auth headers.
        This evil genius hack has been brought to you by Aaron Swartz.

        Whenever the required headers are missing or cannot be parsed, the
        401 response is returned via ``http_error_default`` instead.
        """
        host = urllib.parse.urlparse(req.get_full_url())[1]
        if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:
            return self.http_error_default(req, fp, code, msg, headers)
        try:
            encoded = req.headers['Authorization'].split(' ')[1]
            auth = base64.decodebytes(encoded.encode()).decode()
            # Only the first colon separates user from password; the
            # password itself may legitimately contain further colons.
            user, passw = auth.split(':', 1)
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
        except (IndexError, ValueError):
            # IndexError: no scheme/credential separator or no quoted realm.
            # ValueError: invalid base64 (binascii.Error), undecodable
            # bytes (UnicodeDecodeError) or no colon in the credentials.
            # None of these can be retried, so hand back the 401 as-is.
            return self.http_error_default(req, fp, code, msg, headers)
        self.add_password(realm, host, user, passw)
        retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
        self.reset_retry_count()
        return retry


def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers):
    request = urllib.request.Request(url)
    request.add_header('User-Agent', agent)
    if etag:
        request.add_header('If-None-Match', etag)
    if isinstance(modified, str):
        modified = _parse_date(modified)
    elif isinstance(modified, datetime.datetime):
        modified = modified.utctimetuple()
    if modified:
        # format into an RFC 1123-compliant timestamp. We can't use
        # time.strftime() since the %a and %b directives can be affected
        # by the current locale, but RFC 2616 states that dates must be
        # in English.
        short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
        months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
    if referrer:
        request.add_header('Referer', referrer)
    request.add_header('Accept-encoding', 'gzip, deflate')
    if auth:
        request.add_header('Authorization', 'Basic %s' % auth)
    if accept_header:
        request.add_header('Accept', accept_header)
    # use this for whatever -- cookies, special headers, etc
    # [('Cookie','Something'),('x-special-header','Another Value')]
    for header_name, header_value in request_headers.items():
        request.add_header(header_name, header_value)
    request.add_header('A-IM', 'feed')  # RFC 3229 support
    return request


def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None):
    """Download *url* and return the response body, decompressed if needed.

    :param url: address to fetch; a leading ``feed:`` scheme is stripped or
        replaced by ``http:``, inline ``user:password@`` credentials are
        moved into a Basic ``Authorization`` header, and non-ASCII
        characters are percent-quoted
    :param etag: sent as ``If-None-Match`` when truthy
    :param modified: sent as ``If-Modified-Since`` when truthy
    :param agent: ``User-Agent`` value; the package ``USER_AGENT`` is used
        when falsy
    :param referrer: sent as ``Referer`` when truthy
    :param handlers: a urllib handler or a list of handlers installed in
        front of :class:`URLHandler`
    :param request_headers: mapping of additional request headers
    :param result: dictionary that receives the response metadata:
        ``headers`` (names lower-cased), ``href``, ``status`` and, when
        applicable, ``etag``, ``modified``, ``modified_parsed``, ``bozo``,
        ``bozo_exception``, ``version`` and ``debug_message``.  A fresh
        dictionary is used when omitted.
    :return: the body returned by the response's ``read()``, decompressed
        when the ``Content-Encoding`` names gzip or deflate; ``None`` when
        a gzip body was found to be corrupt
    """
    if handlers is None:
        handlers = []
    elif not isinstance(handlers, list):
        handlers = [handlers]
    if request_headers is None:
        request_headers = {}
    if result is None:
        # Metadata is written to ``result`` unconditionally below, so the
        # documented default must not crash the download.
        result = {}

    # Deal with the feed URI scheme
    if url.startswith('feed:http'):
        url = url[5:]
    elif url.startswith('feed:'):
        url = 'http:' + url[5:]
    if not agent:
        from . import USER_AGENT
        agent = USER_AGENT
    # Test for inline user:password credentials for HTTP basic auth
    auth = None
    if not url.startswith('ftp:'):
        url_pieces = urllib.parse.urlparse(url)
        if url_pieces.username:
            new_pieces = list(url_pieces)
            new_pieces[1] = url_pieces.hostname
            if url_pieces.port:
                new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'
            url = urllib.parse.urlunparse(new_pieces)
            # ``password`` is None for "user@host" URLs; send an empty
            # password rather than the literal text "None".
            password = url_pieces.password or ''
            auth = base64.standard_b64encode(f'{url_pieces.username}:{password}'.encode()).decode()

    # iri support
    if not isinstance(url, bytes):
        url = convert_to_idn(url)

    # Prevent UnicodeEncodeErrors caused by Unicode characters in the path.
    url = ''.join(c if ord(c) < 128 else urllib.parse.quote(c) for c in url)

    # try to open with urllib2 (to use optional headers)
    request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)
    opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))
    opener.addheaders = []  # RMK - must clear so we only send our custom User-Agent
    f = opener.open(request)
    try:
        data = f.read()
    finally:
        # Release the connection even when reading the body fails.
        f.close()

    # lowercase all of the HTTP headers for comparisons per RFC 2616
    result['headers'] = {k.lower(): v for k, v in f.headers.items()}
    content_encoding = result['headers'].get('content-encoding', '')

    # if feed is gzip-compressed, decompress it
    if data and 'gzip' in content_encoding:
        try:
            data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
        except (EOFError, IOError, struct.error, zlib.error) as e:
            # IOError can occur if the gzip header is bad.
            # struct.error and zlib.error can occur if the data is damaged.
            result['bozo'] = True
            result['bozo_exception'] = e
            if isinstance(e, (struct.error, zlib.error)):
                # A gzip header was found but the data is corrupt.
                # Ideally, we should re-request the feed without the
                # 'Accept-encoding: gzip' header, but we don't.
                data = None
    elif data and 'deflate' in content_encoding:
        try:
            data = zlib.decompress(data)
        except zlib.error:
            try:
                # The data may have no headers and no checksum.
                data = zlib.decompress(data, -15)
            except zlib.error as e:
                result['bozo'] = True
                result['bozo_exception'] = e

    # save HTTP headers
    if 'etag' in result['headers']:
        etag = result['headers'].get('etag', '')
        if isinstance(etag, bytes):
            etag = etag.decode('utf-8', 'ignore')
        if etag:
            result['etag'] = etag
    if 'last-modified' in result['headers']:
        modified = result['headers'].get('last-modified', '')
        if modified:
            result['modified'] = modified
            result['modified_parsed'] = _parse_date(modified)
    if isinstance(f.url, bytes):
        result['href'] = f.url.decode('utf-8', 'ignore')
    else:
        result['href'] = f.url
    # Handlers in this module store the HTTP code in ``status``; assume
    # 200 when the attribute is missing or empty.
    result['status'] = getattr(f, 'status', None) or 200

    # Stop processing if the server sent HTTP 304 Not Modified.
    if getattr(f, 'code', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'

    return data
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`# Copyright 2010-2022 Kurt McKee <contactme@kurtmckee.org>`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`# Copyright 2002-2008 Mark Pilgrim`
			`# All rights reserved.`
			`#`
			`# This file is a part of feedparser.`
			`#`
			`# Redistribution and use in source and binary forms, with or without`
			`# modification, are permitted provided that the following conditions are met:`
			`#`
			`# * Redistributions of source code must retain the above copyright notice,`
			`# this list of conditions and the following disclaimer.`
			`# * Redistributions in binary form must reproduce the above copyright notice,`
			`# this list of conditions and the following disclaimer in the documentation`
			`# and/or other materials provided with the distribution.`
			`#`
			`# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'`
			`# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE`
			`# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR`
			`# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF`
			`# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS`
			`# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN`
			`# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)`
			`# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE`
			`# POSSIBILITY OF SUCH DAMAGE.`

			`import base64`
			`import datetime`
			`import gzip`
			`import io`
			`import re`
			`import struct`
			`import urllib.parse`
			`import urllib.request`
			`import zlib`

			`from .datetimes import _parse_date`
			`from .urls import convert_to_idn`


			`# HTTP "Accept" header to send to servers when downloading feeds. If you don't`
			`# want to send an Accept header, set this to None.`
			`ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"`


Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`class URLHandler(urllib.request.HTTPDigestAuthHandler, urllib.request.HTTPRedirectHandler, urllib.request.HTTPDefaultErrorHandler):`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`def http_error_default(self, req, fp, code, msg, headers):`
			`# The default implementation just raises HTTPError.`
			`# Forget that.`
			`fp.status = code`
			`return fp`

			`def http_error_301(self, req, fp, code, msg, hdrs):`
			`result = urllib.request.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, hdrs)`
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`if not result:`
			`return fp`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`result.status = code`
			`result.newurl = result.geturl()`
			`return result`

			`# The default implementations in urllib.request.HTTPRedirectHandler`
			`# are identical, so hardcoding a http_error_301 call above`
			`# won't affect anything`
			`http_error_300 = http_error_301`
			`http_error_302 = http_error_301`
			`http_error_303 = http_error_301`
			`http_error_307 = http_error_301`

			`def http_error_401(self, req, fp, code, msg, headers):`
			`# Check if`
			`# - server requires digest auth, AND`
			`# - we tried (unsuccessfully) with basic auth, AND`
			`# If all conditions hold, parse authentication information`
			`# out of the Authorization header we sent the first time`
			`# (for the username and password) and the WWW-Authenticate`
			`# header the server sent back (for the realm) and retry`
			`# the request with the appropriate digest auth headers instead.`
			`# This evil genius hack has been brought to you by Aaron Swartz.`
			`host = urllib.parse.urlparse(req.get_full_url())[1]`
			`if 'Authorization' not in req.headers or 'WWW-Authenticate' not in headers:`
			`return self.http_error_default(req, fp, code, msg, headers)`
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`auth = base64.decodebytes(req.headers['Authorization'].split(' ')[1].encode()).decode()`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`user, passw = auth.split(':')`
			`realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]`
			`self.add_password(realm, host, user, passw)`
			`retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)`
			`self.reset_retry_count()`
			`return retry`


			`def _build_urllib2_request(url, agent, accept_header, etag, modified, referrer, auth, request_headers):`
			`request = urllib.request.Request(url)`
			`request.add_header('User-Agent', agent)`
			`if etag:`
			`request.add_header('If-None-Match', etag)`
			`if isinstance(modified, str):`
			`modified = _parse_date(modified)`
			`elif isinstance(modified, datetime.datetime):`
			`modified = modified.utctimetuple()`
			`if modified:`
			`# format into an RFC 1123-compliant timestamp. We can't use`
			`# time.strftime() since the %a and %b directives can be affected`
			`# by the current locale, but RFC 2616 states that dates must be`
			`# in English.`
			`short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']`
			`months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']`
			`request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))`
			`if referrer:`
			`request.add_header('Referer', referrer)`
			`request.add_header('Accept-encoding', 'gzip, deflate')`
			`if auth:`
			`request.add_header('Authorization', 'Basic %s' % auth)`
			`if accept_header:`
			`request.add_header('Accept', accept_header)`
			`# use this for whatever -- cookies, special headers, etc`
			`# [('Cookie','Something'),('x-special-header','Another Value')]`
			`for header_name, header_value in request_headers.items():`
			`request.add_header(header_name, header_value)`
			`request.add_header('A-IM', 'feed') # RFC 3229 support`
			`return request`


			`def get(url, etag=None, modified=None, agent=None, referrer=None, handlers=None, request_headers=None, result=None):`
			`if handlers is None:`
			`handlers = []`
			`elif not isinstance(handlers, list):`
			`handlers = [handlers]`
			`if request_headers is None:`
			`request_headers = {}`

			`# Deal with the feed URI scheme`
			`if url.startswith('feed:http'):`
			`url = url[5:]`
			`elif url.startswith('feed:'):`
			`url = 'http:' + url[5:]`
			`if not agent:`
			`from . import USER_AGENT`
			`agent = USER_AGENT`
			`# Test for inline user:password credentials for HTTP basic auth`
			`auth = None`
			`if not url.startswith('ftp:'):`
			`url_pieces = urllib.parse.urlparse(url)`
			`if url_pieces.username:`
			`new_pieces = list(url_pieces)`
			`new_pieces[1] = url_pieces.hostname`
			`if url_pieces.port:`
			`new_pieces[1] = f'{url_pieces.hostname}:{url_pieces.port}'`
			`url = urllib.parse.urlunparse(new_pieces)`
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`auth = base64.standard_b64encode(f'{url_pieces.username}:{url_pieces.password}'.encode()).decode()`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
			`# iri support`
			`if not isinstance(url, bytes):`
			`url = convert_to_idn(url)`

Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`# Prevent UnicodeEncodeErrors caused by Unicode characters in the path.`
			`bits = []`
			`for c in url:`
			`try:`
			`c.encode('ascii')`
			`except UnicodeEncodeError:`
			`bits.append(urllib.parse.quote(c))`
			`else:`
			`bits.append(c)`
			`url = ''.join(bits)`

Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`# try to open with urllib2 (to use optional headers)`
			`request = _build_urllib2_request(url, agent, ACCEPT_HEADER, etag, modified, referrer, auth, request_headers)`
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`opener = urllib.request.build_opener(*tuple(handlers + [URLHandler()]))`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00			`opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent`
			`f = opener.open(request)`
			`data = f.read()`
			`f.close()`

			`# lowercase all of the HTTP headers for comparisons per RFC 2616`
			`result['headers'] = {k.lower(): v for k, v in f.headers.items()}`

			`# if feed is gzip-compressed, decompress it`
			`if data and 'gzip' in result['headers'].get('content-encoding', ''):`
			`try:`
			`data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()`
			`except (EOFError, IOError, struct.error) as e:`
			`# IOError can occur if the gzip header is bad.`
			`# struct.error can occur if the data is damaged.`
			`result['bozo'] = True`
			`result['bozo_exception'] = e`
			`if isinstance(e, struct.error):`
			`# A gzip header was found but the data is corrupt.`
			`# Ideally, we should re-request the feed without the`
			`# 'Accept-encoding: gzip' header, but we don't.`
			`data = None`
			`elif data and 'deflate' in result['headers'].get('content-encoding', ''):`
			`try:`
			`data = zlib.decompress(data)`
			`except zlib.error:`
			`try:`
			`# The data may have no headers and no checksum.`
			`data = zlib.decompress(data, -15)`
			`except zlib.error as e:`
			`result['bozo'] = True`
			`result['bozo_exception'] = e`

			`# save HTTP headers`
			`if 'etag' in result['headers']:`
			`etag = result['headers'].get('etag', '')`
			`if isinstance(etag, bytes):`
			`etag = etag.decode('utf-8', 'ignore')`
			`if etag:`
			`result['etag'] = etag`
			`if 'last-modified' in result['headers']:`
			`modified = result['headers'].get('last-modified', '')`
			`if modified:`
			`result['modified'] = modified`
			`result['modified_parsed'] = _parse_date(modified)`
			`if isinstance(f.url, bytes):`
			`result['href'] = f.url.decode('utf-8', 'ignore')`
			`else:`
			`result['href'] = f.url`
Update feedparser 6.0.1 (98d189fa) → 6.0.10 (5fcb3ae). 2023-01-13 20:16:45 +00:00			`result['status'] = getattr(f, 'status', None) or 200`
Change bump to major version 3.xx to signal that this branch supports Python3+ only. Initial SickGear for Python 3. 2023-01-12 01:04:47 +00:00
			`# Stop processing if the server sent HTTP 304 Not Modified.`
			`if getattr(f, 'code', 0) == 304:`
			`result['version'] = ''`
			`result['debug_message'] = 'The feed has not changed since you last checked, ' + \`
			`'so the server sent no data. This is a feature, not a bug!'`

			`return data`