# cfscrape.py - Cloudflare anti-bot ("I'm Under Attack Mode") challenge solver
# Vendored in SickGear/lib; derived from the cloudflare-scrape project
# (https://github.com/Anorov/cloudflare-scrape).
# Standard library
import logging
import random
import re
from copy import deepcopy
from time import sleep

# Third-party
import js2py
from requests.models import Response
from requests.sessions import Session

try:
    # Python 2
    from urlparse import urlparse
except ImportError:
    # Python 3
    # noinspection PyCompatibility
    from urllib.parse import urlparse
# Browser User-Agent strings to impersonate; requests' default UA is a
# well-known bot signature that Cloudflare blocks outright.
DEFAULT_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/50.0.2661.102 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/52.0.2743.116 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0)'
    ' Gecko/20100101 Firefox/46.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0)'
    ' Gecko/20100101 Firefox/41.0',
]

# One UA is chosen per process at import time so a session looks consistent.
DEFAULT_USER_AGENT = random.choice(DEFAULT_USER_AGENTS)
class CloudflareScraper(Session):
    """A requests.Session subclass that transparently detects Cloudflare
    IUAM ("I'm Under Attack Mode") challenge pages, solves the JavaScript
    arithmetic challenge with js2py, and retries the original request.
    """

    def __init__(self):
        super(CloudflareScraper, self).__init__()

        if 'requests' in self.headers['User-Agent']:
            # Spoof a real browser if no custom User-Agent has been set;
            # requests' default UA is rejected by Cloudflare.
            self.headers['User-Agent'] = DEFAULT_USER_AGENT

    def request(self, method, url, *args, **kwargs):
        """Perform the request; solve the Cloudflare challenge first if the
        response is a challenge page instead of the real resource."""
        resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)

        # Check if Cloudflare anti-bot is on: a 503 served by a Cloudflare
        # server whose body contains the jschl_vc/jschl_answer form fields.
        # (`str` here, not `basestring`: basestring is a Python 2 builtin and
        # raises NameError on Python 3.)
        if (isinstance(resp, type(Response())) and isinstance(resp.headers.get('Server'), str)
                and 503 == resp.status_code
                and re.search('(?i)cloudflare', resp.headers.get('Server'))
                and b'jschl_vc' in resp.content
                and b'jschl_answer' in resp.content):
            return self.solve_cf_challenge(resp, **kwargs)

        # Otherwise, no Cloudflare anti-bot detected
        return resp

    def solve_cf_challenge(self, resp, **original_kwargs):
        """Extract the challenge from `resp`, compute the answer, submit it,
        then replay the original request against the redirect target."""
        sleep(5)  # Cloudflare requires a delay before solving the challenge

        body = resp.text
        parsed_url = urlparse(resp.url)
        domain = parsed_url.netloc
        submit_url = '%s://%s/cdn-cgi/l/chk_jschl' % (parsed_url.scheme, domain)

        # Don't propagate response hooks to the challenge submission request.
        cloudflare_kwargs = {k: v for k, v in original_kwargs.items() if k not in ['hooks']}
        params = cloudflare_kwargs.setdefault('params', {})
        headers = cloudflare_kwargs.setdefault('headers', {})
        headers['Referer'] = resp.url

        try:
            params['jschl_vc'] = re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)
            params['pass'] = re.search(r'name="pass" value="(.+?)"', body).group(1)

            # Extract the arithmetic operation
            js = self.extract_js(body)
        except Exception:
            # Something is wrong with the page.
            # This may indicate Cloudflare has changed their anti-bot
            # technique. If you see this and are running the latest version,
            # please open a GitHub issue so I can update the code accordingly.
            logging.error('[!] Unable to parse Cloudflare anti-bots page. '
                          'Try upgrading cloudflare-scrape, or submit a bug report '
                          'if you are running the latest version. Please read '
                          'https://github.com/Anorov/cloudflare-scrape#updates '
                          'before submitting a bug report.')
            raise

        # Safely evaluate the Javascript expression; the page's own script
        # adds the hostname length to the computed number.
        params['jschl_answer'] = str(js2py.eval_js(js) + len(domain))

        # Requests transforms any request into a GET after a redirect,
        # so the redirect has to be handled manually here to allow for
        # performing other types of requests even as the first request.
        method = resp.request.method
        cloudflare_kwargs['allow_redirects'] = False
        redirect = self.request(method, submit_url, **cloudflare_kwargs)
        return self.request(method, redirect.headers['Location'], **original_kwargs)

    @staticmethod
    def extract_js(body):
        """Pull the challenge's arithmetic snippet out of the page HTML and
        strip everything except the pure expression before evaluation."""
        js = re.search(r'setTimeout\(function\(\){\s+(var '
                       's,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n', body).group(1)
        # Reduce `a.value = <expr> + <host>.length` to just `<expr>`.
        js = re.sub(r'a\.value\s=\s([+]?.+?)\s?\+\s?[^\.]+\.length.*', r'\1', js)
        js = re.sub(r'a\.value\s=\s(parseInt\(.+?\)).+', r'\1', js)
        # Drop DOM-touching statements that js2py cannot execute.
        js = re.sub(r'\s{3,}[a-z](?: = |\.).+', '', js)
        js = re.sub(r';\s+;', ';', js)

        # Strip characters that could be used to exit the string context.
        # These characters are not currently used in Cloudflare's arithmetic snippet.
        js = re.sub(r'[\n\\"]', '', js)

        return js

    @classmethod
    def create_scraper(cls, sess=None):
        """
        Convenience function for creating a ready-to-go requests.Session (subclass) object.
        """
        scraper = cls()

        if sess:
            # Carry over the caller's session configuration, if supplied.
            attrs = ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']
            for attr in attrs:
                val = getattr(sess, attr, None)
                if val:
                    setattr(scraper, attr, val)

        return scraper

    # Functions for integrating cloudflare-scrape with other applications and scripts

    @classmethod
    def get_tokens(cls, url, user_agent=None):
        """Fetch `url`, solving any challenge, and return a 2-tuple of
        ({'__cfduid': ..., 'cf_clearance': ...}, user_agent_used).

        Raises ValueError when no Cloudflare cookie domain is found.
        """
        scraper = cls.create_scraper()
        if user_agent:
            scraper.headers['User-Agent'] = user_agent

        # noinspection PyUnusedLocal
        try:
            resp = scraper.get(url)
            resp.raise_for_status()
        except Exception as e:
            logging.error('[%s] returned an error. Could not collect tokens.' % url)
            raise

        domain = urlparse(resp.url).netloc
        # Find the wildcard cookie domain matching the final URL's host.
        for d in scraper.cookies.list_domains():
            if d.startswith('.') and d in ('.' + domain):
                cookie_domain = d
                break
        else:
            raise ValueError('Unable to find Cloudflare cookies.'
                             ' Does the site actually have Cloudflare IUAM (\'I\'m Under Attack Mode\') enabled?')

        return ({'__cfduid': scraper.cookies.get('__cfduid', '', domain=cookie_domain),
                 'cf_clearance': scraper.cookies.get('cf_clearance', '', domain=cookie_domain)
                 },
                scraper.headers['User-Agent'])

    @classmethod
    def get_cookie_string(cls, url, user_agent=None):
        """
        Convenience function for building a Cookie HTTP header value.
        """
        tokens, user_agent = cls.get_tokens(url, user_agent=user_agent)
        return '; '.join('='.join(pair) for pair in tokens.items()), user_agent
# Module-level aliases so callers can use the functional API
# (cfscrape.create_scraper(), .get_tokens(), .get_cookie_string())
# without referencing the class directly.
create_scraper = CloudflareScraper.create_scraper
get_tokens = CloudflareScraper.get_tokens
get_cookie_string = CloudflareScraper.get_cookie_string