SickGear/lib/cfscrape/__init__.py

210 lines
8.9 KiB
Python
Raw Normal View History

from requests.exceptions import RequestException
from requests.models import Response
from requests.sessions import Session
import logging
import random
import re
import time
from _23 import b64encodestring, urlparse
# Browser User-Agent strings, one is picked at random by CloudflareScraper
# when the session still carries the stock `requests` User-Agent
DEFAULT_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/41.0.2228.0 Safari/537.36',

    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/50.0.2661.102 Safari/537.36',

    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Chrome/52.0.2743.116 Safari/537.36',

    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0)'
    ' Gecko/20100101 Firefox/46.0',

    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:41.0)'
    ' Gecko/20100101 Firefox/41.0',
]
class CloudflareError(RequestException):
    """
    Error raised when a Cloudflare captcha page is encountered.
    """
class CloudflareScraper(Session):
    """
    A `requests` Session that looks for anti-bot challenge pages in responses and tries to get past them.

    Cloudflare challenges are handed to a FlareSolverr service that is reached over HTTP.
    Pages containing a `ddgu` marker (presumably DDoS-Guard) are answered by posting the form found in the page.
    """

    def __init__(self, **kwargs):
        """
        :param kwargs: `delay` sets the seconds to wait before answering a challenge (default 8),
            any other keyword is ignored
        """
        super(CloudflareScraper, self).__init__()

        if 'requests' in self.headers['User-Agent']:
            # Set a random User-Agent if no custom User-Agent has been set
            self.headers['User-Agent'] = random.choice(DEFAULT_USER_AGENTS)
        self.cf_ua = self.headers['User-Agent']

        self.default_delay = 8
        self.delay = kwargs.pop('delay', self.default_delay)
        # time that the most recent challenge response was received, used by wait()
        self.start_time = None

        # do not pick up proxy or auth configuration from the environment
        self.trust_env = False

    def request(self, method, url, *args, **kwargs):
        """
        Send a request and, if the response looks like a challenge page, try to solve the challenge.

        Two extra keywords are consumed here and never reach `requests`:
        `url_solver` is the address of the FlareSolverr service, and
        `proxy_browser` when truthy fetches the url through FlareSolverr instead of directly.

        :return: the response, replaced by the outcome of a solve attempt when a challenge was detected
        """
        url_solver = kwargs.pop('url_solver', None)

        if not kwargs.pop('proxy_browser', None):
            resp = super(CloudflareScraper, self).request(method, url, *args, **kwargs)
        else:
            resp = self.get_content(method, url, url_solver,
                                    user_agent=self.headers.get('User-Agent'), proxy_browser=True, **kwargs)

        # get_content may return None, therefore check the type before using the response
        if isinstance(resp, Response) and resp.status_code in (503, 429, 403):
            self.start_time = time.time()
            if (re.search('(?i)cloudflare', resp.headers.get('Server', ''))
                    and b'jschl_vc' in resp.content
                    and b'jschl_answer' in resp.content):
                resp = self.solve_cf_challenge(resp, url_solver, **kwargs)
            elif b'ddgu' in resp.content:
                resp = self.solve_ddg_challenge(resp, **kwargs)

        return resp

    def wait(self):
        """
        Sleep for whatever remains of `self.delay` seconds since `self.start_time`.

        The full delay is used if no start time has been recorded yet.
        """
        elapsed = 0 if None is self.start_time else time.time() - self.start_time
        time.sleep(max(0, self.delay - elapsed))  # required delay before solving the challenge

    def solve_ddg_challenge(self, resp, **original_kwargs):
        """
        Answer a `ddgu` challenge by posting to the action url of the `frm` form in the challenge page.

        This is best effort, any failure leaves the challenge unanswered.

        :param resp: the challenge response
        :param original_kwargs: keywords of the request that was challenged, reused for the POST except `hooks`
        :return: response to the POST, or the given `resp` if the challenge could not be answered
        """
        parsed_url = urlparse(resp.url)
        try:
            submit_url = parsed_url.scheme + ':' + re.findall('"frm"[^>]+?action="([^"]+)"', resp.text)[0]
            kwargs = {k: v for k, v in original_kwargs.items() if 'hooks' != k}
            kwargs.setdefault('headers', {})
            kwargs.setdefault('data', dict(
                h=b64encodestring('%s://%s' % (parsed_url.scheme, parsed_url.hostname)),
                u=b64encodestring(parsed_url.path),
                # urlparse gives the port as an int, format it as text like the other encoded fields
                p=b64encodestring('%s' % (parsed_url.port or ''))
            ))
            self.wait()
            resp = self.request('POST', submit_url, **kwargs)
        except Exception as e:
            # only ordinary errors are suppressed so that KeyboardInterrupt and SystemExit still propagate
            logging.debug('Challenge at [%s] was not solved: %r', resp.url, e)
        return resp

    def test_flaresolverr(self, url_solver):
        """
        Test if FlareSolverr software is running.

        :param url_solver: base address of the FlareSolverr service
        :return: the truthy `version` value that the service reports
        :raises ValueError: if the answer is not a 200 response carrying a JSON object with a `version`
        """
        response_test = super(CloudflareScraper, self).request('GET', url_solver)
        fs_ver = None
        if 200 == response_test.status_code:
            json_data = response_test.json()
            if isinstance(json_data, dict):
                fs_ver = json_data.get('version') or None
        if None is fs_ver:
            raise ValueError('FlareSolverr software not found (is it running?)')
        return fs_ver

    def get_content(self, method, url, url_solver, user_agent, proxy_browser=False, **kwargs):
        """
        Ask FlareSolverr to request a url and copy the cookies of its solution into this session.

        :param method: HTTP method, sent lowercased as the FlareSolverr `request.<method>` command
        :param url: the url for FlareSolverr to fetch
        :param url_solver: address of FlareSolverr, trailing `/` and `v1` parts are removed,
            `http://localhost:8191` is used if this is empty
        :param user_agent: sent as `userAgent` only if the reported FlareSolverr version contains `v1`
        :param proxy_browser: selects the wording of error messages
        :param kwargs: accepted so that request keywords can be passed straight through, not used
        :return: the FlareSolverr response, or None if FlareSolverr answered without an `ok` status
        :raises ValueError: if FlareSolverr is not found, cannot be queried, or reports an error
        """
        url_solver = url_solver and re.sub(r'(/|v1)*$', '', url_solver) or 'http://localhost:8191'

        # probe the service once and reuse the reported version
        fs_ver = self.test_flaresolverr(url_solver)
        if not fs_ver:
            raise ValueError('No FlareSolverr software running %sat %s' % (('to solve Cloudflare challenge ',
                                                                           '')[proxy_browser], url_solver))
        try:
            params = dict(userAgent=user_agent) if 'v1' in fs_ver else {}
            params.update(dict(
                cmd='request.%s' % method.lower(), url=url,
                cookies=[{'name': cur_ckee.name, 'value': cur_ckee.value,
                          'domain': cur_ckee.domain, 'path': cur_ckee.path} for cur_ckee in self.cookies]))
            response = super(CloudflareScraper, self).request('POST', '%s/v1' % url_solver, json=params)
        except Exception as e:
            raise ValueError('FlareSolverr software unable to %s: %r' % (('solve Cloudflare anti-bot IUAM challenge',
                                                                         'fetch content')[proxy_browser], e))

        data_json = response.json()
        # anything other than a JSON object carries no status or solution
        result = data_json if isinstance(data_json, dict) else {}
        if response.ok:
            if 'ok' == result.get('status'):
                self.cookies.clear()
                for cur_ckee in (result.get('solution') or {}).get('cookies') or []:
                    if cur_ckee.get('value') and cur_ckee.get('name') not in ('', None, '_gid', '_ga', '_gat'):
                        # attributes that are absent are left out so that the cookie jar defaults apply
                        attrs = {k: cur_ckee[k] for k in ('expires', 'domain', 'path', 'secure')
                                 if None is not cur_ckee.get(k)}
                        self.cookies.set(
                            cur_ckee['name'], cur_ckee['value'],
                            rest={'httpOnly': cur_ckee.get('httpOnly'), 'session': cur_ckee.get('session')},
                            **attrs)
            else:
                response = None
        elif 'error' == result.get('status'):
            raise ValueError('Failure with FlareSolverr: %s' % result.get('message', 'See the FlareSolver output'))

        return response

    def solve_cf_challenge(self, resp, url_solver, **original_kwargs):
        """
        Pass a Cloudflare challenge to FlareSolverr.

        :param resp: the challenge response
        :param url_solver: address of the FlareSolverr service
        :param original_kwargs: keywords of the request that was challenged, not used
        :return: the response that get_content returns
        :raises CloudflareError: if the challenge page is a captcha
        :raises ValueError: if get_content returns nothing
        """
        body = resp.text
        parsed_url = urlparse(resp.url)
        domain = parsed_url.netloc

        if '/cdn-cgi/l/chk_captcha' in body or 'cf_chl_captcha' in body:
            raise CloudflareError(
                'Cloudflare captcha presented for %s, safe to ignore as this shouldn\'t happen every time, ua: %s' %
                (domain, self.cf_ua), response=resp)

        # a challenged POST is not replayed, the site root is fetched instead
        if 'POST' == resp.request.method:
            fetch_url = '%s://%s/' % (parsed_url.scheme, domain)
        else:
            fetch_url = resp.request.url

        final_response = self.get_content(
            'GET', fetch_url, url_solver, user_agent=resp.request.headers.get('User-Agent'))
        if None is final_response:
            raise ValueError('Failed to validate Cloudflare anti-bot IUAM challenge')

        return final_response

    @classmethod
    def create_scraper(cls, sess=None, **kwargs):
        """
        Convenience function for creating a ready-to-go CloudflareScraper object.

        :param sess: optional session, its truthy auth, cert, cookies, headers, hooks, params, proxies
            and data attributes are copied to the new scraper
        :param kwargs: passed to the CloudflareScraper constructor
        """
        scraper = cls(**kwargs)

        if sess:
            for attr in ('auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data'):
                val = getattr(sess, attr, None)
                if val:
                    setattr(scraper, attr, val)

        return scraper

    # Functions for integrating cloudflare-scrape with other applications and scripts
    @classmethod
    def get_tokens(cls, url, user_agent=None, **kwargs):
        """
        Fetch a url with a new scraper and return the Cloudflare cookies that were collected.

        :param url: the url to fetch
        :param user_agent: optional User-Agent to use in place of the scraper default
        :param kwargs: passed to the GET request
        :return: tuple of (dict with the `__cfduid` and `cf_clearance` cookie values, User-Agent used)
        :raises ValueError: if no cookie domain that starts with a dot matches the domain of the response
        """
        scraper = cls.create_scraper()
        if user_agent:
            scraper.headers['User-Agent'] = user_agent

        try:
            resp = scraper.get(url, **kwargs)
            resp.raise_for_status()
        except Exception:
            logging.error('[%s] returned an error. Could not collect tokens.', url)
            raise

        domain = urlparse(resp.url).netloc

        for d in scraper.cookies.list_domains():
            # substring test, so that `.example.com` matches a response from `www.example.com`
            if d.startswith('.') and d in '.' + domain:
                cookie_domain = d
                break
        else:
            raise ValueError('Unable to find Cloudflare cookies.'
                             ' Does the site actually have Cloudflare IUAM (\'I\'m Under Attack Mode\') enabled?')

        return (
            {'__cfduid': scraper.cookies.get('__cfduid', '', domain=cookie_domain),
             'cf_clearance': scraper.cookies.get('cf_clearance', '', domain=cookie_domain)},
            scraper.headers['User-Agent'])

    @classmethod
    def get_cookie_string(cls, url, user_agent=None, **kwargs):
        """
        Convenience function for building a Cookie HTTP header value.

        :return: tuple of (`name=value` pairs from get_tokens joined with `; `, User-Agent used)
        """
        tokens, user_agent = cls.get_tokens(url, user_agent=user_agent, **kwargs)
        return '; '.join(['='.join(pair) for pair in tokens.items()]), user_agent
# Module level shortcuts to the CloudflareScraper class methods
create_scraper, get_tokens, get_cookie_string = (
    CloudflareScraper.create_scraper,
    CloudflareScraper.get_tokens,
    CloudflareScraper.get_cookie_string,
)