fix a bunch of mypy errors

Nick Sweeting 2019-03-30 20:49:45 -04:00
parent 03ed5d668b
commit f4e018ba0c
3 changed files with 53 additions and 43 deletions

View file

@@ -4,7 +4,7 @@ from typing import Dict, List, Tuple, Optional
 from collections import defaultdict
 from datetime import datetime
-from .schema import Link, ArchiveResult
+from .schema import Link, ArchiveResult, ArchiveOutput
 from .index import (
     write_link_index,
     patch_links_index,
@@ -159,13 +159,13 @@ def should_fetch_title(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_title(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """try to guess the page's title from its content"""
-    output = None
+    output: ArchiveOutput = None
     cmd = [
         CURL_BINARY,
         link.url,
         '|',
         'grep',
-        '<title>',
+        '<title',
     ]
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
@@ -191,6 +191,7 @@ def fetch_title(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT)
 @enforce_types
 def should_fetch_favicon(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
         return False
@@ -200,13 +201,14 @@ def should_fetch_favicon(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_favicon(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""
-    output = 'favicon.ico'
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'favicon.ico'
     cmd = [
         CURL_BINARY,
         '--max-time', str(timeout),
         '--location',
-        '--output', output,
-        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        '--output', str(output),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
     ]
     status = 'succeeded'
@@ -232,6 +234,7 @@ def fetch_favicon(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT
 @enforce_types
 def should_fetch_wget(link: Link, link_dir: Optional[str]=None) -> bool:
     output_path = wget_output_path(link)
+    link_dir = link_dir or link.link_dir
     if output_path and os.path.exists(os.path.join(link_dir, output_path)):
         return False
@@ -242,13 +245,14 @@ def should_fetch_wget(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_wget(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using wget"""
+    link_dir = link_dir or link.link_dir
     if FETCH_WARC:
         warc_dir = os.path.join(link_dir, 'warc')
         os.makedirs(warc_dir, exist_ok=True)
         warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
     # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
-    output = None
+    output: ArchiveOutput = None
     cmd = [
         WGET_BINARY,
         # '--server-response', # print headers for better error parsing
@@ -262,13 +266,13 @@ def fetch_wget(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -
         '-e', 'robots=off',
         '--restrict-file-names=unix',
         '--timeout={}'.format(timeout),
-        *(() if FETCH_WARC else ('--timestamping',)),
-        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
-        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
-        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
-        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
-        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
-        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
+        *([] if FETCH_WARC else ['--timestamping']),
+        *(['--warc-file={}'.format(warc_path)] if FETCH_WARC else []),
+        *(['--page-requisites'] if FETCH_WGET_REQUISITES else []),
+        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
+        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
+        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
+        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
         link.url,
     ]
     status = 'succeeded'
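
The tuple-to-list rewrites in the hunk above are the splat pattern this commit converts throughout: when one branch of a conditional expression is an empty tuple and the other a non-empty one, the branches are inferred as different fixed-length tuple types, which mypy has a harder time reconciling than two list branches that both come out as List[str]. A minimal standalone sketch of the pattern, using a made-up flag and command rather than the project's real config:

from typing import List

CHECK_SSL_VALIDITY = False  # stand-in for the real config flag

def build_cmd(url: str, timeout: int) -> List[str]:
    # Each conditional splat contributes either [] or a short List[str],
    # so the whole literal type-checks cleanly as List[str].
    return [
        'wget',
        '--timeout={}'.format(timeout),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        url,
    ]

print(build_cmd('https://example.com', 60))
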
@@ -320,6 +324,7 @@ def fetch_wget(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -
 @enforce_types
 def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if is_static_file(link.url):
         return False
@@ -333,7 +338,8 @@ def should_fetch_pdf(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_pdf(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """print PDF of site to file using chrome --headless"""
-    output = 'output.pdf'
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'output.pdf'
     cmd = [
         *chrome_args(TIMEOUT=timeout),
         '--print-to-pdf',
@@ -366,6 +372,7 @@ def fetch_pdf(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
 @enforce_types
 def should_fetch_screenshot(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if is_static_file(link.url):
         return False
@@ -377,8 +384,9 @@ def should_fetch_screenshot(link: Link, link_dir: Optional[str]=None) -> bool:
 @enforce_types
 def fetch_screenshot(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """take screenshot of site using chrome --headless"""
-    output = 'screenshot.png'
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'screenshot.png'
     cmd = [
         *chrome_args(TIMEOUT=timeout),
         '--screenshot',
@@ -411,6 +419,7 @@ def fetch_screenshot(link: Link, link_dir: Optional[str]=None, timeout: int=TIME
 @enforce_types
 def should_fetch_dom(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if is_static_file(link.url):
         return False
@@ -423,8 +432,9 @@ def should_fetch_dom(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_dom(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """print HTML of site to file using chrome --dump-html"""
-    output = 'output.html'
-    output_path = os.path.join(link_dir, output)
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'output.html'
+    output_path = os.path.join(link_dir, str(output))
     cmd = [
         *chrome_args(TIMEOUT=timeout),
         '--dump-dom',
@@ -458,6 +468,7 @@ def fetch_dom(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
 @enforce_types
 def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if is_static_file(link.url):
         return False
@@ -478,15 +489,16 @@ def should_fetch_git(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download full site using git"""
-    output = 'git'
-    output_path = os.path.join(link_dir, 'git')
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'git'
+    output_path = os.path.join(link_dir, str(output))
     os.makedirs(output_path, exist_ok=True)
     cmd = [
         GIT_BINARY,
         'clone',
         '--mirror',
         '--recursive',
-        *(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
+        *([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
         without_query(without_fragment(link.url)),
     ]
     status = 'succeeded'
@@ -519,6 +531,8 @@ def fetch_git(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) ->
 @enforce_types
 def should_fetch_media(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if is_static_file(link.url):
         return False
@@ -531,8 +545,9 @@ def should_fetch_media(link: Link, link_dir: Optional[str]=None) -> bool:
 def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
     """Download playlists or individual video, audio, and subtitles using youtube-dl"""
-    output = 'media'
-    output_path = os.path.join(link_dir, 'media')
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'media'
+    output_path = os.path.join(link_dir, str(output))
     os.makedirs(output_path, exist_ok=True)
     cmd = [
         YOUTUBEDL_BINARY,
@@ -553,7 +568,7 @@ def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIM
         '--audio-quality', '320K',
         '--embed-thumbnail',
         '--add-metadata',
-        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
+        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
         link.url,
     ]
     status = 'succeeded'
@@ -593,6 +608,7 @@ def fetch_media(link: Link, link_dir: Optional[str]=None, timeout: int=MEDIA_TIM
 @enforce_types
 def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bool:
+    link_dir = link_dir or link.link_dir
     if is_static_file(link.url):
         return False
@@ -606,7 +622,8 @@ def should_fetch_archive_dot_org(link: Link, link_dir: Optional[str]=None) -> bo
 def archive_dot_org(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """submit site to archive.org for archiving via their service, save returned archive url"""
-    output = 'archive.org.txt'
+    link_dir = link_dir or link.link_dir
+    output: ArchiveOutput = 'archive.org.txt'
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     cmd = [
@@ -615,7 +632,7 @@ def archive_dot_org(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEO
         '--head',
         '--user-agent', 'ArchiveBox/{} (+https://github.com/pirate/ArchiveBox/)'.format(VERSION), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
         '--max-time', str(timeout),
-        *(() if CHECK_SSL_VALIDITY else ('--insecure',)),
+        *([] if CHECK_SSL_VALIDITY else ['--insecure']),
         submit_url,
     ]
     status = 'succeeded'
@@ -638,13 +655,13 @@ def archive_dot_org(link: Link, link_dir: Optional[str]=None, timeout: int=TIMEO
     finally:
         timer.end()
-    if not isinstance(output, Exception):
+    if output and not isinstance(output, Exception):
         # instead of writing None when archive.org rejects the url write the
         # url to resubmit it to archive.org. This is so when the user visits
         # the URL in person, it will attempt to re-archive it, and it'll show the
         # nicer error message explaining why the url was rejected if it fails.
         archive_org_url = archive_org_url or submit_url
-        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
+        with open(os.path.join(link_dir, str(output)), 'w', encoding='utf-8') as f:
             f.write(archive_org_url)
         chmod_file('archive.org.txt', cwd=link_dir)
         output = archive_org_url
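
The other change repeated through this file is the output: ArchiveOutput annotation plus the str(output) casts. The diff only shows ArchiveOutput being imported from .schema; judging from how it is used here (assigned None, a relative filename, or an Exception, and stringified before reaching os.path.join), it is presumably a union along these lines. A small sketch under that assumption:

import os
from typing import Union

# Assumed definition; the real alias lives in .schema and is not part of this diff.
ArchiveOutput = Union[str, Exception, None]

def fake_fetch(link_dir: str, ok: bool) -> ArchiveOutput:
    # Annotating up front lets the same name hold None, a filename, or an
    # Exception without mypy flagging each reassignment.
    output: ArchiveOutput = None
    if ok:
        output = 'output.html'
        # os.path.join() expects str, not the whole union, hence the str() casts above.
        print(os.path.join(link_dir, str(output)))
    else:
        output = Exception('fetch failed')
    return output

print(fake_fetch('/tmp/example', ok=True))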

View file

@@ -3,13 +3,10 @@ import re
 import sys
 import shutil
-from typing import Optional, Pattern
+from typing import Optional
 from subprocess import run, PIPE, DEVNULL
-OUTPUT_DIR: str
-URL_BLACKLIST: Optional[Pattern[str]]
 # ******************************************************************************
 # Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
 # Use the 'env' command to pass config options to ArchiveBox. e.g.:
@@ -48,6 +45,7 @@ COOKIES_FILE = os.getenv('COOKIES_FILE', None)
 CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
 CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
 CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
+CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True' ).lower() == 'true'
 USE_CURL = os.getenv('USE_CURL', 'True' ).lower() == 'true'
 USE_WGET = os.getenv('USE_WGET', 'True' ).lower() == 'true'
@@ -59,12 +57,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
 YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
 CHROME_BINARY = os.getenv('CHROME_BINARY', None)
-CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
-try:
-    OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
-except Exception:
-    OUTPUT_DIR = None
 # ******************************************************************************
@@ -103,7 +96,7 @@ TEMPLATES_DIR = os.path.join(PYTHON_DIR, 'templates')
 if COOKIES_FILE:
     COOKIES_FILE = os.path.abspath(COOKIES_FILE)
-URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
+URL_BLACKLIST_PTN = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST else None
 ########################### Environment & Dependencies #########################
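
The URL_BLACKLIST change just above applies the same idea at module level: instead of rebinding URL_BLACKLIST from an environment string to a compiled regex (one name, two types), the compiled pattern gets its own name, so each variable keeps a single type. A standalone sketch of the idea, using a bare os.getenv call rather than the project's config loader:

import os
import re
from typing import Optional, Pattern

URL_BLACKLIST: Optional[str] = os.getenv('URL_BLACKLIST')  # raw string, or None if unset
URL_BLACKLIST_PTN: Optional[Pattern[str]] = (
    re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST else None
)

def not_blacklisted(url: str) -> bool:
    # The truthiness check narrows Optional[Pattern[str]] to Pattern[str],
    # so calling .match() is safe as far as mypy is concerned.
    return not URL_BLACKLIST_PTN.match(url) if URL_BLACKLIST_PTN else True

print(not_blacklisted('https://example.com'))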
@@ -147,7 +140,7 @@ def bin_version(binary: str) -> str:
         raise SystemExit(1)
-def find_chrome_binary() -> Optional[str]:
+def find_chrome_binary() -> str:
     """find any installed chrome binaries in the default locations"""
     # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
     # make sure data dir finding precedence order always matches binary finding order
@@ -244,7 +237,7 @@ try:
     else:
         FETCH_PDF = FETCH_SCREENSHOT = FETCH_DOM = False
-    if CHROME_BINARY is None:
+    if not CHROME_BINARY:
         CHROME_BINARY = find_chrome_binary() or 'chromium-browser'
     CHROME_VERSION = None
     if USE_CHROME:

View file

@@ -8,7 +8,7 @@ from .util import (
     merge_links,
 )
-from .config import URL_BLACKLIST
+from .config import URL_BLACKLIST_PTN
 def validate_links(links: Iterable[Link]) -> Iterable[Link]:
@@ -26,7 +26,7 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
     """remove chrome://, about:// or other schemed links that cant be archived"""
     for link in links:
         scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
-        not_blacklisted = (not URL_BLACKLIST.match(link.url)) if URL_BLACKLIST else True
+        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
         if scheme_is_valid and not_blacklisted:
             yield link