diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 90727e8c..5a747187 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -74,6 +74,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)}, 'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)}, 'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)}, + 'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)}, 'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)}, 'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)}, 'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)}, @@ -104,6 +105,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'DEPENDENCY_CONFIG': { 'USE_CURL': {'type': bool, 'default': True}, 'USE_WGET': {'type': bool, 'default': True}, + 'USE_SINGLEFILE': {'type': bool, 'default': True}, 'USE_GIT': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True}, @@ -111,6 +113,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = { 'CURL_BINARY': {'type': str, 'default': 'curl'}, 'GIT_BINARY': {'type': str, 'default': 'git'}, 'WGET_BINARY': {'type': str, 'default': 'wget'}, + 'SINGLEFILE_BINARY': {'type': str, 'default': 'single-file'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'CHROME_BINARY': {'type': str, 'default': None}, }, @@ -249,6 +252,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = { 'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, + 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and (c['SAVE_SINGLEFILE'])}, + 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, + 
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, @@ -674,6 +680,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue: 'enabled': config['USE_WGET'], 'is_valid': bool(config['WGET_VERSION']), }, + 'SINGLEFILE_BINARY': { + 'path': bin_path(config['SINGLEFILE_BINARY']), + 'version': config['SINGLEFILE_VERSION'], + 'hash': bin_hash(config['SINGLEFILE_BINARY']), + 'enabled': config['USE_SINGLEFILE'], + 'is_valid': bool(config['SINGLEFILE_VERSION']), + }, 'GIT_BINARY': { 'path': bin_path(config['GIT_BINARY']), 'version': config['GIT_VERSION'], diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 284ce569..bdeae3d7 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -25,6 +25,7 @@ from ..logging_util import ( from .title import should_save_title, save_title from .favicon import should_save_favicon, save_favicon from .wget import should_save_wget, save_wget +from .singlefile import should_save_singlefile, save_singlefile from .pdf import should_save_pdf, save_pdf from .screenshot import should_save_screenshot, save_screenshot from .dom import should_save_dom, save_dom @@ -37,6 +38,7 @@ def get_default_archive_methods(): ('title', should_save_title, save_title), ('favicon', should_save_favicon, save_favicon), ('wget', should_save_wget, save_wget), + ('singlefile', should_save_singlefile, save_singlefile), ('pdf', should_save_pdf, save_pdf), ('screenshot', should_save_screenshot, save_screenshot), ('dom', should_save_dom, save_dom), diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py new file mode 100644 index 00000000..0c9718e4 --- /dev/null +++ b/archivebox/extractors/singlefile.py @@ -0,0 +1,81 @@ +__package__ = 'archivebox.extractors' + +import os +from pathlib import Path + 
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..system import run, chmod_file
+from ..util import (
+    enforce_types,
+)
+from ..config import (
+    TIMEOUT,
+    SAVE_SINGLEFILE,
+    SINGLEFILE_BINARY,
+    SINGLEFILE_VERSION,
+    CHROME_BINARY,
+)
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_singlefile(link: Link, out_dir: Optional[str]=None) -> bool:
+    """Return whether the single-file extractor should run for `link`.
+
+    Returns False when the output directory (`out_dir`, or `link.link_dir`
+    when `out_dir` is not given) does not exist; otherwise returns the
+    SAVE_SINGLEFILE config flag.
+    """
+    out_dir = out_dir or link.link_dir
+    # NOTE(review): this only checks that the directory exists, not whether
+    # single-file.html was already written there -- confirm that re-running
+    # on an already-archived link is intended.
+    if not os.path.exists(out_dir):
+        return False
+
+    return SAVE_SINGLEFILE
+
+
+@enforce_types
+def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """download full site using single-file
+
+    Runs SINGLEFILE_BINARY on `link.url` and asks it to write the page to
+    `single-file.html` inside `out_dir` (defaults to `link.link_dir`).
+
+    Errors are not raised to the caller: any exception is caught and
+    reported through the returned ArchiveResult, whose `status` is then
+    'failed' and whose `output` is the exception. On success `output` is
+    the absolute path of the saved file.
+    """
+
+    out_dir = out_dir or link.link_dir
+    output = str(Path(out_dir).absolute() / "single-file.html")
+
+    # invoked as: <binary> [--browser-executable-path=...] <url> <output file>
+    cmd = [SINGLEFILE_BINARY]
+    if CHROME_BINARY:
+        # Only pin the browser when one is configured; formatting an unset
+        # value would pass the literal string "None" as the executable path.
+        cmd.append('--browser-executable-path={}'.format(CHROME_BINARY))
+    cmd += [link.url, output]
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix=' ')
+    try:
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+
+        # Keep the last few non-empty lines of stdout+stderr as hints for the
+        # error report. Undecodable bytes are replaced so that odd tool
+        # output cannot turn a successful run into a failure.
+        combined_output = (result.stdout + result.stderr).decode(errors='replace')
+        output_tail = [
+            line.strip()
+            for line in combined_output.rsplit('\n', 3)[-3:]
+            if line.strip()
+        ]
+        hints = (
+            'Got single-file response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Any non-zero code is a failure; negative codes mean the process
+        # was terminated by a signal (assumes `run` exposes subprocess-style
+        # return codes -- TODO confirm).
+        if result.returncode != 0:
+            raise ArchiveError('SingleFile was not able to archive the page', hints)
+        chmod_file(output)
+    except Exception as err:
+        # Report the failure through the result object instead of raising.
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=out_dir,
+        cmd_version=SINGLEFILE_VERSION,
+        output=output,
+        status=status,
+        **timer.stats,
+    )