From 40659b5e9d345309515873f61e07c213f6b21ac8 Mon Sep 17 00:00:00 2001
From: notevenaperson <66701832+notevenaperson@users.noreply.github.com>
Date: Sun, 11 Sep 2022 17:23:15 +0000
Subject: [PATCH] singlefile.py: Code to ensure options are deduplicated

---
Review note: added the missing trailing comma after `*deduped_options` in the
`cmd` list (without it the patched list literal is a Python syntax error) and
fixed a typo in the explanatory comment. The post-image blob id on the `index`
line below predates these two edits.

 archivebox/extractors/singlefile.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index 80ad90b1..f29d59c3 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -46,11 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
 
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+    options = [
         *SINGLEFILE_ARGS,
         '--browser-executable-path={}'.format(CHROME_BINARY),
         browser_args,
+    ]
+
+    # Deduplicate options (single-file doesn't like when you use the same option two times)
+    #
+    # NOTE: Options names that come first clobber conflicting names that come later
+    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
+    # specificity, therefore the user sets it with a lot of intent, therefore it should take precedence
+    # kind of like the ergonomic principle of lexical scope in programming languages.
+    seen_option_names = []
+    def test_seen(argument):
+        option_name = argument.split("=")[0]
+        if option_name in seen_option_names:
+            return False
+        else:
+            seen_option_names.append(option_name)
+            return True
+    deduped_options = list(filter(test_seen, options))
+
+    cmd = [
+        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        *deduped_options,
         link.url,
         output,
     ]