diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index 80ad90b1..f29d59c3 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -46,11 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+    options = [
         *SINGLEFILE_ARGS,
         '--browser-executable-path={}'.format(CHROME_BINARY),
         browser_args,
+    ]
+
+    # Deduplicate options (single-file doesn't like it when the same option is passed twice).
+    #
+    # NOTE: Option names that come first clobber conflicting names that come later.
+    # SINGLEFILE_ARGS is the setting that targets the singlefile command most specifically,
+    # so the user sets it with a lot of intent and it should take precedence over the defaults,
+    # kind of like the ergonomic principle of lexical scope in programming languages.
+    seen_option_names = set()
+    def test_seen(argument):
+        """Return True only the first time an option name (the text before '=') is seen."""
+        option_name = argument.split("=", 1)[0]
+        if option_name in seen_option_names:
+            return False
+        seen_option_names.add(option_name)
+        return True
+    deduped_options = list(filter(test_seen, options))
+
+    cmd = [
+        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        *deduped_options,
         link.url,
         output,
     ]