From 96b1e4a8ec1eb64c979c185b912ef6d60b25074f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 13 Jul 2020 11:22:58 -0400 Subject: [PATCH] accept local paths as valid link URLs when parsing --- archivebox/parsers/generic_txt.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/archivebox/parsers/generic_txt.py b/archivebox/parsers/generic_txt.py index cc3653a0..61d1973f 100644 --- a/archivebox/parsers/generic_txt.py +++ b/archivebox/parsers/generic_txt.py @@ -5,6 +5,7 @@ import re from typing import IO, Iterable from datetime import datetime +from pathlib import Path from ..index.schema import Link from ..util import ( @@ -13,14 +14,28 @@ from ..util import ( URL_REGEX ) + @enforce_types def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]: """Parse raw links from each line in a text file""" text_file.seek(0) for line in text_file.readlines(): - urls = re.findall(URL_REGEX, line) if line.strip() else () - for url in urls: # type: ignore + if not line.strip(): + continue + + # if the line is a local file path that resolves, then we can archive it + if Path(line).exists(): + yield Link( + url=line, + timestamp=str(datetime.now().timestamp()), + title=None, + tags=None, + sources=[text_file.name], + ) + + # otherwise look for anything that looks like a URL in the line + for url in re.findall(URL_REGEX, line): yield Link( url=htmldecode(url), timestamp=str(datetime.now().timestamp()),