From de8e22efb78cf9d148f0497d903948fb29111eef Mon Sep 17 00:00:00 2001
From: papersnake
Date: Tue, 8 Feb 2022 23:17:52 +0800
Subject: [PATCH 1/2] improve title extractor

---
 archivebox/extractors/__init__.py    |  6 +++---
 archivebox/extractors/readability.py | 22 +---------------------
 archivebox/extractors/title.py       | 23 ++++++++++++++++++++++-
 3 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index be5832e7..dc1b9692 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -42,7 +42,6 @@ from .headers import should_save_headers, save_headers
 
 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),
@@ -50,7 +49,8 @@ def get_default_archive_methods():
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
+        ('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as they depend on them
+        ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
@@ -182,7 +182,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
         except KeyboardInterrupt:
             log_archiving_paused(num_links, idx, link.timestamp)
             raise SystemExit(0)
-        except BaseException:    # lgtm [py/catch-base-exception]
+        except BaseException:
             print()
             raise
 
diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index 7e5ed592..43fccbb6 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -22,28 +22,8 @@ from ..config import (
     READABILITY_VERSION,
 )
 from ..logging_util import TimedProgress
+from .title import get_html
 
-@enforce_types
-def get_html(link: Link, path: Path) -> str:
-    """
-    Try to find wget, singlefile and then dom files.
-    If none is found, download the url again.
-    """
-    canonical = link.canonical_outputs()
-    abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
-    document = None
-    for source in sources:
-        try:
-            with open(abs_path / source, "r", encoding="utf-8") as f:
-                document = f.read()
-                break
-        except (FileNotFoundError, TypeError):
-            continue
-    if document is None:
-        return download_url(link.url)
-    else:
-        return document
 
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 272eebc8..19a78591 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -58,6 +58,27 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False
 
+@enforce_types
+def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+    """
+    Try to find wget, singlefile and then dom files.
+    If none is found, download the url again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+    document = None
+    for source in sources:
+        try:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                document = f.read()
+                break
+        except (FileNotFoundError, TypeError):
+            continue
+    if document is None:
+        return download_url(link.url, timeout=timeout)
+    else:
+        return document
 
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix=' ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = get_html(link, out_dir, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()

From 011bd104cb0b411b156b62ff686ca2e14c48afb6 Mon Sep 17 00:00:00 2001
From: prnake
Date: Wed, 9 Feb 2022 10:48:51 +0800
Subject: [PATCH 2/2] remove unused import

---
 archivebox/extractors/readability.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py
index 43fccbb6..a1689f95 100644
--- a/archivebox/extractors/readability.py
+++ b/archivebox/extractors/readability.py
@@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
 from ..util import (
     enforce_types,
-    download_url,
     is_static_file,
-
 )
 from ..config import (
     TIMEOUT,