From f6cf35a45d41f911e02d275398ef8b6a9efa51a5 Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Wed, 4 Aug 2021 09:26:51 -0400 Subject: [PATCH] Fix Pinboard RSS parsing valid links as `None` `item.find(p)` returns either an `ElementTree.Element` or `None`. The [lambda on line 24][lambda] coerces the return value to a bool, which is `False` if the `<link>` element has no children (see [`ElementTree.py` line 207][etbooldef]), so the lambda returns `None`. Further, returning a `Link` with `url=None` violates [an assertion in `index/schema.py`][assertion], which crashes the `archivebox add` command. [lambda]: https://github.com/ArchiveBox/ArchiveBox/blob/3d54b1321bf8c56627aaa50efcc809cd99caee52/archivebox/parsers/pinboard_rss.py#L24 [etbooldef]: https://github.com/python/cpython/blob/3d8993a744813c5144851da5347d7b4b1885f234/Lib/xml/etree/ElementTree.py#L207 [assertion]: https://github.com/ArchiveBox/ArchiveBox/blob/3d54b1321bf8c56627aaa50efcc809cd99caee52/archivebox/index/schema.py#L165 --- archivebox/parsers/pinboard_rss.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/archivebox/parsers/pinboard_rss.py b/archivebox/parsers/pinboard_rss.py index b7a77a00..d12b219c 100644 --- a/archivebox/parsers/pinboard_rss.py +++ b/archivebox/parsers/pinboard_rss.py @@ -21,13 +21,18 @@ def parse_pinboard_rss_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]: root = ElementTree.parse(rss_file).getroot() items = root.findall("{http://purl.org/rss/1.0/}item") for item in items: - find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore + find = lambda p: item.find(p).text.strip() if item.find(p) is not None else None # type: ignore url = find("{http://purl.org/rss/1.0/}link") tags = find("{http://purl.org/dc/elements/1.1/}subject") title = find("{http://purl.org/rss/1.0/}title") ts_str = find("{http://purl.org/dc/elements/1.1/}date") + if url is None: + # Yielding a Link with no URL will + # crash on a URL validation assertion + 
continue + # Pinboard includes a colon in its date stamp timezone offsets, which # Python can't parse. Remove it: if ts_str and ts_str[-3:-2] == ":":