fix RSS parser bailing out when lines have whitespace before tags

Nick Sweeting 2019-02-19 02:31:53 -05:00
parent 3571ef24e4
commit eff0100971
2 changed files with 2 additions and 3 deletions


@@ -59,7 +59,6 @@ def load_links(archive_path=OUTPUT_DIR, import_path=None):
existing_links = []
if archive_path:
existing_links = parse_json_links_index(archive_path)
- existing_links = validate_links(existing_links)
new_links = []
if import_path:
@@ -178,6 +177,7 @@ if __name__ == '__main__':
elif stdin_raw_text:
source = save_source(stdin_raw_text)
+ # Step 1: Parse the links and dedupe them with existing archive
all_links, new_links = load_links(archive_path=out_dir, import_path=source)
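The hunk above ends at the "Step 1" call, where links parsed from the new import are merged with the ones already in the JSON index. As a rough, hedged sketch of that dedupe-by-URL idea (not the repo's actual load_links()/validate_links() logic; the helper name below is illustrative only):

# Hedged sketch only: illustrates "parse the links and dedupe them with
# existing archive"; the real load_links() in this repo may differ.
def dedupe_links(existing_links, new_links):
    seen_urls = {link['url'] for link in existing_links}
    merged = list(existing_links)
    for link in new_links:
        if link['url'] not in seen_urls:
            merged.append(link)
            seen_urls.add(link['url'])
    return merged

all_links = dedupe_links(
    [{'url': 'https://example.com/a', 'title': 'A'}],
    [{'url': 'https://example.com/a', 'title': 'A (dup)'},
     {'url': 'https://example.com/b', 'title': 'B'}],
)
# all_links keeps one entry per URL: .../a and .../b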


@@ -161,7 +161,7 @@ def parse_rss_export(rss_file):
rows = leading_removed.split('\n')
def get_row(key):
- return [r for r in rows if r.startswith('<{}>'.format(key))][0]
+ return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
url = str_between(get_row('link'), '<link>', '</link>')
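A minimal sketch of the failure this change fixes: pretty-printed RSS feeds indent their tags, so the old startswith() check never matches any row and get_row() raises an IndexError; stripping leading whitespace first makes the lookup work. The sample rows and the get_row_old/get_row_new names below are illustrative only.

# Pretty-printed feed lines, as they come out of splitting the RSS text on '\n'
rows = [
    '  <item>',
    '    <title><![CDATA[Example post]]></title>',
    '    <link>https://example.com/post</link>',
    '  </item>',
]

def get_row_old(key):
    # old check: the leading spaces mean nothing matches -> IndexError
    return [r for r in rows if r.startswith('<{}>'.format(key))][0]

def get_row_new(key):
    # fixed check: ignore leading whitespace before comparing
    return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]

print(get_row_new('title'))  # '    <title><![CDATA[Example post]]></title>'
# get_row_old('title') would raise IndexError on these indented rows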
@@ -209,7 +209,6 @@ def parse_shaarli_rss_export(rss_file):
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
info = {
'url': url,
'domain': domain(url),
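For context, the <published> timestamp in the Shaarli hunk above is pulled out with str_between() and parsed with a %z UTC offset. str_between() is a project helper not shown in this diff; the stand-in below is an assumption, included only to make the example self-contained.

from datetime import datetime

# Assumed stand-in for the project's str_between() helper: return the
# substring between two delimiters.
def str_between(string, start, end=None):
    content = string.split(start, 1)[-1]
    if end is not None:
        content = content.rsplit(end, 1)[0]
    return content

row = '    <published>2019-02-19T02:31:53-05:00</published>'
ts_str = str_between(row.strip(), '<published>', '</published>')
# %z accepts the "-05:00" style offset on Python 3.7+ and yields an aware datetime
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
print(time.isoformat())  # 2019-02-19T02:31:53-05:00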