save all imports to sources dir

Nick Sweeting 2019-03-27 20:48:41 -04:00
parent cc3d1e9cc9
commit 4c8e45b8d7
3 changed files with 36 additions and 33 deletions

View file

@@ -43,8 +43,8 @@ from .config import (
 )
 from .util import (
     enforce_types,
-    save_remote_source,
-    save_stdin_source,
+    handle_stdin_import,
+    handle_file_import,
 )
 from .logs import (
     log_archiving_started,
@@ -160,12 +160,12 @@ def main(args=None) -> None:
         print_help()
         raise SystemExit(1)

-    import_path = save_stdin_source(stdin_raw_text)
+    import_path = handle_stdin_import(stdin_raw_text)

-    ### Handle ingesting urls from a remote file/feed
+    ### Handle ingesting url from a remote file/feed
     # (e.g. if an RSS feed URL is used as the import path)
-    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        import_path = save_remote_source(import_path)
+    if import_path:
+        import_path = handle_file_import(import_path)

     ### Run the main archive update process
     update_archive_data(import_path=import_path, resume=resume)
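
Note: the URL-scheme check that used to live in main() moves into handle_file_import(), so main() just hands over any non-empty import_path. A minimal, self-contained sketch of that dispatch (the is_remote name is made up for illustration; only the startswith() check itself comes from the diff):

    def is_remote(path: str) -> bool:
        # same scheme check the diff moves out of main()
        return any(path.startswith(s) for s in ('http://', 'https://', 'ftp://'))

    print(is_remote('https://example.com/feed.rss'))      # True  -> downloaded into the sources dir
    print(is_remote('/home/user/bookmarks_export.html'))  # False -> local file copied into the sources dir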

View file

@@ -90,7 +90,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
         if is_new:
             os.makedirs(link_dir)

-        link = load_json_link_index(link, link_dir)
+        link = load_json_link_index(link, link_dir=link_dir)
         log_link_archiving_started(link, link_dir, is_new)
         link = link.overwrite(updated=datetime.now())
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -103,7 +103,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
             if should_run(link, link_dir):
                 log_archive_method_started(method_name)

-                result = method_function(link, link_dir)
+                result = method_function(link=link, link_dir=link_dir)

                 link.history[method_name].append(result)

View file

@@ -187,7 +187,7 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers

 @enforce_types
-def save_stdin_source(raw_text: str) -> str:
+def handle_stdin_import(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -195,14 +195,12 @@ def save_stdin_source(raw_text: str) -> str:
     source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(raw_text)
+    atomic_write(raw_text, source_path)

     return source_path


 @enforce_types
-def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
+def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""

     if not os.path.exists(SOURCES_DIR):
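
The stdin handler now writes through an atomic_write() helper instead of a bare open()/write() pair. That helper isn't part of this diff; a common way to get the same guarantee (an assumption, not necessarily ArchiveBox's implementation) is to write a temp file next to the destination and rename it into place:

    import os
    import tempfile

    def atomic_write_sketch(contents: str, path: str) -> None:
        # Hedged sketch: write to a temp file in the same directory, then
        # os.replace() it over the destination so a crash mid-write never
        # leaves a half-written sources file behind.
        directory = os.path.dirname(os.path.abspath(path))
        fd, tmp_path = tempfile.mkstemp(dir=directory, suffix='.tmp')
        try:
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(contents)
            os.replace(tmp_path, path)  # atomic when both paths are on the same filesystem
        except BaseException:
            os.unlink(tmp_path)
            raise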
@@ -210,30 +208,35 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]

-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))

-    print('{}[*] [{}] Downloading {}{}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        url,
-        ANSI['reset'],
-    ))
-    timer = TimedProgress(timeout, prefix=' ')
-    try:
-        downloaded_xml = download_url(url, timeout=timeout)
-        timer.end()
-    except Exception as e:
-        timer.end()
-        print('{}[!] Failed to download {}{}\n'.format(
-            ANSI['red'],
-            url,
-            ANSI['reset'],
-        ))
-        print(' ', e)
-        raise SystemExit(1)
+    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        print('{}[*] [{}] Downloading {}{}'.format(
+            ANSI['green'],
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            path,
+            ANSI['reset'],
+        ))
+        timer = TimedProgress(timeout, prefix=' ')
+        try:
+            raw_source_text = download_url(path, timeout=timeout)
+            timer.end()
+        except Exception as e:
+            timer.end()
+            print('{}[!] Failed to download {}{}\n'.format(
+                ANSI['red'],
+                path,
+                ANSI['reset'],
+            ))
+            print(' ', e)
+            raise SystemExit(1)
+    else:
+        with open(path, 'r') as f:
+            raw_source_text = f.read()

-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(downloaded_xml)
+    atomic_write(raw_source_text, source_path)

     print(' > {}'.format(pretty_path(source_path)))
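
Taken together, every import (piped text, a local export file, or a remote URL/feed) now ends up as a timestamped copy under the sources dir before parsing starts. An illustrative sketch of the resulting filename scheme (the timestamp value is made up; domain() here is a stand-in for the util helper of the same name used in the diff):

    from os.path import basename
    from urllib.parse import urlparse

    def domain(url: str) -> str:
        # stand-in for the util.domain() helper referenced in the diff
        return urlparse(url).netloc

    ts = '1553731721'  # str(datetime.now().timestamp()).split('.', 1)[0]
    print('{}-{}.txt'.format('stdin', ts))                                # stdin-1553731721.txt
    print('{}-{}.txt'.format(basename('/home/user/bookmarks.html'), ts))  # bookmarks.html-1553731721.txt
    print('{}-{}.txt'.format(domain('https://example.com/rss.xml'), ts))  # example.com-1553731721.txt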