From 4c8e45b8d70b08a817323b7aefff4859432116e9 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Wed, 27 Mar 2019 20:48:41 -0400
Subject: [PATCH] save all imports to sources dir

---
 archivebox/archive.py         | 12 ++++----
 archivebox/archive_methods.py |  4 +--
 archivebox/util.py            | 53 ++++++++++++++++++-----------------
 3 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/archivebox/archive.py b/archivebox/archive.py
index 18d31023..b0a28428 100755
--- a/archivebox/archive.py
+++ b/archivebox/archive.py
@@ -43,8 +43,8 @@ from .config import (
 )
 from .util import (
     enforce_types,
-    save_remote_source,
-    save_stdin_source,
+    handle_stdin_import,
+    handle_file_import,
 )
 from .logs import (
     log_archiving_started,
@@ -160,12 +160,12 @@ def main(args=None) -> None:
             print_help()
             raise SystemExit(1)
 
-        import_path = save_stdin_source(stdin_raw_text)
+        import_path = handle_stdin_import(stdin_raw_text)
 
-    ### Handle ingesting urls from a remote file/feed
+    ### Handle ingesting urls from a local or remote file/feed
     # (e.g. if an RSS feed URL is used as the import path)
-    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        import_path = save_remote_source(import_path)
+    if import_path:
+        import_path = handle_file_import(import_path)
 
     ### Run the main archive update process
     update_archive_data(import_path=import_path, resume=resume)
diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py
index 7a76df13..fdd941da 100644
--- a/archivebox/archive_methods.py
+++ b/archivebox/archive_methods.py
@@ -90,7 +90,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
         if is_new:
             os.makedirs(link_dir)
 
-        link = load_json_link_index(link, link_dir)
+        link = load_json_link_index(link, link_dir=link_dir)
         log_link_archiving_started(link, link_dir, is_new)
         link = link.overwrite(updated=datetime.now())
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -103,7 +103,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
             if should_run(link, link_dir):
                 log_archive_method_started(method_name)
 
-                result = method_function(link, link_dir)
+                result = method_function(link=link, link_dir=link_dir)
 
                 link.history[method_name].append(result)
diff --git a/archivebox/util.py b/archivebox/util.py
index f3427155..2d8a546a 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -187,7 +187,7 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers
 
 @enforce_types
-def save_stdin_source(raw_text: str) -> str:
+def handle_stdin_import(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
 
@@ -195,14 +195,12 @@
     source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))
 
-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(raw_text)
-
+    atomic_write(raw_text, source_path)
     return source_path
 
 
 @enforce_types
-def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
+def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""
 
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
 
@@ -210,30 +208,35 @@
     ts = str(datetime.now().timestamp()).split('.', 1)[0]
 
-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))
 
-    print('{}[*] [{}] Downloading {}{}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        url,
-        ANSI['reset'],
-    ))
-    timer = TimedProgress(timeout, prefix='    ')
-    try:
-        downloaded_xml = download_url(url, timeout=timeout)
-        timer.end()
-    except Exception as e:
-        timer.end()
-        print('{}[!] Failed to download {}{}\n'.format(
-            ANSI['red'],
-            url,
+    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        print('{}[*] [{}] Downloading {}{}'.format(
+            ANSI['green'],
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            path,
             ANSI['reset'],
         ))
-        print('    ', e)
-        raise SystemExit(1)
+        timer = TimedProgress(timeout, prefix='    ')
+        try:
+            raw_source_text = download_url(path, timeout=timeout)
+            timer.end()
+        except Exception as e:
+            timer.end()
+            print('{}[!] Failed to download {}{}\n'.format(
+                ANSI['red'],
+                path,
+                ANSI['reset'],
+            ))
+            print('    ', e)
+            raise SystemExit(1)
 
-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(downloaded_xml)
+    else:
+        with open(path, 'r', encoding='utf-8') as f:
+            raw_source_text = f.read()
+
+    atomic_write(raw_source_text, source_path)
 
     print('    > {}'.format(pretty_path(source_path)))
 
     return source_path
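
For readers skimming the diff: after this patch, every import, whether piped in on stdin, given as a local file path, or given as a remote URL, is first snapshotted into output/sources/<name>-<timestamp>.txt before parsing. The sketch below is a simplified, self-contained rendering of that flow, not the project's actual code: the SOURCES_DIR constant, the atomic_write helper, and the urlparse/urlopen calls (standing in for ArchiveBox's domain(), download_url(), and TimedProgress) are illustrative assumptions.

    import os
    import tempfile
    from datetime import datetime
    from urllib.parse import urlparse
    from urllib.request import urlopen

    SOURCES_DIR = 'output/sources'  # assumed layout; ArchiveBox derives this from its output dir

    def atomic_write(contents: str, path: str) -> None:
        # stand-in for ArchiveBox's atomic_write: write to a temp file in the
        # same directory, then rename over the destination so readers never
        # observe a partially written source file
        fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path) or '.')
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write(contents)
        os.replace(tmp, path)

    def handle_file_import(path: str, timeout: int = 60) -> str:
        """Save a remote URL's or local file's raw text into SOURCES_DIR, return the saved path."""
        os.makedirs(SOURCES_DIR, exist_ok=True)
        ts = str(datetime.now().timestamp()).split('.', 1)[0]

        if path.startswith(('http://', 'https://', 'ftp://')):
            # remote import: download the feed/page body
            name = urlparse(path).netloc  # naive stand-in for the patch's domain()
            with urlopen(path, timeout=timeout) as response:
                raw_source_text = response.read().decode('utf-8', errors='replace')
        else:
            # local import: copy the file's contents
            name = os.path.basename(path)
            with open(path, 'r', encoding='utf-8') as f:
                raw_source_text = f.read()

        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(name, ts))
        atomic_write(raw_source_text, source_path)
        return source_path

Under those assumptions, handle_file_import('https://example.com/feed.rss') and handle_file_import('bookmarks.html') both leave a stable, timestamped on-disk copy that the parser can re-read later, which is the point of the patch: imports become reproducible artifacts rather than one-shot inputs.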