save all imports to sources dir

Nick Sweeting 2019-03-27 20:48:41 -04:00
parent cc3d1e9cc9
commit 4c8e45b8d7
3 changed files with 36 additions and 33 deletions

View file

@@ -43,8 +43,8 @@ from .config import (
 )
 from .util import (
     enforce_types,
-    save_remote_source,
-    save_stdin_source,
+    handle_stdin_import,
+    handle_file_import,
 )
 from .logs import (
     log_archiving_started,
@@ -160,12 +160,12 @@ def main(args=None) -> None:
         print_help()
         raise SystemExit(1)

-    import_path = save_stdin_source(stdin_raw_text)
+    import_path = handle_stdin_import(stdin_raw_text)

-    ### Handle ingesting urls from a remote file/feed
+    ### Handle ingesting url from a remote file/feed
     # (e.g. if an RSS feed URL is used as the import path)
-    if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
-        import_path = save_remote_source(import_path)
+    if import_path:
+        import_path = handle_file_import(import_path)

     ### Run the main archive update process
     update_archive_data(import_path=import_path, resume=resume)
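
Note: the URL-scheme check that used to live in main() moves into handle_file_import(), so main() just hands over any non-empty import_path. A minimal, self-contained sketch of that dispatch (the is_remote name is made up for illustration; only the startswith() check itself comes from the diff):

    def is_remote(path: str) -> bool:
        # same scheme check the diff moves out of main()
        return any(path.startswith(s) for s in ('http://', 'https://', 'ftp://'))

    print(is_remote('https://example.com/feed.rss'))      # True  -> downloaded into the sources dir
    print(is_remote('/home/user/bookmarks_export.html'))  # False -> local file copied into the sources dir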

View file

@@ -90,7 +90,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
         if is_new:
             os.makedirs(link_dir)

-        link = load_json_link_index(link, link_dir)
+        link = load_json_link_index(link, link_dir=link_dir)
         log_link_archiving_started(link, link_dir, is_new)
         link = link.overwrite(updated=datetime.now())
         stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
@@ -103,7 +103,7 @@ def archive_link(link: Link, link_dir: Optional[str]=None) -> Link:
             if should_run(link, link_dir):
                 log_archive_method_started(method_name)

-                result = method_function(link, link_dir)
+                result = method_function(link=link, link_dir=link_dir)

                 link.history[method_name].append(result)

View file

@@ -187,7 +187,7 @@ def check_url_parsing_invariants() -> None:
 ### Random Helpers

 @enforce_types
-def save_stdin_source(raw_text: str) -> str:
+def handle_stdin_import(raw_text: str) -> str:
     if not os.path.exists(SOURCES_DIR):
         os.makedirs(SOURCES_DIR)
@@ -195,14 +195,12 @@ def save_stdin_source(raw_text: str) -> str:
     source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format('stdin', ts))

-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(raw_text)
+    atomic_write(raw_text, source_path)

     return source_path


 @enforce_types
-def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
+def handle_file_import(path: str, timeout: int=TIMEOUT) -> str:
     """download a given url's content into output/sources/domain-<timestamp>.txt"""

     if not os.path.exists(SOURCES_DIR):
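
The stdin handler now writes through an atomic_write() helper instead of a bare open()/write() pair. That helper isn't part of this diff; a common way to get the same guarantee (an assumption, not necessarily ArchiveBox's implementation) is to write a temp file next to the destination and rename it into place:

    import os
    import tempfile

    def atomic_write_sketch(contents: str, path: str) -> None:
        # Hedged sketch: write to a temp file in the same directory, then
        # os.replace() it over the destination so a crash mid-write never
        # leaves a half-written sources file behind.
        directory = os.path.dirname(os.path.abspath(path))
        fd, tmp_path = tempfile.mkstemp(dir=directory, suffix='.tmp')
        try:
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                f.write(contents)
            os.replace(tmp_path, path)  # atomic when both paths are on the same filesystem
        except BaseException:
            os.unlink(tmp_path)
            raise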
@@ -210,30 +208,35 @@ def save_remote_source(url: str, timeout: int=TIMEOUT) -> str:
     ts = str(datetime.now().timestamp()).split('.', 1)[0]

-    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(url), ts))
+    source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(basename(path), ts))

-    print('{}[*] [{}] Downloading {}{}'.format(
-        ANSI['green'],
-        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
-        url,
-        ANSI['reset'],
-    ))
-    timer = TimedProgress(timeout, prefix=' ')
-    try:
-        downloaded_xml = download_url(url, timeout=timeout)
-        timer.end()
-    except Exception as e:
-        timer.end()
-        print('{}[!] Failed to download {}{}\n'.format(
-            ANSI['red'],
-            url,
-            ANSI['reset'],
-        ))
-        print(' ', e)
-        raise SystemExit(1)
+    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
+        source_path = os.path.join(SOURCES_DIR, '{}-{}.txt'.format(domain(path), ts))
+        print('{}[*] [{}] Downloading {}{}'.format(
+            ANSI['green'],
+            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            path,
+            ANSI['reset'],
+        ))
+        timer = TimedProgress(timeout, prefix=' ')
+        try:
+            raw_source_text = download_url(path, timeout=timeout)
+            timer.end()
+        except Exception as e:
+            timer.end()
+            print('{}[!] Failed to download {}{}\n'.format(
+                ANSI['red'],
+                path,
+                ANSI['reset'],
+            ))
+            print(' ', e)
+            raise SystemExit(1)
+    else:
+        with open(path, 'r') as f:
+            raw_source_text = f.read()

-    with open(source_path, 'w', encoding='utf-8') as f:
-        f.write(downloaded_xml)
+    atomic_write(raw_source_text, source_path)

     print(' > {}'.format(pretty_path(source_path)))
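
Taken together, every import (piped text, a local export file, or a remote URL/feed) now ends up as a timestamped copy under the sources dir before parsing starts. An illustrative sketch of the resulting filename scheme (the timestamp value is made up; domain() here is a stand-in for the util helper of the same name used in the diff):

    from os.path import basename
    from urllib.parse import urlparse

    def domain(url: str) -> str:
        # stand-in for the util.domain() helper referenced in the diff
        return urlparse(url).netloc

    ts = '1553731721'  # str(datetime.now().timestamp()).split('.', 1)[0]
    print('{}-{}.txt'.format('stdin', ts))                                # stdin-1553731721.txt
    print('{}-{}.txt'.format(basename('/home/user/bookmarks.html'), ts))  # bookmarks.html-1553731721.txt
    print('{}-{}.txt'.format(domain('https://example.com/rss.xml'), ts))  # example.com-1553731721.txt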