From c90f4bfd5baa9a866b8877dc2dec349c7bf5289b Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sun, 10 Jun 2018 21:26:11 -0400
Subject: [PATCH] cleanup ARCHIVE_DIR paths

---
 archiver/archive_methods.py | 3 ++-
 archiver/config.py          | 1 +
 archiver/util.py            | 9 +++++----
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/archiver/archive_methods.py b/archiver/archive_methods.py
index 4bb0e9e0..179613fd 100644
--- a/archiver/archive_methods.py
+++ b/archiver/archive_methods.py
@@ -27,6 +27,7 @@ from config import (
     CHROME_USER_DATA_DIR,
     TIMEOUT,
     ANSI,
+    ARCHIVE_DIR,
 )
 from util import (
     check_dependencies,
@@ -50,7 +51,7 @@ def archive_links(archive_path, links, source=None, resume=None):
 
     try:
         for idx, link in enumerate(to_archive):
-            link_dir = os.path.join(archive_path, 'archive', link['timestamp'])
+            link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
             archive_link(link_dir, link)
 
     except (KeyboardInterrupt, SystemExit, Exception) as e:
diff --git a/archiver/config.py b/archiver/config.py
index bde38975..0a49e2b3 100644
--- a/archiver/config.py
+++ b/archiver/config.py
@@ -36,6 +36,7 @@ FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted
 
 REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
 OUTPUT_DIR = os.path.join(REPO_DIR, 'output')
+ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')
 SOURCES_DIR = os.path.join(OUTPUT_DIR, 'sources')
 
 PYTHON_PATH = os.path.join(REPO_DIR, 'archiver')
diff --git a/archiver/util.py b/archiver/util.py
index 7622a26a..d4f67ae7 100644
--- a/archiver/util.py
+++ b/archiver/util.py
@@ -16,6 +16,7 @@ from config import (
     REPO_DIR,
     SOURCES_DIR,
     OUTPUT_DIR,
+    ARCHIVE_DIR,
     TIMEOUT,
     TERM_WIDTH,
     SHOW_PROGRESS,
@@ -262,7 +263,7 @@ def find_link(folder, links):
     timestamp = folder.split('.')[0]
     for link in links:
         if link['timestamp'].startswith(timestamp):
-            if link['domain'] in os.listdir(os.path.join(OUTPUT_DIR, 'archive', folder)):
+            if link['domain'] in os.listdir(os.path.join(ARCHIVE_DIR, folder)):
                 return link  # careful now, this isn't safe for most ppl
             if link['domain'] in parse_url(folder):
                 return link
@@ -271,7 +272,7 @@ def parse_url(folder):
     """for a given archive folder, figure out what url it's for"""
 
-    link_json = os.path.join(OUTPUT_DIR, 'archive', folder, 'index.json')
+    link_json = os.path.join(ARCHIVE_DIR, folder, 'index.json')
     if os.path.exists(link_json):
         with open(link_json, 'r') as f:
             try:
@@ -282,7 +283,7 @@
         except ValueError:
             print('File contains invalid JSON: {}!'.format(link_json))
 
-    archive_org_txt = os.path.join(OUTPUT_DIR, 'archive', folder, 'archive.org.txt')
+    archive_org_txt = os.path.join(ARCHIVE_DIR, folder, 'archive.org.txt')
     if os.path.exists(archive_org_txt):
        with open(archive_org_txt, 'r') as f:
            original_link = f.read().strip().split('/http', 1)[-1]
@@ -417,7 +418,7 @@ def wget_output_path(link, look_in=None):
     # instead of trying to emulate it here, we just look in the output folder
     # to see what html file wget actually created as the output
     wget_folder = link['base_url'].rsplit('/', 1)[0].split('/')
-    look_in = os.path.join(OUTPUT_DIR, 'archive', link['timestamp'], *wget_folder)
+    look_in = os.path.join(ARCHIVE_DIR, link['timestamp'], *wget_folder)
 
     if look_in and os.path.exists(look_in):
         html_files = [
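
As a quick sketch of the layout this patch settles on (assuming only the constants shown
above in archiver/config.py), every per-link folder now hangs off the single ARCHIVE_DIR
constant instead of being rebuilt from OUTPUT_DIR + 'archive' at each call site. The helper
name link_archive_dir below is hypothetical and only illustrates the pattern; the patched
code joins ARCHIVE_DIR inline:

    import os

    # mirrors archiver/config.py after this patch
    REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
    OUTPUT_DIR = os.path.join(REPO_DIR, 'output')       # <repo>/output
    ARCHIVE_DIR = os.path.join(OUTPUT_DIR, 'archive')   # <repo>/output/archive

    def link_archive_dir(link):
        """Hypothetical helper: a link's folder is ARCHIVE_DIR + its timestamp."""
        return os.path.join(ARCHIVE_DIR, link['timestamp'])

    # e.g. link_archive_dir({'timestamp': '1528680000'})
    #   -> '<repo>/output/archive/1528680000'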