working argparse-based CLI with most commands implemented

Nick Sweeting 2019-04-03 00:27:37 -04:00
parent 68b4c01c6b
commit 51ae634ec9
20 changed files with 807 additions and 424 deletions


@ -0,0 +1,4 @@
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'


@ -1,19 +1,15 @@
#!/usr/bin/env python3
"""
Main ArchiveBox command line application entrypoint.
"""
__package__ = 'archivebox'
import os
import sys
PYTHON_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(PYTHON_DIR)
from .env import *
from .legacy.archive import main
from .cli.archivebox import main
if __name__ == '__main__':
    main()


@ -0,0 +1,27 @@
__package__ = 'archivebox.cli'
import os
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
required_attrs = ('__package__', '__command__', '__description__', 'main')
def list_subcommands():
COMMANDS = {}
for filename in os.listdir(CLI_DIR):
if filename.startswith('archivebox_') and filename.endswith('.py'):
subcommand = filename.replace('archivebox_', '').replace('.py', '')
module = import_module('.archivebox_{}'.format(subcommand), __package__)
assert all(hasattr(module, attr) for attr in required_attrs)
assert module.__command__.split(' ')[-1] == subcommand
COMMANDS[subcommand] = module.__description__
return COMMANDS
def run_subcommand(subcommand: str, args=None):
module = import_module('.archivebox_{}'.format(subcommand), __package__)
return module.main(args) # type: ignore
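
Note: the two helpers above define the whole plugin contract for subcommands: any cli/archivebox_<name>.py module that exposes __package__, __command__, __description__, and main() is picked up automatically. A minimal sketch of a conforming module (the archivebox_remove name and behavior are hypothetical, not part of this commit):

#!/usr/bin/env python3
# hypothetical archivebox/cli/archivebox_remove.py -- illustrates the module contract
# checked by list_subcommands(); illustrative only, not part of this commit
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
__description__ = 'Remove a URL from the archive (illustrative only)'

import sys
import argparse


def main(args=None):
    args = sys.argv[1:] if args is None else args
    parser = argparse.ArgumentParser(prog=__command__, description=__description__, add_help=True)
    parser.add_argument('url', help='URL to remove')
    command = parser.parse_args(args)
    print('would remove:', command.url)


if __name__ == '__main__':
    main()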

archivebox/cli/archivebox.py (Executable file, 71 lines added)

@ -0,0 +1,71 @@
#!/usr/bin/env python3
# archivebox [command]
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
__description__ = 'ArchiveBox: The self-hosted internet archive.'
import sys
import argparse
from . import list_subcommands, run_subcommand
def parse_args(args=None):
args = sys.argv[1:] if args is None else args
subcommands = list_subcommands()
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=False,
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--help', '-h',
action='store_true',
help=subcommands['help'],
)
group.add_argument(
'--version',
action='store_true',
help=subcommands['version'],
)
group.add_argument(
"subcommand",
type=str,
        help="The name of the subcommand to run",
nargs='?',
choices=subcommands.keys(),
default=None,
)
parser.add_argument(
"args",
help="Arguments for the subcommand",
nargs=argparse.REMAINDER,
)
command = parser.parse_args(args)
if command.help:
command.subcommand = 'help'
if command.version:
command.subcommand = 'version'
# print('--------------------------------------------')
# print('Command: ', sys.argv[0])
# print('Subcommand: ', command.subcommand)
# print('Args to pass:', args[1:])
# print('--------------------------------------------')
return command.subcommand, command.args
def main(args=None):
subcommand, subcommand_args = parse_args(args)
run_subcommand(subcommand, subcommand_args)
if __name__ == '__main__':
main()
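
For reference, a standalone sketch of the dispatch pattern used above: the first positional names the subcommand and argparse.REMAINDER forwards everything after it untouched, so flags like --only-new reach the subcommand's own parser. The hard-coded choices list here stands in for list_subcommands(); illustrative only, not part of the commit:

# Standalone sketch mirroring the parser built in parse_args() above.
import argparse

parser = argparse.ArgumentParser(prog='archivebox', add_help=False)
group = parser.add_mutually_exclusive_group()
group.add_argument('--help', '-h', action='store_true')
group.add_argument('--version', action='store_true')
group.add_argument('subcommand', nargs='?', default=None,
                   choices=['help', 'version', 'init', 'add', 'update', 'list'])
parser.add_argument('args', nargs=argparse.REMAINDER)

ns = parser.parse_args(['add', '--only-new', 'https://example.com'])
print(ns.subcommand, ns.args)     # -> add ['--only-new', 'https://example.com']

ns = parser.parse_args(['--version'])
print(ns.version, ns.subcommand)  # -> True None (the real parse_args() then rewrites this to 'version')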


@ -0,0 +1,84 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
__description__ = 'Add a new URL or list of URLs to your archive'
import os
import sys
import argparse
from ..legacy.util import (
handle_stdin_import,
handle_file_import,
)
from ..legacy.main import update_archive_data
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
# parser.add_argument(
# '--depth', #'-d',
# type=int,
# help='Recursively archive all linked pages up to this many hops away',
# default=0,
# )
parser.add_argument(
'--only-new', #'-n',
action='store_true',
help="Don't attempt to retry previously skipped/failed links when updating",
)
parser.add_argument(
'--mirror', #'-m',
action='store_true',
help='Archive an entire site (finding all linked pages below it on the same domain)',
)
parser.add_argument(
'--crawler', #'-r',
choices=('depth_first', 'breadth_first'),
help='Controls which crawler to use in order to find outlinks in a given page',
default=None,
)
parser.add_argument(
'url',
nargs='?',
type=str,
default=None,
help='URL of page to archive (or path to local file)'
)
command = parser.parse_args(args)
### Handle ingesting urls piped in through stdin
    # (e.g. if the user does: cat example_urls.txt | ./archive)
import_path = None
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
if stdin_raw_text and command.url:
print(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
raise SystemExit(1)
import_path = handle_stdin_import(stdin_raw_text)
### Handle ingesting url from a remote file/feed
# (e.g. if an RSS feed URL is used as the import path)
elif command.url:
import_path = handle_file_import(command.url)
update_archive_data(
import_path=import_path,
resume=None,
only_new=command.only_new,
)
if __name__ == '__main__':
main()
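
The stdin handling above means archivebox add accepts exactly one input source per run: either a URL/path argument or a list of links piped on stdin, never both. A rough sketch of that decision, with the legacy.util import helpers stubbed out (illustrative only, not part of the commit):

# Standalone sketch of the ingestion rule enforced above (handle_stdin_import /
# handle_file_import live in legacy.util and are not redefined here).
def choose_import_source(url_arg, stdin_text):
    """Mirror the either/or check in archivebox_add.main()."""
    if stdin_text and url_arg:
        raise SystemExit('[X] Pass either a URL argument or a list of links via stdin, not both.')
    if stdin_text:
        return ('stdin', stdin_text)      # real code: handle_stdin_import(stdin_text)
    if url_arg:
        return ('file_or_url', url_arg)   # real code: handle_file_import(url_arg)
    return (None, None)                   # nothing new; update_archive_data() just re-checks the index

print(choose_import_source('https://example.com/page', ''))
print(choose_import_source(None, 'https://example.com/a\nhttps://example.com/b\n'))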


@ -0,0 +1,54 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox help'
__description__ = 'Print the ArchiveBox help message and usage'
import sys
import argparse
from ..legacy.util import reject_stdin
from . import list_subcommands
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
COMMANDS_HELP_TEXT = '\n '.join(
f'{cmd.ljust(20)} {summary}'
for cmd, summary in list_subcommands().items()
)
print(f'''ArchiveBox: The self-hosted internet archive.
Usage:
archivebox [command] [--help] [--version] [...args]
Commands:
{COMMANDS_HELP_TEXT}
Example Use:
mkdir my-archive; cd my-archive/
archivebox init
echo 'https://example.com/some/page' | archivebox add
archivebox add https://example.com/some/other/page
archivebox add --depth=1 ~/Downloads/bookmarks_export.html
archivebox add --depth=1 https://example.com/feed.rss
archivebox update --resume=15109948213.123
Documentation:
https://github.com/pirate/ArchiveBox/wiki
''')
if __name__ == '__main__':
main()


@ -0,0 +1,72 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox init'
__description__ = 'Initialize a new ArchiveBox collection in the current directory'
import os
import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.config import (
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
ANSI,
)
def init(output_dir: str=OUTPUT_DIR):
if not os.path.exists(output_dir):
os.makedirs(output_dir)
harmless_files = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}
is_empty = not len(set(os.listdir(output_dir)) - harmless_files)
existing_index = os.path.exists(os.path.join(output_dir, 'index.json'))
if not is_empty:
if existing_index:
print('You already have an archive in this folder!')
# TODO: import old archivebox version's archive data folder
raise SystemExit(1)
else:
print(
                ("{red}[X] This folder ({}) already has files in it. You must run init inside a completely empty directory.{reset}"
"\n\n"
" {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
" just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
).format(output_dir, **ANSI)
)
raise SystemExit(1)
print('{green}[+] Initializing new archive directory: {}{reset}'.format(output_dir, **ANSI))
os.makedirs(SOURCES_DIR)
print(f' > {SOURCES_DIR}')
os.makedirs(ARCHIVE_DIR)
print(f' > {ARCHIVE_DIR}')
os.makedirs(DATABASE_DIR)
print(f' > {DATABASE_DIR}')
print('{green}[√] Done.{reset}'.format(**ANSI))
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
init()
if __name__ == '__main__':
main()
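
init() only proceeds when the target directory is effectively empty: a small allowlist of harmless entries is ignored, an existing index.json is reported as an already-initialized archive, and anything else aborts with a hint about importing old data. A standalone sketch of the emptiness test (illustrative only, not part of the commit):

# Standalone sketch of the emptiness check init() relies on.
import os
import tempfile

HARMLESS_FILES = {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'}

def is_empty_enough(output_dir: str) -> bool:
    """True if the folder contains nothing besides the harmless allowlist."""
    return not (set(os.listdir(output_dir)) - HARMLESS_FILES)

with tempfile.TemporaryDirectory() as d:
    print(is_empty_enough(d))                       # True: safe to create a new collection here
    open(os.path.join(d, 'notes.txt'), 'w').close()
    print(is_empty_enough(d))                       # False: init() aborts (or, if index.json exists,
                                                    # reports that an archive is already present)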


@ -0,0 +1,81 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
__description__ = 'List all the URLs currently in the archive.'
import sys
import json
import argparse
from ..legacy.util import reject_stdin, ExtendedEncoder
from ..legacy.main import list_archive_data, csv_format
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--csv', #'-c',
type=str,
help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
default=None,
)
group.add_argument(
'--json', #'-j',
action='store_true',
help="Print the output in JSON format with all columns included.",
)
parser.add_argument(
'--filter', #'-f',
type=str,
help="List only URLs matching the given regex pattern.",
default=None,
)
parser.add_argument(
'--sort', #'-s',
type=str,
help="List the links sorted using the given key, e.g. timestamp or updated",
default=None,
)
parser.add_argument(
'--before', #'-b',
type=float,
help="List only URLs bookmarked before the given timestamp.",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="List only URLs bookmarked after the given timestamp.",
default=None,
)
command = parser.parse_args(args)
reject_stdin(__command__)
links = list_archive_data(
filter_regex=command.filter,
before=command.before,
after=command.after,
)
if command.sort:
links = sorted(links, key=lambda link: getattr(link, command.sort))
if command.csv:
print(command.csv)
print('\n'.join(csv_format(link, command.csv) for link in links))
elif command.json:
print(json.dumps(list(links), indent=4, cls=ExtendedEncoder))
else:
print('\n'.join(link.url for link in links))
if __name__ == '__main__':
main()
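
The --csv path above prints the raw column list as a header and then one row per link, where csv_format() JSON-encodes each requested attribute and joins them with commas. A rough sketch using a stand-in for legacy.schema.Link (the real csv_format() also passes cls=ExtendedEncoder); illustrative only, not part of the commit:

# Rough sketch of the --csv output path above.
import json
from collections import namedtuple

FakeLink = namedtuple('FakeLink', 'timestamp url title')

def csv_format(link, csv_cols: str) -> str:
    # JSON-encode each requested attribute and join with commas, one row per link
    return ','.join(json.dumps(getattr(link, col)) for col in csv_cols.split(','))

links = [
    FakeLink('1554268800.0', 'https://example.com/a', 'Example A'),
    FakeLink('1554268801.0', 'https://example.com/b', None),
]
print('timestamp,url')     # header line: main() prints the raw --csv value first
print('\n'.join(csv_format(link, 'timestamp,url') for link in links))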


@ -0,0 +1,45 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
__description__ = 'Import any new links from subscriptions and retry any previously failed/skipped links.'
import sys
import argparse
from ..legacy.util import reject_stdin
from ..legacy.main import update_archive_data
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.add_argument(
'--only-new', #'-n',
action='store_true',
help="Don't attempt to retry previously skipped/failed links when updating",
)
parser.add_argument(
'--resume', #'-r',
type=float,
help='Resume the update process from a given timestamp',
default=None,
)
command = parser.parse_args(args)
reject_stdin(__command__)
update_archive_data(
import_path=None,
resume=command.resume,
only_new=command.only_new,
)
if __name__ == '__main__':
main()
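
--resume takes a timestamp rather than an index position: update passes it straight to legacy.index.links_after_timestamp(), and since the index is kept sorted newest-first, every link at or below the resume timestamp is yielded, so an interrupted run continues where it stopped. A standalone sketch of that selection (illustrative only, not part of the commit):

# Standalone sketch of the --resume selection done by links_after_timestamp()
# (same comparison, but over plain (timestamp, url) tuples instead of Link objects).
def links_after_timestamp(links, resume=None):
    if not resume:
        yield from links
        return
    for timestamp, url in links:
        if float(timestamp) <= resume:
            yield (timestamp, url)

index = [                          # index order is newest-first
    ('1554268802.0', 'https://example.com/c'),
    ('1554268801.0', 'https://example.com/b'),
    ('1554268800.0', 'https://example.com/a'),
]
print([url for _, url in links_after_timestamp(index, resume=1554268801.0)])
# -> ['https://example.com/b', 'https://example.com/a']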


@ -0,0 +1,103 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox version'
__description__ = 'Print the ArchiveBox version and dependency information'
import sys
import shutil
import argparse
from ..legacy.util import reject_stdin
from ..legacy.config import (
VERSION,
REPO_DIR,
PYTHON_DIR,
LEGACY_DIR,
TEMPLATES_DIR,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
USE_CURL,
USE_WGET,
USE_CHROME,
FETCH_GIT,
FETCH_MEDIA,
DJANGO_BINARY,
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
CHROME_BINARY,
DJANGO_VERSION,
CURL_VERSION,
GIT_VERSION,
WGET_VERSION,
YOUTUBEDL_VERSION,
CHROME_VERSION,
)
def main(args=None):
args = sys.argv[1:] if args is None else args
parser = argparse.ArgumentParser(
prog=__command__,
description=__description__,
add_help=True,
)
parser.parse_args(args)
reject_stdin(__command__)
print('ArchiveBox v{}'.format(VERSION))
print()
print('[i] Folder locations:')
print(' REPO_DIR: ', REPO_DIR)
print(' PYTHON_DIR: ', PYTHON_DIR)
print(' LEGACY_DIR: ', LEGACY_DIR)
print(' TEMPLATES_DIR: ', TEMPLATES_DIR)
print()
print(' OUTPUT_DIR: ', OUTPUT_DIR)
print(' SOURCES_DIR: ', SOURCES_DIR)
print(' ARCHIVE_DIR: ', ARCHIVE_DIR)
print(' DATABASE_DIR: ', DATABASE_DIR)
print()
print(
'[√] Django:'.ljust(14),
'python3 {} --version\n'.format(DJANGO_BINARY),
' '*13, DJANGO_VERSION, '\n',
)
print(
        '[{}] CURL:'.format('√' if USE_CURL else 'X').ljust(14),
'{} --version\n'.format(shutil.which(CURL_BINARY)),
' '*13, CURL_VERSION, '\n',
)
print(
        '[{}] GIT:'.format('√' if FETCH_GIT else 'X').ljust(14),
'{} --version\n'.format(shutil.which(GIT_BINARY)),
' '*13, GIT_VERSION, '\n',
)
print(
        '[{}] WGET:'.format('√' if USE_WGET else 'X').ljust(14),
'{} --version\n'.format(shutil.which(WGET_BINARY)),
' '*13, WGET_VERSION, '\n',
)
print(
        '[{}] YOUTUBEDL:'.format('√' if FETCH_MEDIA else 'X').ljust(14),
'{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
' '*13, YOUTUBEDL_VERSION, '\n',
)
print(
        '[{}] CHROME:'.format('√' if USE_CHROME else 'X').ljust(14),
'{} --version\n'.format(shutil.which(CHROME_BINARY)),
' '*13, CHROME_VERSION, '\n',
)
if __name__ == '__main__':
main()


@ -13,12 +13,12 @@ DEBUG = True
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
# 'django.contrib.admin',
# 'django.contrib.auth',
# 'django.contrib.contenttypes',
# 'django.contrib.sessions',
# 'django.contrib.messages',
# 'django.contrib.staticfiles',
'core',
]
@ -53,10 +53,11 @@ TEMPLATES = [
WSGI_APPLICATION = 'core.wsgi.application'
DATABASE_FILE = os.path.join(DATABASE_DIR, 'database.sqlite3')
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': os.path.join(DATABASE_DIR, 'database.sqlite3'),
'NAME': DATABASE_FILE,
}
}


@ -9,3 +9,7 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")
import django
django.setup()
from django.conf import settings
DATABASE_FILE = settings.DATABASE_FILE


@ -1,5 +0,0 @@
#__name__ = 'archivebox'
#__package__ = 'archivebox'


@ -1,243 +0,0 @@
#!/usr/bin/env python3
"""
ArchiveBox command line application.
./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`
Usage & Documentation:
https://github.com/pirate/ArchiveBox/Wiki
"""
__package__ = 'legacy'
import os
import sys
import shutil
from typing import List, Optional
from .schema import Link
from .links import links_after_timestamp
from .index import write_links_index, load_links_index
from .archive_methods import archive_link
from .config import (
ONLY_NEW,
VERSION,
ANSI,
REPO_DIR,
PYTHON_DIR,
LEGACY_DIR,
TEMPLATES_DIR,
OUTPUT_DIR,
SOURCES_DIR,
ARCHIVE_DIR,
DATABASE_DIR,
USE_CURL,
USE_WGET,
USE_CHROME,
FETCH_GIT,
FETCH_MEDIA,
DJANGO_BINARY,
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
CHROME_BINARY,
DJANGO_VERSION,
CURL_VERSION,
GIT_VERSION,
WGET_VERSION,
YOUTUBEDL_VERSION,
CHROME_VERSION,
)
from .util import (
enforce_types,
handle_stdin_import,
handle_file_import,
)
from .logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
)
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = VERSION
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help():
print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation:")
print(" https://github.com/pirate/ArchiveBox/wiki\n")
print("UI Usage:")
print(" Open output/index.html to view your archive.\n")
print("CLI Usage:")
print(" mkdir data; cd data/")
print(" archivebox init\n")
print(" echo 'https://example.com/some/page' | archivebox add")
print(" archivebox add https://example.com/some/other/page")
print(" archivebox add --depth=1 ~/Downloads/bookmarks_export.html")
print(" archivebox add --depth=1 https://example.com/feed.rss")
print(" archivebox update --resume=15109948213.123")
def print_version():
print('ArchiveBox v{}'.format(__VERSION__))
print()
print('[i] Folder locations:')
print(' REPO_DIR: ', REPO_DIR)
print(' PYTHON_DIR: ', PYTHON_DIR)
print(' LEGACY_DIR: ', LEGACY_DIR)
print(' TEMPLATES_DIR: ', TEMPLATES_DIR)
print()
print(' OUTPUT_DIR: ', OUTPUT_DIR)
print(' SOURCES_DIR: ', SOURCES_DIR)
print(' ARCHIVE_DIR: ', ARCHIVE_DIR)
print(' DATABASE_DIR: ', DATABASE_DIR)
print()
print(
'[√] Django:'.ljust(14),
'python3 {} --version\n'.format(DJANGO_BINARY),
' '*13, DJANGO_VERSION, '\n',
)
print(
'[{}] CURL:'.format('' if USE_CURL else 'X').ljust(14),
'{} --version\n'.format(shutil.which(CURL_BINARY)),
' '*13, CURL_VERSION, '\n',
)
print(
'[{}] GIT:'.format('' if FETCH_GIT else 'X').ljust(14),
'{} --version\n'.format(shutil.which(GIT_BINARY)),
' '*13, GIT_VERSION, '\n',
)
print(
'[{}] WGET:'.format('' if USE_WGET else 'X').ljust(14),
'{} --version\n'.format(shutil.which(WGET_BINARY)),
' '*13, WGET_VERSION, '\n',
)
print(
'[{}] YOUTUBEDL:'.format('' if FETCH_MEDIA else 'X').ljust(14),
'{} --version\n'.format(shutil.which(YOUTUBEDL_BINARY)),
' '*13, YOUTUBEDL_VERSION, '\n',
)
print(
'[{}] CHROME:'.format('' if USE_CHROME else 'X').ljust(14),
'{} --version\n'.format(shutil.which(CHROME_BINARY)),
' '*13, CHROME_VERSION, '\n',
)
def main(args=None) -> None:
if args is None:
args = sys.argv
if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
print_help()
raise SystemExit(0)
if set(args).intersection(('--version', 'version')):
print_version()
raise SystemExit(0)
### Handle CLI arguments
# ./archive bookmarks.html
# ./archive 1523422111.234
import_path, resume = None, None
if len(args) == 2:
# if the argument is a string, it's a import_path file to import
# if it's a number, it's a timestamp to resume archiving from
if args[1].replace('.', '').isdigit():
import_path, resume = None, args[1]
else:
import_path, resume = args[1], None
### Set up output folder
if not os.path.exists(OUTPUT_DIR):
print('{green}[+] Created a new archive directory: {}{reset}'.format(OUTPUT_DIR, **ANSI))
os.makedirs(OUTPUT_DIR)
os.makedirs(SOURCES_DIR)
os.makedirs(ARCHIVE_DIR)
os.makedirs(DATABASE_DIR)
else:
not_empty = len(set(os.listdir(OUTPUT_DIR)) - {'.DS_Store', '.venv', 'venv', 'virtualenv', '.virtualenv'})
index_exists = os.path.exists(os.path.join(OUTPUT_DIR, 'index.json'))
if not_empty and not index_exists:
print(
("{red}[X] Could not find index.json in the OUTPUT_DIR: {reset}{}\n\n"
" If you're trying to update an existing archive, you must set OUTPUT_DIR to or run archivebox from inside the archive folder you're trying to update.\n"
" If you're trying to create a new archive, you must run archivebox inside a completely empty directory."
"\n\n"
" {lightred}Hint:{reset} To import a data folder created by an older version of ArchiveBox, \n"
" just cd into the folder and run the archivebox command to pick up where you left off.\n\n"
" (Always make sure your data folder is backed up first before updating ArchiveBox)"
).format(OUTPUT_DIR, **ANSI)
)
raise SystemExit(1)
### Handle ingesting urls piped in through stdin
# (.e.g if user does cat example_urls.txt | ./archive)
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
if stdin_raw_text and import_path:
print(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
print_help()
raise SystemExit(1)
import_path = handle_stdin_import(stdin_raw_text)
### Handle ingesting url from a remote file/feed
# (e.g. if an RSS feed URL is used as the import path)
if import_path:
import_path = handle_file_import(import_path)
### Run the main archive update process
update_archive_data(import_path=import_path, resume=resume)
@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None) -> List[Link]:
"""The main ArchiveBox entrancepoint. Everything starts here."""
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
# Step 3: Run the archive methods for each link
links = new_links if ONLY_NEW else all_links
log_archiving_started(len(links), resume)
idx: int = 0
link: Optional[Link] = None
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
archive_link(link, link_dir=link.link_dir)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp if link else '0')
raise SystemExit(0)
except:
print()
raise
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
if __name__ == '__main__':
main(sys.argv)


@ -3,7 +3,8 @@ import json
from datetime import datetime
from string import Template
from typing import List, Tuple, Iterator, Optional, Mapping
from typing import List, Tuple, Iterator, Optional, Mapping, Iterable
from collections import OrderedDict
from .schema import Link, ArchiveResult
from .config import (
@ -13,14 +14,15 @@ from .config import (
GIT_SHA,
FOOTER_INFO,
TIMEOUT,
URL_BLACKLIST_PTN,
)
from .util import (
scheme,
fuzzy_url,
ts_to_date,
merge_links,
urlencode,
htmlencode,
urldecode,
derived_link_info,
wget_output_path,
enforce_types,
TimedProgress,
@ -28,7 +30,6 @@ from .util import (
atomic_write,
)
from .parse import parse_links
from .links import validate_links
from .logs import (
log_indexing_process_started,
log_indexing_started,
@ -41,6 +42,147 @@ TITLE_LOADING_MSG = 'Not yet archived...'
### Link filtering and checking
@enforce_types
def derived_link_info(link: Link) -> dict:
"""extend link info with the archive urls and other derived data"""
info = link._asdict(extended=True)
info.update(link.canonical_outputs())
return info
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
and "cleaner" values over worse ones.
"""
assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
url = a.url if len(a.url) > len(b.url) else b.url
possible_titles = [
title
for title in (a.title, b.title)
if title and title.strip() and '://' not in title
]
title = None
if len(possible_titles) == 2:
title = max(possible_titles, key=lambda t: len(t))
elif len(possible_titles) == 1:
title = possible_titles[0]
timestamp = (
a.timestamp
if float(a.timestamp or 0) < float(b.timestamp or 0) else
b.timestamp
)
tags_set = (
set(tag.strip() for tag in (a.tags or '').split(','))
| set(tag.strip() for tag in (b.tags or '').split(','))
)
tags = ','.join(tags_set) or None
sources = list(set(a.sources + b.sources))
    all_methods = set(list(a.history.keys()) + list(b.history.keys()))
history = {
method: (a.history.get(method) or []) + (b.history.get(method) or [])
for method in all_methods
}
return Link(
url=url,
timestamp=timestamp,
title=title,
tags=tags,
sources=sources,
history=history,
)
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
    links = sorted_links(links)       # deterministically sort the links based on timestamp, url
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
if not links:
print('[X] No links found :(')
raise SystemExit(1)
return links
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """remove chrome://, about:// or other schemed links that can't be archived"""
for link in links:
scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
if scheme_is_valid and not_blacklisted:
yield link
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
"""
unique_urls: OrderedDict[str, Link] = OrderedDict()
for link in sorted_links:
fuzzy = fuzzy_url(link.url)
if fuzzy in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[fuzzy], link)
unique_urls[fuzzy] = link
unique_timestamps: OrderedDict[str, Link] = OrderedDict()
for link in unique_urls.values():
new_link = link.overwrite(
timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
)
unique_timestamps[new_link.timestamp] = new_link
return unique_timestamps.values()
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
if not resume:
yield from links
return
for link in links:
try:
if float(link.timestamp) <= resume:
yield link
except (ValueError, TypeError):
print('Resume value and all timestamp values must be valid numbers.')
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """resolve duplicate timestamps by appending a decimal: 1234, 1234, 1234 -> 1234, 1234.0, 1234.1"""
timestamp = timestamp.split('.')[0]
nonce = 0
# first try 152323423 before 152323423.0
if timestamp not in used_timestamps:
return timestamp
new_timestamp = '{}.{}'.format(timestamp, nonce)
while new_timestamp in used_timestamps:
nonce += 1
new_timestamp = '{}.{}'.format(timestamp, nonce)
return new_timestamp
### Homepage index for all the links
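
lowest_uniq_timestamp() above is what keeps deduplicated links from colliding in the index: the first link keeps the bare timestamp and each later collision gets the next .0, .1, ... suffix. A quick standalone illustration (the function body is copied here so the snippet runs without importing the package; illustrative only):

from collections import OrderedDict

def lowest_uniq_timestamp(used_timestamps, timestamp):
    # same logic as legacy.index.lowest_uniq_timestamp() above
    timestamp = timestamp.split('.')[0]
    nonce = 0
    if timestamp not in used_timestamps:
        return timestamp
    new_timestamp = '{}.{}'.format(timestamp, nonce)
    while new_timestamp in used_timestamps:
        nonce += 1
        new_timestamp = '{}.{}'.format(timestamp, nonce)
    return new_timestamp

used = OrderedDict()
for _ in range(3):
    ts = lowest_uniq_timestamp(used, '1554268800')
    used[ts] = True
    print(ts)    # -> 1554268800, then 1554268800.0, then 1554268800.1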


@ -1,93 +0,0 @@
from typing import Iterable
from collections import OrderedDict
from .schema import Link
from .util import (
scheme,
fuzzy_url,
merge_links,
)
from .config import URL_BLACKLIST_PTN
def validate_links(links: Iterable[Link]) -> Iterable[Link]:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timstamp, url
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
if not links:
print('[X] No links found :(')
raise SystemExit(1)
return links
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
if scheme_is_valid and not_blacklisted:
yield link
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
"""
unique_urls: OrderedDict[str, Link] = OrderedDict()
for link in sorted_links:
fuzzy = fuzzy_url(link.url)
if fuzzy in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[fuzzy], link)
unique_urls[fuzzy] = link
unique_timestamps: OrderedDict[str, Link] = OrderedDict()
for link in unique_urls.values():
new_link = link.overwrite(
timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
)
unique_timestamps[new_link.timestamp] = new_link
return unique_timestamps.values()
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links: Iterable[Link], resume: float=None) -> Iterable[Link]:
if not resume:
yield from links
return
for link in links:
try:
if float(link.timestamp) <= resume:
yield link
except (ValueError, TypeError):
print('Resume value and all timestamp values must be valid numbers.')
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
timestamp = timestamp.split('.')[0]
nonce = 0
# first try 152323423 before 152323423.0
if timestamp not in used_timestamps:
return timestamp
new_timestamp = '{}.{}'.format(timestamp, nonce)
while new_timestamp in used_timestamps:
nonce += 1
new_timestamp = '{}.{}'.format(timestamp, nonce)
return new_timestamp

archivebox/legacy/main.py (Normal file, 80 lines added)

@ -0,0 +1,80 @@
import re
import json
from typing import List, Optional, Iterable
from .schema import Link
from .util import enforce_types, ExtendedEncoder
from .index import (
links_after_timestamp,
load_links_index,
write_links_index,
)
from .archive_methods import archive_link
from .config import (
ONLY_NEW,
OUTPUT_DIR,
)
from .logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
)
@enforce_types
def update_archive_data(import_path: Optional[str]=None, resume: Optional[float]=None, only_new: bool=False) -> List[Link]:
    """The main ArchiveBox entrypoint. Everything starts here."""
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR)
# Step 3: Run the archive methods for each link
links = new_links if ONLY_NEW else all_links
log_archiving_started(len(links), resume)
idx: int = 0
link: Optional[Link] = None
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
archive_link(link, link_dir=link.link_dir)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp if link else '0')
raise SystemExit(0)
except:
print()
raise
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(links=list(all_links), out_dir=OUTPUT_DIR, finished=True)
return all_links
@enforce_types
def list_archive_data(filter_regex: Optional[str]=None, after: Optional[float]=None, before: Optional[float]=None) -> Iterable[Link]:
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
for link in all_links:
if pattern and not pattern.match(link.url):
continue
if after is not None and float(link.timestamp) < after:
continue
if before is not None and float(link.timestamp) > before:
continue
yield link
def csv_format(link: Link, csv_cols: str) -> str:
return ','.join(json.dumps(getattr(link, col), cls=ExtendedEncoder) for col in csv_cols.split(','))
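
list_archive_data() is a generator: it loads the index once, then yields only links whose URL matches the (start-anchored) regex and whose timestamp falls inside the optional after/before window. A standalone sketch of the same filter over plain dicts (illustrative only, not part of the commit):

# Standalone sketch of the filtering done by list_archive_data() above.
import re
from typing import Iterable, Optional

def filter_links(links: Iterable[dict],
                 filter_regex: Optional[str] = None,
                 after: Optional[float] = None,
                 before: Optional[float] = None) -> Iterable[dict]:
    pattern = re.compile(filter_regex, re.IGNORECASE) if filter_regex else None
    for link in links:
        if pattern and not pattern.match(link['url']):
            continue
        if after is not None and float(link['timestamp']) < after:
            continue
        if before is not None and float(link['timestamp']) > before:
            continue
        yield link

links = [
    {'timestamp': '1554268800.0', 'url': 'https://example.com/a'},
    {'timestamp': '1554268900.0', 'url': 'https://docs.example.com/b'},
]
print([l['url'] for l in filter_links(links, filter_regex=r'https://docs\.')])
print([l['url'] for l in filter_links(links, after=1554268850)])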


@ -7,7 +7,11 @@ from shutil import rmtree
from typing import List
from .config import ARCHIVE_DIR, OUTPUT_DIR
from .index import parse_json_links_index, write_html_links_index, write_json_links_index
from .index import (
parse_json_links_index,
write_html_links_index,
write_json_links_index,
)
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:


@ -404,59 +404,6 @@ def parse_date(date: Any) -> Optional[datetime]:
raise ValueError('Tried to parse invalid date! {}'.format(date))
### Link Helpers
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
"""deterministially merge two links, favoring longer field values over shorter,
and "cleaner" values over worse ones.
"""
assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
url = a.url if len(a.url) > len(b.url) else b.url
possible_titles = [
title
for title in (a.title, b.title)
if title and title.strip() and '://' not in title
]
title = None
if len(possible_titles) == 2:
title = max(possible_titles, key=lambda t: len(t))
elif len(possible_titles) == 1:
title = possible_titles[0]
timestamp = (
a.timestamp
if float(a.timestamp or 0) < float(b.timestamp or 0) else
b.timestamp
)
tags_set = (
set(tag.strip() for tag in (a.tags or '').split(','))
| set(tag.strip() for tag in (b.tags or '').split(','))
)
tags = ','.join(tags_set) or None
sources = list(set(a.sources + b.sources))
all_methods = set(list(a.history.keys()) + list(a.history.keys()))
history = {
method: (a.history.get(method) or []) + (b.history.get(method) or [])
for method in all_methods
}
return Link(
url=url,
timestamp=timestamp,
title=title,
tags=tags,
sources=sources,
history=history,
)
@enforce_types
def is_static_file(url: str) -> bool:
"""Certain URLs just point to a single static file, and
@ -467,16 +414,6 @@ def is_static_file(url: str) -> bool:
return extension(url) in STATICFILE_EXTENSIONS
@enforce_types
def derived_link_info(link: Link) -> dict:
"""extend link info with the archive urls and other derived data"""
info = link._asdict(extended=True)
info.update(link.canonical_outputs())
return info
### Python / System Helpers
@ -696,3 +633,22 @@ def atomic_write(contents: Union[dict, str], path: str) -> None:
finally:
if os.path.exists(tmp_file):
os.remove(tmp_file)
def reject_stdin(caller: str) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read().strip()
if stdin_raw_text:
print(
'{red}[X] The "{}" command does not accept stdin.{reset}\n'.format(
caller,
**ANSI,
)
)
            print('    Run "{} --help" to see usage and examples.'.format(
                caller,
            ))
print()
raise SystemExit(1)
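
reject_stdin() gives the no-input subcommands (help, version, init, list, update) a loud failure if data is piped at them by mistake, e.g. echo foo | archivebox version. One illustrative way to exercise it without a real shell pipe, assuming the archivebox package and its legacy config import cleanly:

# Illustrative only: fake a pipe on stdin to trigger reject_stdin() above.
import io
import sys
from archivebox.legacy.util import reject_stdin   # assumes the package is importable

sys.stdin = io.StringIO('https://example.com\n')   # StringIO.isatty() is False, like a pipe
try:
    reject_stdin('archivebox version')
except SystemExit:
    print('rejected, as expected')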


@ -8,8 +8,8 @@ BIN_DIR = os.path.dirname(os.path.abspath(__file__))
REPO_DIR = os.path.abspath(os.path.join(BIN_DIR, os.pardir))
sys.path.append(REPO_DIR)
from archivebox.__main__ import main
from archivebox.cli.archivebox import main
if __name__ == '__main__':
main(sys.argv)
main()