mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2024-12-10 10:39:26 -05:00
improve config loading of TMP_DIR, LIB_DIR, move to separate files
This commit is contained in: parent 7a895d9285, commit cf1ea8f80f
49 changed files with 767 additions and 527 deletions
14 Dockerfile
@@ -287,22 +287,12 @@ WORKDIR "$DATA_DIR"
RUN openssl rand -hex 16 > /etc/machine-id \
    && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
ENV IN_DOCKER=True \
    SYSTEM_LIB_DIR=/app/lib \
    SYSTEM_TMP_DIR=/tmp \
    SYSTEM_LIB_DIR=/usr/share/archivebox \
    SYSTEM_TMP_DIR=/tmp/archivebox \
    GOOGLE_API_KEY=no \
    GOOGLE_DEFAULT_CLIENT_ID=no \
    GOOGLE_DEFAULT_CLIENT_SECRET=no \
    ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
# USE_SINGLEFILE=True \
# SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
# USE_READABILITY=True \
# READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
# USE_MERCURY=True \
# MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"

# Print version for nice docker finish summary
RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
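For reference, the renamed SYSTEM_LIB_DIR / SYSTEM_TMP_DIR variables declared above are read back on the Python side roughly as in this sketch (simplified from the constants/paths changes further down in this commit; the exact fallback paths vary by platform and are assumptions here):

# sketch only, not the exact ArchiveBox source
import os
import tempfile
from pathlib import Path

PACKAGE_DIR = Path(__file__).resolve().parent  # archivebox source code dir

# the env var wins when set (e.g. inside Docker), otherwise fall back to a host default
SYSTEM_TMP_DIR = (Path(os.environ['SYSTEM_TMP_DIR'])
                  if 'SYSTEM_TMP_DIR' in os.environ
                  else Path(tempfile.gettempdir()) / 'archivebox')
SYSTEM_LIB_DIR = (Path(os.environ['SYSTEM_LIB_DIR'])
                  if 'SYSTEM_LIB_DIR' in os.environ
                  else PACKAGE_DIR / 'lib')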
@@ -13,7 +13,7 @@ __package__ = 'archivebox'

import os
import sys
import tempfile

from pathlib import Path

ASCII_LOGO = """

@@ -25,37 +25,36 @@ ASCII_LOGO = """
╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝
"""

SYSTEM_TMP_DIR = Path(tempfile.gettempdir()) / 'archivebox'
SYSTEM_TMP_DIR.mkdir(parents=True, exist_ok=True)
os.environ['SYSTEM_TMP_DIR'] = str(SYSTEM_TMP_DIR)
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'

# detect ArchiveBox user's UID/GID based on data dir ownership
from archivebox.config.permissions import drop_privileges  # noqa
drop_privileges()

# if we are outside a data dir, cd into an ephemeral tmp dir so that
# we can run version/help without polluting cwd with an index.sqlite3
if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
    current_dir = Path(os.getcwd()).resolve()
    if not (current_dir / 'index.sqlite3').exists():
        os.chdir(SYSTEM_TMP_DIR)

from archivebox.misc.checks import check_not_root, check_io_encoding  # noqa
check_not_root()
check_io_encoding()

# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them thorugh INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
if str(PACKAGE_DIR) not in sys.path:
    sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'


# print('INSTALLING MONKEY PATCHES')
from .monkey_patches import *  # noqa
from archivebox.monkey_patches import *  # noqa
# print('DONE INSTALLING MONKEY PATCHES')


# print('LOADING VENDORED LIBRARIES')
from .vendor import load_vendored_libs  # noqa
from archivebox.vendor import load_vendored_libs  # noqa
load_vendored_libs()
# print('DONE LOADING VENDORED LIBRARIES')


from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION  # noqa
from archivebox.config.constants import CONSTANTS  # noqa
from archivebox.config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR  # noqa
from archivebox.config.version import VERSION  # noqa

__version__ = VERSION
__author__ = 'Nick Sweeting'
@@ -12,12 +12,13 @@ from ninja import NinjaAPI, Swagger

# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/

from archivebox.config import SHELL_CONFIG, VERSION
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH

from api.auth import API_AUTH_METHODS


COMMIT_HASH = SHELL_CONFIG.COMMIT_HASH or 'unknown'
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'

html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>
@@ -13,7 +13,7 @@ from ..main import (
    schedule,
)
from archivebox.misc.util import ansi_to_html
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG


from .auth import API_AUTH_METHODS
@@ -1,6 +1,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'

import os
import sys
import argparse
import threading

@@ -25,6 +26,10 @@ if len(sys.argv) > 1 and sys.argv[1] == 'setup':
    print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
    sys.argv[1] = 'install'

if '--debug' in sys.argv:
    os.environ['DEBUG'] = 'True'
    sys.argv.remove('--debug')


# def list_subcommands() -> Dict[str, str]:
#     """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""

@@ -50,8 +55,8 @@ SUBCOMMAND_MODULES = {
    'init': 'archivebox_init',
    'install': 'archivebox_install',
    ##############################################
    'config': 'archivebox_config',

    'add': 'archivebox_add',
    'remove': 'archivebox_remove',
    'update': 'archivebox_update',

@@ -63,7 +68,7 @@ SUBCOMMAND_MODULES = {
    'shell': 'archivebox_shell',
    'manage': 'archivebox_manage',

    'oneshot': 'archivebox_oneshot',
    # 'oneshot': 'archivebox_oneshot',
}

# every imported command module must have these properties in order to be valid

@@ -102,11 +107,11 @@ CLI_SUBCOMMANDS = LazySubcommands()

# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version')  # dont require valid data folder at all
main_cmds = ('init', 'config', 'setup', 'install')  # dont require existing db present
archive_cmds = ('add', 'remove', 'update', 'list', 'status')  # require existing db present
setup_cmds = ('init', 'setup', 'install')  # require valid data folder, but dont require DB present in it yet
archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage')  # require valid data folder + existing db present
fake_db = ("oneshot",)  # use fake in-memory db

display_first = (*meta_cmds, *main_cmds, *archive_cmds)
display_first = (*meta_cmds, *setup_cmds, *archive_cmds)


IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler')  # threads we dont have to wait for before exiting
@@ -157,14 +162,16 @@ def run_subcommand(subcommand: str,
    from archivebox.config.legacy import setup_django

    # print('DATA_DIR is', DATA_DIR)
    # print('pwd is', os.getcwd())
    # print('pwd is', os.getcwd())

    cmd_requires_db = subcommand in archive_cmds
    init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args

    setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
    check_db = cmd_requires_db and not init_pending

    if subcommand not in meta_cmds:
        setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)

    if subcommand in archive_cmds:
    if cmd_requires_db:
        check_migrations()
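In short, the new gating only initializes Django for non-meta commands and only verifies the database for commands that actually need one. A rough restatement of the decision as a hypothetical helper (not part of the diff):

def needs_db_check(subcommand: str, subcommand_args: list[str]) -> bool:
    # hypothetical helper restating the run_subcommand() logic above
    meta_cmds = ('help', 'version')
    archive_cmds = ('add', 'remove', 'update', 'list', 'status',
                    'schedule', 'server', 'shell', 'manage')
    if subcommand in meta_cmds:
        return False  # setup_django() is skipped entirely for these
    cmd_requires_db = subcommand in archive_cmds
    init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
    return cmd_requires_db and not init_pending

# e.g. `archivebox version`    -> no Django setup at all
#      `archivebox add --init` -> Django setup, DB check deferred until init runs
#      `archivebox add URL`    -> Django setup + DB check + check_migrations()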
@@ -9,7 +9,8 @@ import argparse
from typing import List, Optional, IO

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, ARCHIVING_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import ARCHIVING_CONFIG

from ..main import add
from ..parsers import PARSERS
@@ -9,7 +9,8 @@ from pathlib import Path
from typing import Optional, List, IO

from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, SERVER_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from ..logging_util import SmartFormatter, reject_stdin
from ..main import server
@@ -1,27 +1,9 @@
__package__ = 'archivebox.config'

from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
from .defaults import (
    SHELL_CONFIG,
    STORAGE_CONFIG,
    GENERAL_CONFIG,
    SERVER_CONFIG,
    ARCHIVING_CONFIG,
    SEARCH_BACKEND_CONFIG,
from .paths import (
    PACKAGE_DIR,  # noqa
    DATA_DIR,  # noqa
    ARCHIVE_DIR,  # noqa
)


__all__ = [
    'CONSTANTS',
    'PACKAGE_DIR',
    'DATA_DIR',
    'ARCHIVE_DIR',
    'VERSION',
    'SHELL_CONFIG',
    'STORAGE_CONFIG',
    'GENERAL_CONFIG',
    'SERVER_CONFIG',
    'ARCHIVING_CONFIG',
    'SEARCH_BACKEND_CONFIG',
    'CONSTANTS_CONFIG',
]
from .constants import CONSTANTS, CONSTANTS_CONFIG  # noqa
from .version import VERSION  # noqa
@@ -8,7 +8,7 @@ from abx.archivebox.base_hook import BaseHook


from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR  # noqa
from .defaults import (
from .common import (
    ShellConfig,  # noqa: F401
    StorageConfig,  # noqa: F401
    GeneralConfig,  # noqa: F401
@@ -1,47 +0,0 @@
# def get_versions_available_on_github(config):
# """
# returns a dictionary containing the ArchiveBox GitHub release info for
# the recommended upgrade version and the currently installed version
# """

# # we only want to perform the (relatively expensive) check for new versions
# # when its most relevant, e.g. when the user runs a long-running command
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
# if subcommand_run_by_user not in long_running_commands:
# return None

# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
# response = requests.get(github_releases_api)
# if response.status_code != 200:
# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
# return None
# all_releases = response.json()

# installed_version = parse_version_string(config['VERSION'])

# # find current version or nearest older version (to link to)
# current_version = None
# for idx, release in enumerate(all_releases):
# release_version = parse_version_string(release['tag_name'])
# if release_version <= installed_version:
# current_version = release
# break

# current_version = current_version or all_releases[-1]

# # recommended version is whatever comes after current_version in the release list
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
# try:
# recommended_version = all_releases[idx+1]
# except IndexError:
# recommended_version = None

# return {'recommended_version': recommended_version, 'current_version': current_version}

# def can_upgrade(config):
# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
# return recommended_version > current_version
# return False
@@ -1,21 +1,21 @@
__package__ = 'archivebox.config'

import os
import sys
import shutil

from typing import Dict, Optional
from datetime import datetime
from pathlib import Path

from rich import print
from pydantic import Field, field_validator, model_validator, computed_field
from pydantic import Field, field_validator, computed_field
from django.utils.crypto import get_random_string

from abx.archivebox.base_configset import BaseConfigSet


from .constants import CONSTANTS, PACKAGE_DIR
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME
from .permissions import IN_DOCKER

###################### Config ##########################


@@ -27,14 +27,8 @@ class ShellConfig(BaseConfigSet):
    USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
    SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)

    IN_DOCKER: bool = Field(default=False)
    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    USER: str = Field(default=Path('~').expanduser().resolve().name)
    PUID: int = Field(default=os.getuid())
    PGID: int = Field(default=os.getgid())

    PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))

    ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)

@@ -52,63 +46,12 @@ class ShellConfig(BaseConfigSet):
    @computed_field
    @property
    def COMMIT_HASH(self) -> Optional[str]:
        try:
            git_dir = PACKAGE_DIR / '../.git'
            ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
            commit_hash = git_dir.joinpath(ref).read_text().strip()
            return commit_hash
        except Exception:
            pass

        try:
            return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
        except Exception:
            pass

        return None
        return get_COMMIT_HASH()

    @computed_field
    @property
    def BUILD_TIME(self) -> str:
        if self.IN_DOCKER:
            docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
            return docker_build_end_time

        src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
        return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')


    @model_validator(mode='after')
    def validate_not_running_as_root(self):
        attempted_command = ' '.join(sys.argv[:3])
        if self.PUID == 0 and attempted_command not in ('setup', 'install'):
            # stderr('[!] ArchiveBox should never be run as root!', color='red')
            # stderr(' For more information, see the security overview documentation:')
            # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
            print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
            print(' For more information, see the security overview documentation:', file=sys.stderr)
            print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)

            if self.IN_DOCKER:
                print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
                print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
                print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
                print(' or:', file=sys.stderr)
                print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
                print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
            raise SystemExit(2)

        # check python locale
        if self.PYTHON_ENCODING != 'UTF-8':
            print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {self.PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
            print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
            print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
            print('')
            print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
            print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
            raise SystemExit(2)

        return self
        return get_BUILD_TIME()

SHELL_CONFIG = ShellConfig()
@@ -1,115 +0,0 @@
from pathlib import Path
from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any, List
from mypy_extensions import TypedDict

from benedict import benedict

SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]


class BaseConfig(TypedDict):
    pass

class ConfigDict(BaseConfig, benedict, total=False):
    """
    # Regenerate by pasting this quine into `archivebox shell` 🥚
    from archivebox.config import ConfigDict, CONFIG_DEFAULTS
    print('class ConfigDict(BaseConfig, total=False):')
    print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
    for section, configs in CONFIG_DEFAULTS.items():
        for key, attrs in configs.items():
            Type, default = attrs['type'], attrs['default']
            if default is None:
                print(f' {key}: Optional[{Type.__name__}]')
            else:
                print(f' {key}: {Type.__name__}')
        print()
    """

    IS_TTY: bool
    USE_COLOR: bool
    SHOW_PROGRESS: bool
    IN_DOCKER: bool

    PACKAGE_DIR: Path
    CONFIG_FILE: Path
    ONLY_NEW: bool
    TIMEOUT: int
    MEDIA_TIMEOUT: int
    OUTPUT_PERMISSIONS: str
    RESTRICT_FILE_NAMES: str
    URL_DENYLIST: str

    SECRET_KEY: Optional[str]
    BIND_ADDR: str
    ALLOWED_HOSTS: str
    DEBUG: bool
    PUBLIC_INDEX: bool
    PUBLIC_SNAPSHOTS: bool
    FOOTER_INFO: str

    SAVE_TITLE: bool
    SAVE_FAVICON: bool
    SAVE_WGET: bool
    SAVE_WGET_REQUISITES: bool
    SAVE_SINGLEFILE: bool
    SAVE_READABILITY: bool
    SAVE_MERCURY: bool
    SAVE_PDF: bool
    SAVE_SCREENSHOT: bool
    SAVE_DOM: bool
    SAVE_WARC: bool
    SAVE_GIT: bool
    SAVE_MEDIA: bool
    SAVE_ARCHIVE_DOT_ORG: bool

    RESOLUTION: str
    GIT_DOMAINS: str
    CHECK_SSL_VALIDITY: bool
    CURL_USER_AGENT: str
    WGET_USER_AGENT: str
    CHROME_USER_AGENT: str
    COOKIES_FILE: Union[str, Path, None]
    CHROME_USER_DATA_DIR: Union[str, Path, None]
    CHROME_TIMEOUT: int
    CHROME_HEADLESS: bool
    CHROME_SANDBOX: bool

    USE_CURL: bool
    USE_WGET: bool
    USE_SINGLEFILE: bool
    USE_READABILITY: bool
    USE_MERCURY: bool
    USE_GIT: bool
    USE_CHROME: bool
    USE_YOUTUBEDL: bool
    CURL_BINARY: str
    GIT_BINARY: str
    WGET_BINARY: str
    SINGLEFILE_BINARY: str
    READABILITY_BINARY: str
    MERCURY_BINARY: str
    YOUTUBEDL_BINARY: str
    CHROME_BINARY: Optional[str]

    YOUTUBEDL_ARGS: List[str]
    WGET_ARGS: List[str]
    CURL_ARGS: List[str]
    GIT_ARGS: List[str]
    TAG_SEPARATOR_PATTERN: str


ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]

ConfigDefault = TypedDict('ConfigDefault', {
    'default': ConfigDefaultValue,
    'type': Optional[Type],
    'aliases': Optional[Tuple[str, ...]],
}, total=False)

ConfigDefaultDict = Dict[str, ConfigDefault]
@@ -1,118 +1,115 @@
__package__ = 'archivebox.config'


import os
import re
import platform
import tempfile

from typing import Dict
from pathlib import Path
import importlib.metadata
from collections.abc import Mapping

from benedict import benedict

from ..misc.logging import DEFAULT_CLI_COLORS

from .paths import (
    PACKAGE_DIR,
    DATA_DIR,
    ARCHIVE_DIR,
    get_collection_id,
    get_LIB_DIR,
    get_TMP_DIR,
)
from .permissions import (
    IS_ROOT,
    IN_DOCKER,
    RUNNING_AS_UID,
    RUNNING_AS_GID,
    DEFAULT_PUID,
    DEFAULT_PGID,
    ARCHIVEBOX_USER,
    ARCHIVEBOX_GROUP,
)
from .version import detect_installed_version

###################### Config ##########################

PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent  # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve()  # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive'  # archivebox snapshot data dir

def _detect_installed_version(PACKAGE_DIR: Path):
    """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
    try:
        # if in production install, use pip-installed package metadata
        return importlib.metadata.version(__package__ or 'archivebox').strip()
    except importlib.metadata.PackageNotFoundError:
        pass

    try:
        # if in dev Git repo dir, use pyproject.toml file
        pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
        for line in pyproject_config:
            if line.startswith('version = '):
                return line.split(' = ', 1)[-1].strip('"').strip()
    except FileNotFoundError:
        # building docs, pyproject.toml is not available
        pass

    # raise Exception('Failed to detect installed archivebox version!')
    return 'dev'

VERSION: str = _detect_installed_version(PACKAGE_DIR)


class ConstantsDict(Mapping):
    IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
    OS = platform.system().lower()  # darwin, linux, etc.
    ARCH = platform.machine().lower()  # arm64, x86_64, etc.
    LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')

    PACKAGE_DIR: Path = PACKAGE_DIR  # archivebox source code dir
    DATA_DIR: Path = DATA_DIR  # archivebox user data dir
    ARCHIVE_DIR: Path = ARCHIVE_DIR  # archivebox snapshot data dir
    VERSION: str = VERSION
    PACKAGE_DIR: Path = PACKAGE_DIR
    DATA_DIR: Path = DATA_DIR
    ARCHIVE_DIR: Path = ARCHIVE_DIR
    COLLECTION_ID: str = get_collection_id(DATA_DIR)

    # Host system
    VERSION: str = detect_installed_version(PACKAGE_DIR)
    OS: str = platform.system().lower()  # darwin, linux, etc.
    ARCH: str = platform.machine().lower()  # arm64, x86_64, aarch64, etc.
    IN_DOCKER: bool = IN_DOCKER

    # Permissions
    IS_ROOT: bool = IS_ROOT
    ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
    ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
    RUNNING_AS_UID: int = RUNNING_AS_UID
    RUNNING_AS_GID: int = RUNNING_AS_GID
    DEFAULT_PUID: int = DEFAULT_PUID
    DEFAULT_PGID: int = DEFAULT_PGID

    # Source code dirs
    PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
    TEMPLATES_DIR_NAME: str = 'templates'
    TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
    STATIC_DIR: Path = TEMPLATES_DIR / 'static'
    STATIC_DIR_NAME: str = 'static'
    STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME

    # Data dirs
    ARCHIVE_DIR_NAME: str = 'archive'
    SOURCES_DIR_NAME: str = 'sources'
    PERSONAS_DIR_NAME: str = 'personas'
    CRONTABS_DIR_NAME: str = 'crontabs'
    CACHE_DIR_NAME: str = 'cache'
    LOGS_DIR_NAME: str = 'logs'
    USER_PLUGINS_DIR_NAME: str = 'user_plugins'
    CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'

    ARCHIVE_DIR_NAME: str = 'archive'
    SOURCES_DIR_NAME: str = 'sources'
    PERSONAS_DIR_NAME: str = 'personas'
    CRONTABS_DIR_NAME: str = 'crontabs'
    CACHE_DIR_NAME: str = 'cache'
    LOGS_DIR_NAME: str = 'logs'
    LIB_DIR_NAME: str = 'lib'
    TMP_DIR_NAME: str = 'tmp'

    SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
    # DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16]  # cant be used because of socket path length restrictions break too often if data dir is in some deep subdir: ocket.error reported AF_UNIX path too long
    SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
    DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE

    ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
    SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
    PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
    CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
    LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
    LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR  # e.g. /app/lib or ./data/lib/arm64-darwin-docker
    TMP_DIR: Path = SYSTEM_TMP_DIR
    CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
    CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
    USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME

    # Data dir files
    CONFIG_FILENAME: str = 'ArchiveBox.conf'
    SQL_INDEX_FILENAME: str = 'index.sqlite3'
    QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
    CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
    DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
    QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME

    JSON_INDEX_FILENAME: str = 'index.json'
    HTML_INDEX_FILENAME: str = 'index.html'
    ROBOTS_TXT_FILENAME: str = 'robots.txt'
    FAVICON_FILENAME: str = 'favicon.ico'

    # Runtime dirs
    TMP_DIR_NAME: str = 'tmp'
    TMP_DIR: Path = get_TMP_DIR()
    LIB_DIR_NAME: str = 'lib'
    LIB_DIR: Path = get_LIB_DIR()
    LIB_PIP_DIR: Path = LIB_DIR / 'pip'
    LIB_NPM_DIR: Path = LIB_DIR / 'npm'
    LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
    LIB_BIN_DIR: Path = LIB_DIR / 'bin'
    BIN_DIR: Path = LIB_BIN_DIR

    CONFIG_FILENAME: str = 'ArchiveBox.conf'
    SQL_INDEX_FILENAME: str = 'index.sqlite3'
    QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
    # Config constants
    TIMEZONE: str = 'UTC'
    DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
    DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})

    CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
    DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
    QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME

    JSON_INDEX_FILENAME: str = 'index.json'
    HTML_INDEX_FILENAME: str = 'index.html'
    ROBOTS_TXT_FILENAME: str = 'robots.txt'
    FAVICON_FILENAME: str = 'favicon.ico'

    TIMEZONE: str = 'UTC'
    DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
    DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})

    ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
    ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE

    STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
        # 99.999% of the time, URLs ending in these extensions are static files
@@ -136,17 +133,6 @@ class ConstantsDict(Mapping):
        # html, htm, shtml, xhtml, xml, aspx, php, cgi
    ))

    INGORED_PATHS: frozenset[str] = frozenset((
        ".git",
        ".svn",
        ".DS_Store",
        ".gitignore",
        "lost+found",
        ".DS_Store",
        ".env",
        "Dockerfile",
        ".ArchiveBox.conf.bak",
    ))
    PIP_RELATED_NAMES: frozenset[str] = frozenset((
        ".venv",
        "venv",
@@ -160,7 +146,15 @@ class ConstantsDict(Mapping):
        "yarn.lock",
    ))

    DATA_DIR_NAMES: frozenset[str] = frozenset((
    # When initializing archivebox in a new directory, we check to make sure the dir is
    # actually empty so that we dont clobber someone's home directory or desktop by accident.
    # These files are exceptions to the is_empty check when we're trying to init a new dir,
    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
    ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
        *PIP_RELATED_NAMES,
        *NPM_RELATED_NAMES,

        ### Dirs:
        ARCHIVE_DIR_NAME,
        SOURCES_DIR_NAME,
        LOGS_DIR_NAME,
@@ -171,9 +165,12 @@ class ConstantsDict(Mapping):
        CUSTOM_TEMPLATES_DIR_NAME,
        USER_PLUGINS_DIR_NAME,
        CRONTABS_DIR_NAME,
    ))
    DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
    DATA_FILE_NAMES: frozenset[str] = frozenset((
        "static",  # created by old static exports <v0.6.0
        "sonic",  # created by docker bind mount / sonic FTS process
        ".git",
        ".svn",

        ### Files:
        CONFIG_FILENAME,
        SQL_INDEX_FILENAME,
        f"{SQL_INDEX_FILENAME}-wal",
@@ -188,43 +185,37 @@ class ConstantsDict(Mapping):
        FAVICON_FILENAME,
        CONFIG_FILENAME,
        f"{CONFIG_FILENAME}.bak",
        f".{CONFIG_FILENAME}.bak",
        "static_index.json",
    ))

    # When initializing archivebox in a new directory, we check to make sure the dir is
    # actually empty so that we dont clobber someone's home directory or desktop by accident.
    # These files are exceptions to the is_empty check when we're trying to init a new dir,
    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
    ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
        *INGORED_PATHS,
        *PIP_RELATED_NAMES,
        *NPM_RELATED_NAMES,
        *DATA_DIR_NAMES,
        *DATA_FILE_NAMES,
        "static",  # created by old static exports <v0.6.0
        "sonic",  # created by docker bind mount
        ".DS_Store",
        ".gitignore",
        "lost+found",
        ".DS_Store",
        ".env",
        ".collection_id",
        "Dockerfile",
    ))

    CODE_LOCATIONS = benedict({
        'PACKAGE_DIR': {
            'path': (PACKAGE_DIR).resolve(),
            'enabled': True,
            'is_valid': (PACKAGE_DIR / '__main__.py').exists(),
            'is_valid': (PACKAGE_DIR / '__main__.py').exists(),  # read + list
        },
        'TEMPLATES_DIR': {
            'path': TEMPLATES_DIR.resolve(),
            'enabled': True,
            'is_valid': STATIC_DIR.exists(),
            'is_valid': STATIC_DIR.exists() and os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK),  # read + list
        },
        'LIB_DIR': {
            'path': LIB_DIR.resolve(),
            'enabled': True,
            'is_valid': LIB_DIR.is_dir(),
            'is_valid': LIB_DIR.is_dir() and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.X_OK) and os.access(LIB_DIR, os.W_OK),  # read + write
        },
        'TMP_DIR': {
            'path': TMP_DIR.resolve(),
            'enabled': True,
            'is_valid': TMP_DIR.is_dir(),
            'is_valid': TMP_DIR.is_dir() and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.X_OK) and os.access(TMP_DIR, os.W_OK),  # read + write
        },
    })
@@ -232,61 +223,61 @@
        "DATA_DIR": {
            "path": DATA_DIR.resolve(),
            "enabled": True,
            "is_valid": DATABASE_FILE.exists(),
            "is_valid": DATABASE_FILE.exists() and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK) and os.access(DATA_DIR, os.X_OK),
            "is_mount": os.path.ismount(DATA_DIR.resolve()),
        },
        "CONFIG_FILE": {
            "path": CONFIG_FILE.resolve(),
            "enabled": True,
            "is_valid": CONFIG_FILE.exists(),
            "is_valid": CONFIG_FILE.exists() and os.access(CONFIG_FILE, os.W_OK),
        },
        "SQL_INDEX": {
            "path": DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": DATABASE_FILE.exists(),
            "is_valid": DATABASE_FILE.exists() and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
        },
        "QUEUE_DATABASE": {
            "path": QUEUE_DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": QUEUE_DATABASE_FILE.exists(),
            "is_valid": QUEUE_DATABASE_FILE.exists() and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
        },
        "ARCHIVE_DIR": {
            "path": ARCHIVE_DIR.resolve(),
            "enabled": True,
            "is_valid": ARCHIVE_DIR.exists(),
            "is_valid": ARCHIVE_DIR.exists() and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK) and os.access(ARCHIVE_DIR, os.X_OK),
            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
        },
        "SOURCES_DIR": {
            "path": SOURCES_DIR.resolve(),
            "enabled": True,
            "is_valid": SOURCES_DIR.exists(),
            "is_valid": SOURCES_DIR.exists() and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK) and os.access(SOURCES_DIR, os.X_OK),
        },
        "LOGS_DIR": {
            "path": LOGS_DIR.resolve(),
            "enabled": True,
            "is_valid": LOGS_DIR.is_dir(),
            "is_valid": LOGS_DIR.is_dir() and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK) and os.access(LOGS_DIR, os.X_OK),  # read + write
        },
        # "CACHE_DIR": {
        #     "path": CACHE_DIR.resolve(),
        #     "enabled": True,
        #     "is_valid": CACHE_DIR.is_dir(),
        #     "is_valid": CACHE_DIR.is_dir() and os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK) and os.access(CACHE_DIR, os.X_OK),  # read + write
        # },
        "PERSONAS_DIR": {
            "path": PERSONAS_DIR.resolve(),
            "enabled": PERSONAS_DIR.exists(),
            "is_valid": PERSONAS_DIR.is_dir(),
            "is_valid": PERSONAS_DIR.is_dir() and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK) and os.access(PERSONAS_DIR, os.X_OK),  # read + write
        },
        'CUSTOM_TEMPLATES_DIR': {
            'path': CUSTOM_TEMPLATES_DIR.resolve(),
            'enabled': CUSTOM_TEMPLATES_DIR.exists(),
            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir() and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK) and os.access(CUSTOM_TEMPLATES_DIR, os.X_OK),  # read
        },
        'USER_PLUGINS_DIR': {
            'path': USER_PLUGINS_DIR.resolve(),
            'enabled': USER_PLUGINS_DIR.exists(),
            'is_valid': USER_PLUGINS_DIR.is_dir(),
            'is_valid': USER_PLUGINS_DIR.is_dir() and os.access(USER_PLUGINS_DIR, os.R_OK) and os.access(USER_PLUGINS_DIR, os.X_OK),  # read
        },
    })
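The is_valid checks above all follow the same os.access() pattern; a small hedged helper (not in the codebase, just a restatement of the two variants used here):

import os
from pathlib import Path

def can_read_and_list(path: Path) -> bool:
    # "read + list" variant, as used for PACKAGE_DIR / TEMPLATES_DIR / CUSTOM_TEMPLATES_DIR above
    return path.is_dir() and os.access(path, os.R_OK) and os.access(path, os.X_OK)

def can_read_and_write(path: Path) -> bool:
    # "read + write" variant, as used for LIB_DIR / TMP_DIR / LOGS_DIR / PERSONAS_DIR above
    return can_read_and_list(path) and os.access(path, os.W_OK)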
@@ -314,5 +305,6 @@ globals().update(CONSTANTS)

# these need to always exist as we need them to run almost everything
# TODO: figure out a better time to make these than import-time
CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
@@ -22,41 +22,34 @@ Documentation:
__package__ = 'archivebox.config'

import os
import io
import re
import sys
import json
import shutil

from hashlib import md5
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from typing import Optional, Type, Tuple, Dict, Any
from subprocess import run, DEVNULL
from configparser import ConfigParser

from rich.progress import Progress
from rich.console import Console
from benedict import benedict
from pydantic_pkgr import SemVer

import django
from django.db.backends.sqlite3.base import Database as sqlite3


from .constants import CONSTANTS, TIMEZONE
from .constants import CONSTANTS
from .constants import *
from .config_stubs import (
    ConfigValue,
    ConfigDefaultValue,
    ConfigDefaultDict,
)

from ..misc.logging import (
    stderr,
    hint,  # noqa
)

from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG

@@ -67,7 +60,7 @@ LDAP = LDAP_CONFIG.LDAP_ENABLED

############################### Config Schema ##################################

CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
    'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),

    'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
@@ -194,7 +187,7 @@ def get_real_name(key: str) -> str:

# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},

@@ -209,12 +202,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {


def load_config_val(key: str,
                    default: ConfigDefaultValue=None,
                    default: Any=None,
                    type: Optional[Type]=None,
                    aliases: Optional[Tuple[str, ...]]=None,
                    config: Optional[benedict]=None,
                    env_vars: Optional[os._Environ]=None,
                    config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
                    config_file_vars: Optional[Dict[str, str]]=None) -> Any:
    """parse bool, int, and str key=value pairs from env"""

    assert isinstance(config, dict)

@@ -372,7 +365,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA


def load_config(defaults: ConfigDefaultDict,
def load_config(defaults: Dict[str, Any],
                config: Optional[benedict]=None,
                out_dir: Optional[str]=None,
                env_vars: Optional[os._Environ]=None,
@@ -505,7 +498,7 @@ def load_all_config():
# add all final config values in CONFIG to globals in this file
CONFIG: benedict = load_all_config()
globals().update(CONFIG)
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...


# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")

@@ -521,8 +514,8 @@ globals().update(CONFIG)

# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
assert TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {TIMEZONE})'  # noqa: F821
os.environ["TZ"] = TIMEZONE  # noqa: F821
assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})'  # noqa: F821
os.environ["TZ"] = CONSTANTS.TIMEZONE  # noqa: F821
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))  # noqa: F821

########################### Config Validity Checkers ###########################

@@ -533,7 +526,8 @@ if not SHELL_CONFIG.SHOW_PROGRESS:
    os.environ['TERM'] = 'dumb'

# recreate rich console obj based on new config values
CONSOLE = Console()
STDOUT = CONSOLE = Console()
STDERR = Console(stderr=True)
from ..misc import logging
logging.CONSOLE = CONSOLE

@@ -541,11 +535,11 @@ logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0

def bump_startup_progress_bar():
def bump_startup_progress_bar(advance=1):
    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK
    if INITIAL_STARTUP_PROGRESS:
        INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)  # type: ignore
        INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance)  # type: ignore


def setup_django_minimal():
@@ -559,6 +553,8 @@ DJANGO_SET_UP = False


def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
    from rich.panel import Panel

    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK
    global DJANGO_SET_UP

@@ -568,7 +564,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
        # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
        return

    with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
    with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
        INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)

        output_dir = out_dir or CONSTANTS.DATA_DIR

@@ -595,7 +591,14 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
        else:
            # Otherwise use default sqlite3 file-based database and initialize django
            # without running migrations automatically (user runs them manually by calling init)
            django.setup()
            try:
                django.setup()
            except Exception as e:
                bump_startup_progress_bar(advance=1000)
                STDERR.print()
                STDERR.print(Panel(f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', title='\n\n[red][X] Error while trying to load database!', subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', expand=False, style='bold red'))
                STDERR.print()
                return

        bump_startup_progress_bar()

@@ -608,6 +611,17 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
            f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")

        if check_db:
            # make sure the data dir is owned by a non-root user
            if CONSTANTS.DATA_DIR.stat().st_uid == 0:
                STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
                STDERR.print(f' {CONSTANTS.DATA_DIR}')
                STDERR.print()
                STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
                STDERR.print(' cd path/to/your/archive/data')
                STDERR.print(' archivebox [command]')
                STDERR.print()
                raise SystemExit(9)

            # Create cache table in DB if needed
            try:
                from django.core.cache import cache
152 archivebox/config/paths.py
@@ -0,0 +1,152 @@
__package__ = 'archivebox.config'

import os
import tempfile
import hashlib
from pathlib import Path

from functools import cache
from platformdirs import PlatformDirs

from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP

#############################################################################################

PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent  # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve()  # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive'  # archivebox snapshot data dir

#############################################################################################

@cache
def get_collection_id(DATA_DIR=DATA_DIR):
    """Get a short, stable, unique ID for the current collection"""
    collection_id_file = DATA_DIR / '.collection_id'

    try:
        return collection_id_file.read_text().strip()
    except (OSError, FileNotFoundError, PermissionError):
        pass

    hash_key = str(DATA_DIR.resolve()).encode()
    collection_id = hashlib.sha256(hash_key).hexdigest()[:8]
    try:
        collection_id_file.write_text(collection_id)
    except (OSError, FileNotFoundError, PermissionError):
        pass
    return collection_id
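So the collection ID is simply the first 8 hex characters of the SHA-256 of the resolved data dir path, cached in .collection_id. For example (the path below is illustrative, not from the diff):

import hashlib
from pathlib import Path

data_dir = Path('/data')  # example data dir
collection_id = hashlib.sha256(str(data_dir.resolve()).encode()).hexdigest()[:8]
print(collection_id)  # stable 8-char id for this data dir, same on every run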
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
    """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
    current_uid, current_gid = os.geteuid(), os.getegid()
    uid, gid = uid or current_uid, gid or current_gid

    test_file = dir_path / '.permissions_test'
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.exists()
            test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
            test_file.unlink()
            return True
    except (IOError, OSError, PermissionError):
        pass

    return False


@cache
def get_LIB_DIR():
    """
    - should be shared with other collections on the same host
    - must be scoped by CPU architecture, OS family, and archivebox version
    - should not be shared with other hosts/archivebox versions
    - must be writable by any archivebox user
    - should be persistent across reboots
    - can be on a docker bin mount but probably shouldnt be
    - ok to have a long path (doesnt contain SOCKETS)
    """
    from .version import detect_installed_version

    HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)

    if 'SYSTEM_LIB_DIR' in os.environ:
        lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
    else:
        with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
            lib_dir = HOST_DIRS.site_data_path

    # Docker: /usr/local/share/archivebox/0.8.5
    # Ubuntu: /usr/local/share/archivebox/0.8.5
    # macOS: /Library/Application Support/archivebox
    try:
        with SudoPermission(uid=0, fallback=True):
            lib_dir.mkdir(parents=True, exist_ok=True)
    except PermissionError:
        # our user cannot
        lib_dir = HOST_DIRS.user_data_path
        lib_dir.mkdir(parents=True, exist_ok=True)

    if not dir_is_writable(lib_dir):
        if IS_ROOT:
            # make sure lib dir is owned by the archivebox user, not root
            with SudoPermission(uid=0):
                os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
        else:
            raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')

    return lib_dir
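The per-version lib dir comes straight from platformdirs; a minimal sketch of the lookup (the version string below is just an example):

from platformdirs import PlatformDirs

dirs = PlatformDirs(appname='archivebox', appauthor='ArchiveBox',
                    version='0.8.5', opinion=True, ensure_exists=False)
print(dirs.site_data_path)  # e.g. /usr/local/share/archivebox/0.8.5 on Linux
print(dirs.user_data_path)  # per-user fallback used when the site dir isn't writable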
@cache
def get_TMP_DIR():
    """
    - must NOT be inside DATA_DIR / inside a docker volume bind mount
    - must NOT have a long PATH (UNIX socket path length restrictions)
    - must NOT be shared with other collections/hosts
    - must be writable by archivebox user & root
    - must be cleared on every boot / not persisted
    - must be cleared on every archivebox version upgrade
    """
    from .version import detect_installed_version

    HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)

    # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
    # print('RUNNING AS:', self.PUID, self.PGID)

    if 'SYSTEM_TMP_DIR' in os.environ:
        run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
        with SudoPermission(uid=0, fallback=True):
            run_dir.mkdir(parents=True, exist_ok=True)
        if not dir_is_writable(run_dir):
            if IS_ROOT:
                with SudoPermission(uid=0, fallback=False):
                    os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
            else:
                raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
        assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
        return run_dir

    run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
    try:
        assert len(str(run_dir)) + len('/supervisord.sock') < 95
    except AssertionError:
        run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
        assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'

    with SudoPermission(uid=0, fallback=True):
        run_dir.mkdir(parents=True, exist_ok=True)

    if not dir_is_writable(run_dir):
        if IS_ROOT:
            with SudoPermission(uid=0):
                os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
        else:
            raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')

    # Docker: /tmp/archivebox/0.8.5/abc324235
    # Ubuntu: /tmp/archivebox/0.8.5/abc324235
    # macOS: /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/0.8.5/abc324235
    return run_dir
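The < 95 assertion above leaves headroom under the roughly 108-character AF_UNIX socket path limit once the socket filename is appended. A quick way to sanity-check a candidate SYSTEM_TMP_DIR before setting it (both paths below are illustrative; the collection-id subdirectory follows the run_dir layout above):

from pathlib import Path

candidate = Path('/tmp/archivebox')  # example value you might set for SYSTEM_TMP_DIR
socket_path = candidate / 'abc12345' / 'supervisord.sock'  # 'abc12345' stands in for a collection id
assert len(str(socket_path)) < 108, 'socket path too long, pick a shorter SYSTEM_TMP_DIR'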
70 archivebox/config/permissions.py
@@ -0,0 +1,70 @@
__package__ = 'archivebox.config'

import os
from pathlib import Path
from contextlib import contextmanager

#############################################################################################

DATA_DIR = Path(os.getcwd())

DATA_DIR_STAT = Path(DATA_DIR).stat()
DATA_DIR_UID = DATA_DIR_STAT.st_uid
DATA_DIR_GID = DATA_DIR_STAT.st_gid
DEFAULT_PUID = 911
DEFAULT_PGID = 911
RUNNING_AS_UID = os.getuid()
RUNNING_AS_GID = os.getgid()
EUID = os.geteuid()
EGID = os.getegid()
USER: str = Path('~').expanduser().resolve().name

IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')

os.environ.setdefault('PUID', str(DATA_DIR_UID or RUNNING_AS_UID or DEFAULT_PUID))
os.environ.setdefault('PGID', str(DATA_DIR_GID or RUNNING_AS_GID or DEFAULT_PGID))

ARCHIVEBOX_USER = int(os.environ['PUID'])
ARCHIVEBOX_GROUP = int(os.environ['PGID'])

#############################################################################################

def drop_privileges():
    """If running as root, drop privileges to the user that owns the data dir (or PUID, or default=911)"""

    # always run archivebox as the user that owns the data dir, never as root
    if os.getuid() == 0:
        # drop permissions to the user that owns the data dir / provided PUID
        if os.geteuid() != ARCHIVEBOX_USER:
            os.seteuid(ARCHIVEBOX_USER)
    # if we need sudo (e.g. for installing dependencies) code should use SudoPermissions() context manager to regain root


@contextmanager
def SudoPermission(uid=0, fallback=False):
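The file is cut off here, but the intended usage pattern is already visible in paths.py above: drop_privileges() runs once at startup, and code that genuinely needs root temporarily re-escalates inside the context manager. A hedged sketch of that call pattern (SudoPermission's own body is truncated in this diff, and the path below is a placeholder):

import os
from pathlib import Path
from archivebox.config.permissions import SudoPermission, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP

some_dir = Path('/usr/local/share/archivebox')  # placeholder path, not from the diff

# temporarily act as root (falling back to the current user if sudo isn't available)
# to create the dir, then hand ownership to the archivebox user, as get_LIB_DIR()/get_TMP_DIR() do
with SudoPermission(uid=0, fallback=True):
    some_dir.mkdir(parents=True, exist_ok=True)
    os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{some_dir}"')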