improve config loading of TMP_DIR and LIB_DIR, move them to separate files

Nick Sweeting 2024-10-07 23:45:11 -07:00
parent 7a895d9285
commit cf1ea8f80f
49 changed files with 767 additions and 527 deletions


@@ -287,22 +287,12 @@ WORKDIR "$DATA_DIR"
RUN openssl rand -hex 16 > /etc/machine-id \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
ENV IN_DOCKER=True \
SYSTEM_LIB_DIR=/app/lib \
SYSTEM_TMP_DIR=/tmp \
SYSTEM_LIB_DIR=/usr/share/archivebox \
SYSTEM_TMP_DIR=/tmp/archivebox \
GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
# USE_SINGLEFILE=True \
# SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
# USE_READABILITY=True \
# READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
# USE_MERCURY=True \
# MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
# Print version for nice docker finish summary
RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \


@@ -13,7 +13,7 @@ __package__ = 'archivebox'
import os
import sys
import tempfile
from pathlib import Path
ASCII_LOGO = """
@@ -25,37 +25,36 @@ ASCII_LOGO = """
"""
SYSTEM_TMP_DIR = Path(tempfile.gettempdir()) / 'archivebox'
SYSTEM_TMP_DIR.mkdir(parents=True, exist_ok=True)
os.environ['SYSTEM_TMP_DIR'] = str(SYSTEM_TMP_DIR)
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# detect ArchiveBox user's UID/GID based on data dir ownership
from archivebox.config.permissions import drop_privileges # noqa
drop_privileges()
# if we are outside a data dir, cd into an ephemeral tmp dir so that
# we can run version/help without polluting cwd with an index.sqlite3
if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
current_dir = Path(os.getcwd()).resolve()
if not (current_dir / 'index.sqlite3').exists():
os.chdir(SYSTEM_TMP_DIR)
from archivebox.misc.checks import check_not_root, check_io_encoding # noqa
check_not_root()
check_io_encoding()
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them through INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# print('INSTALLING MONKEY PATCHES')
from .monkey_patches import * # noqa
from archivebox.monkey_patches import * # noqa
# print('DONE INSTALLING MONKEY PATCHES')
# print('LOADING VENDORED LIBRARIES')
from .vendor import load_vendored_libs # noqa
from archivebox.vendor import load_vendored_libs # noqa
load_vendored_libs()
# print('DONE LOADING VENDORED LIBRARIES')
from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa
from archivebox.config.constants import CONSTANTS # noqa
from archivebox.config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from archivebox.config.version import VERSION # noqa
__version__ = VERSION
__author__ = 'Nick Sweeting'


@@ -12,12 +12,13 @@ from ninja import NinjaAPI, Swagger
# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
from archivebox.config import SHELL_CONFIG, VERSION
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from api.auth import API_AUTH_METHODS
COMMIT_HASH = SHELL_CONFIG.COMMIT_HASH or 'unknown'
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>


@@ -13,7 +13,7 @@ from ..main import (
schedule,
)
from archivebox.misc.util import ansi_to_html
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS


@@ -1,6 +1,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
import os
import sys
import argparse
import threading
@@ -25,6 +26,10 @@ if len(sys.argv) > 1 and sys.argv[1] == 'setup':
print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
sys.argv[1] = 'install'
if '--debug' in sys.argv:
os.environ['DEBUG'] = 'True'
sys.argv.remove('--debug')
# def list_subcommands() -> Dict[str, str]:
# """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
@@ -50,8 +55,8 @@ SUBCOMMAND_MODULES = {
'init': 'archivebox_init',
'install': 'archivebox_install',
##############################################
'config': 'archivebox_config',
'add': 'archivebox_add',
'remove': 'archivebox_remove',
'update': 'archivebox_update',
@@ -63,7 +68,7 @@ SUBCOMMAND_MODULES = {
'shell': 'archivebox_shell',
'manage': 'archivebox_manage',
'oneshot': 'archivebox_oneshot',
# 'oneshot': 'archivebox_oneshot',
}
# every imported command module must have these properties in order to be valid
@@ -102,11 +107,11 @@ CLI_SUBCOMMANDS = LazySubcommands()
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version') # dont require valid data folder at all
main_cmds = ('init', 'config', 'setup', 'install') # dont require existing db present
archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present
setup_cmds = ('init', 'setup', 'install') # require valid data folder, but dont require DB present in it yet
archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage') # require valid data folder + existing db present
fake_db = ("oneshot",) # use fake in-memory db
display_first = (*meta_cmds, *main_cmds, *archive_cmds)
display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting
@@ -157,14 +162,16 @@ def run_subcommand(subcommand: str,
from archivebox.config.legacy import setup_django
# print('DATA_DIR is', DATA_DIR)
# print('pwd is', os.getcwd())
# print('pwd is', os.getcwd())
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
check_db = cmd_requires_db and not init_pending
if subcommand not in meta_cmds:
setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
if subcommand in archive_cmds:
if cmd_requires_db:
check_migrations()
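# Illustration (not part of the diff): the decision table implied by the new
# gating above, using the command groups defined earlier in this file.
# e.g. `archivebox version`    -> no Django setup at all
#      `archivebox init`       -> Django setup, but no DB check required yet
#      `archivebox add --init` -> Django setup, DB check deferred until init runs
def _subcommand_requirements(subcommand: str, subcommand_args: list) -> dict:
    cmd_requires_db = subcommand in archive_cmds
    init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
    return {
        'setup_django': subcommand not in meta_cmds,
        'check_db': cmd_requires_db and not init_pending,
        'check_migrations': cmd_requires_db,
    }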


@@ -9,7 +9,8 @@ import argparse
from typing import List, Optional, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, ARCHIVING_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import ARCHIVING_CONFIG
from ..main import add
from ..parsers import PARSERS


@@ -9,7 +9,8 @@ from pathlib import Path
from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, SERVER_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from ..logging_util import SmartFormatter, reject_stdin
from ..main import server


@@ -1,27 +1,9 @@
__package__ = 'archivebox.config'
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
from .defaults import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
from .paths import (
PACKAGE_DIR, # noqa
DATA_DIR, # noqa
ARCHIVE_DIR, # noqa
)
__all__ = [
'CONSTANTS',
'PACKAGE_DIR',
'DATA_DIR',
'ARCHIVE_DIR',
'VERSION',
'SHELL_CONFIG',
'STORAGE_CONFIG',
'GENERAL_CONFIG',
'SERVER_CONFIG',
'ARCHIVING_CONFIG',
'SEARCH_BACKEND_CONFIG',
'CONSTANTS_CONFIG',
]
from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa
from .version import VERSION # noqa


@@ -8,7 +8,7 @@ from abx.archivebox.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .defaults import (
from .common import (
ShellConfig, # noqa: F401
StorageConfig, # noqa: F401
GeneralConfig, # noqa: F401


@@ -1,47 +0,0 @@
# def get_versions_available_on_github(config):
# """
# returns a dictionary containing the ArchiveBox GitHub release info for
# the recommended upgrade version and the currently installed version
# """
# # we only want to perform the (relatively expensive) check for new versions
# # when its most relevant, e.g. when the user runs a long-running command
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
# if subcommand_run_by_user not in long_running_commands:
# return None
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
# response = requests.get(github_releases_api)
# if response.status_code != 200:
# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
# return None
# all_releases = response.json()
# installed_version = parse_version_string(config['VERSION'])
# # find current version or nearest older version (to link to)
# current_version = None
# for idx, release in enumerate(all_releases):
# release_version = parse_version_string(release['tag_name'])
# if release_version <= installed_version:
# current_version = release
# break
# current_version = current_version or all_releases[-1]
# # recommended version is whatever comes after current_version in the release list
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
# try:
# recommended_version = all_releases[idx+1]
# except IndexError:
# recommended_version = None
# return {'recommended_version': recommended_version, 'current_version': current_version}
# def can_upgrade(config):
# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
# return recommended_version > current_version
# return False


@@ -1,21 +1,21 @@
__package__ = 'archivebox.config'
import os
import sys
import shutil
from typing import Dict, Optional
from datetime import datetime
from pathlib import Path
from rich import print
from pydantic import Field, field_validator, model_validator, computed_field
from pydantic import Field, field_validator, computed_field
from django.utils.crypto import get_random_string
from abx.archivebox.base_configset import BaseConfigSet
from .constants import CONSTANTS, PACKAGE_DIR
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME
from .permissions import IN_DOCKER
###################### Config ##########################
@@ -27,14 +27,8 @@ class ShellConfig(BaseConfigSet):
USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
IN_DOCKER: bool = Field(default=False)
IN_DOCKER: bool = Field(default=IN_DOCKER)
IN_QEMU: bool = Field(default=False)
USER: str = Field(default=Path('~').expanduser().resolve().name)
PUID: int = Field(default=os.getuid())
PGID: int = Field(default=os.getgid())
PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
@@ -52,63 +46,12 @@ class ShellConfig(BaseConfigSet):
@computed_field
@property
def COMMIT_HASH(self) -> Optional[str]:
try:
git_dir = PACKAGE_DIR / '../.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
pass
return None
return get_COMMIT_HASH()
@computed_field
@property
def BUILD_TIME(self) -> str:
if self.IN_DOCKER:
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
@model_validator(mode='after')
def validate_not_running_as_root(self):
attempted_command = ' '.join(sys.argv[:3])
if self.PUID == 0 and attempted_command not in ('setup', 'install'):
# stderr('[!] ArchiveBox should never be run as root!', color='red')
# stderr(' For more information, see the security overview documentation:')
# stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
print(' For more information, see the security overview documentation:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
if self.IN_DOCKER:
print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
print(' or:', file=sys.stderr)
print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
raise SystemExit(2)
# check python locale
if self.PYTHON_ENCODING != 'UTF-8':
print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {self.PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
raise SystemExit(2)
return self
return get_BUILD_TIME()
SHELL_CONFIG = ShellConfig()
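# Note (illustration, not part of the diff): the deleted logic above now lives in
# archivebox/config/version.py, which is not shown in this excerpt. Presumably
# get_COMMIT_HASH() mirrors the removed property, roughly:
from pathlib import Path
from typing import Optional
from .paths import PACKAGE_DIR

def get_COMMIT_HASH() -> Optional[str]:
    git_dir = PACKAGE_DIR.parent / '.git'
    try:
        # resolve HEAD -> refs/heads/<branch> -> full commit sha
        ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
        return (git_dir / ref).read_text().strip()
    except Exception:
        pass
    try:
        # fallback: first branch ref found on disk
        return next((git_dir / 'refs/heads').glob('*')).read_text().strip()
    except Exception:
        return None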


@@ -1,115 +0,0 @@
from pathlib import Path
from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any, List
from mypy_extensions import TypedDict
from benedict import benedict
SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
class BaseConfig(TypedDict):
pass
class ConfigDict(BaseConfig, benedict, total=False):
"""
# Regenerate by pasting this quine into `archivebox shell` 🥚
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
print('class ConfigDict(BaseConfig, total=False):')
print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
for section, configs in CONFIG_DEFAULTS.items():
for key, attrs in configs.items():
Type, default = attrs['type'], attrs['default']
if default is None:
print(f' {key}: Optional[{Type.__name__}]')
else:
print(f' {key}: {Type.__name__}')
print()
"""
IS_TTY: bool
USE_COLOR: bool
SHOW_PROGRESS: bool
IN_DOCKER: bool
PACKAGE_DIR: Path
CONFIG_FILE: Path
ONLY_NEW: bool
TIMEOUT: int
MEDIA_TIMEOUT: int
OUTPUT_PERMISSIONS: str
RESTRICT_FILE_NAMES: str
URL_DENYLIST: str
SECRET_KEY: Optional[str]
BIND_ADDR: str
ALLOWED_HOSTS: str
DEBUG: bool
PUBLIC_INDEX: bool
PUBLIC_SNAPSHOTS: bool
FOOTER_INFO: str
SAVE_TITLE: bool
SAVE_FAVICON: bool
SAVE_WGET: bool
SAVE_WGET_REQUISITES: bool
SAVE_SINGLEFILE: bool
SAVE_READABILITY: bool
SAVE_MERCURY: bool
SAVE_PDF: bool
SAVE_SCREENSHOT: bool
SAVE_DOM: bool
SAVE_WARC: bool
SAVE_GIT: bool
SAVE_MEDIA: bool
SAVE_ARCHIVE_DOT_ORG: bool
RESOLUTION: str
GIT_DOMAINS: str
CHECK_SSL_VALIDITY: bool
CURL_USER_AGENT: str
WGET_USER_AGENT: str
CHROME_USER_AGENT: str
COOKIES_FILE: Union[str, Path, None]
CHROME_USER_DATA_DIR: Union[str, Path, None]
CHROME_TIMEOUT: int
CHROME_HEADLESS: bool
CHROME_SANDBOX: bool
USE_CURL: bool
USE_WGET: bool
USE_SINGLEFILE: bool
USE_READABILITY: bool
USE_MERCURY: bool
USE_GIT: bool
USE_CHROME: bool
USE_YOUTUBEDL: bool
CURL_BINARY: str
GIT_BINARY: str
WGET_BINARY: str
SINGLEFILE_BINARY: str
READABILITY_BINARY: str
MERCURY_BINARY: str
YOUTUBEDL_BINARY: str
CHROME_BINARY: Optional[str]
YOUTUBEDL_ARGS: List[str]
WGET_ARGS: List[str]
CURL_ARGS: List[str]
GIT_ARGS: List[str]
TAG_SEPARATOR_PATTERN: str
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]
ConfigDefault = TypedDict('ConfigDefault', {
'default': ConfigDefaultValue,
'type': Optional[Type],
'aliases': Optional[Tuple[str, ...]],
}, total=False)
ConfigDefaultDict = Dict[str, ConfigDefault]


@@ -1,118 +1,115 @@
__package__ = 'archivebox.config'
import os
import re
import platform
import tempfile
from typing import Dict
from pathlib import Path
import importlib.metadata
from collections.abc import Mapping
from benedict import benedict
from ..misc.logging import DEFAULT_CLI_COLORS
from .paths import (
PACKAGE_DIR,
DATA_DIR,
ARCHIVE_DIR,
get_collection_id,
get_LIB_DIR,
get_TMP_DIR,
)
from .permissions import (
IS_ROOT,
IN_DOCKER,
RUNNING_AS_UID,
RUNNING_AS_GID,
DEFAULT_PUID,
DEFAULT_PGID,
ARCHIVEBOX_USER,
ARCHIVEBOX_GROUP,
)
from .version import detect_installed_version
###################### Config ##########################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
def _detect_installed_version(PACKAGE_DIR: Path):
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
try:
# if in production install, use pip-installed package metadata
return importlib.metadata.version(__package__ or 'archivebox').strip()
except importlib.metadata.PackageNotFoundError:
pass
try:
# if in dev Git repo dir, use pyproject.toml file
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
for line in pyproject_config:
if line.startswith('version = '):
return line.split(' = ', 1)[-1].strip('"').strip()
except FileNotFoundError:
# building docs, pyproject.toml is not available
pass
# raise Exception('Failed to detect installed archivebox version!')
return 'dev'
VERSION: str = _detect_installed_version(PACKAGE_DIR)
class ConstantsDict(Mapping):
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
OS = platform.system().lower() # darwin, linux, etc.
ARCH = platform.machine().lower() # arm64, x86_64, etc.
LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')
PACKAGE_DIR: Path = PACKAGE_DIR # archivebox source code dir
DATA_DIR: Path = DATA_DIR # archivebox user data dir
ARCHIVE_DIR: Path = ARCHIVE_DIR # archivebox snapshot data dir
VERSION: str = VERSION
PACKAGE_DIR: Path = PACKAGE_DIR
DATA_DIR: Path = DATA_DIR
ARCHIVE_DIR: Path = ARCHIVE_DIR
COLLECTION_ID: str = get_collection_id(DATA_DIR)
# Host system
VERSION: str = detect_installed_version(PACKAGE_DIR)
OS: str = platform.system().lower() # darwin, linux, etc.
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
IN_DOCKER: bool = IN_DOCKER
# Permissions
IS_ROOT: bool = IS_ROOT
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
RUNNING_AS_UID: int = RUNNING_AS_UID
RUNNING_AS_GID: int = RUNNING_AS_GID
DEFAULT_PUID: int = DEFAULT_PUID
DEFAULT_PGID: int = DEFAULT_PGID
# Source code dirs
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
TEMPLATES_DIR_NAME: str = 'templates'
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
STATIC_DIR: Path = TEMPLATES_DIR / 'static'
STATIC_DIR_NAME: str = 'static'
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
# Data dirs
ARCHIVE_DIR_NAME: str = 'archive'
SOURCES_DIR_NAME: str = 'sources'
PERSONAS_DIR_NAME: str = 'personas'
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
ARCHIVE_DIR_NAME: str = 'archive'
SOURCES_DIR_NAME: str = 'sources'
PERSONAS_DIR_NAME: str = 'personas'
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
LIB_DIR_NAME: str = 'lib'
TMP_DIR_NAME: str = 'tmp'
SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
# DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16] # can't be used because socket path length restrictions break too often if the data dir is in a deep subdir: socket.error reports AF_UNIX path too long
SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR # e.g. /app/lib or ./data/lib/arm64-darwin-docker
TMP_DIR: Path = SYSTEM_TMP_DIR
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
# Data dir files
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
JSON_INDEX_FILENAME: str = 'index.json'
HTML_INDEX_FILENAME: str = 'index.html'
ROBOTS_TXT_FILENAME: str = 'robots.txt'
FAVICON_FILENAME: str = 'favicon.ico'
# Runtime dirs
TMP_DIR_NAME: str = 'tmp'
TMP_DIR: Path = get_TMP_DIR()
LIB_DIR_NAME: str = 'lib'
LIB_DIR: Path = get_LIB_DIR()
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
BIN_DIR: Path = LIB_BIN_DIR
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
# Config constants
TIMEZONE: str = 'UTC'
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
JSON_INDEX_FILENAME: str = 'index.json'
HTML_INDEX_FILENAME: str = 'index.html'
ROBOTS_TXT_FILENAME: str = 'robots.txt'
FAVICON_FILENAME: str = 'favicon.ico'
TIMEZONE: str = 'UTC'
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
# 99.999% of the time, URLs ending in these extensions are static files
@@ -136,17 +133,6 @@ class ConstantsDict(Mapping):
# html, htm, shtml, xhtml, xml, aspx, php, cgi
))
INGORED_PATHS: frozenset[str] = frozenset((
".git",
".svn",
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
"Dockerfile",
".ArchiveBox.conf.bak",
))
PIP_RELATED_NAMES: frozenset[str] = frozenset((
".venv",
"venv",
@@ -160,7 +146,15 @@ class ConstantsDict(Mapping):
"yarn.lock",
))
DATA_DIR_NAMES: frozenset[str] = frozenset((
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
### Dirs:
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@@ -171,9 +165,12 @@ class ConstantsDict(Mapping):
CUSTOM_TEMPLATES_DIR_NAME,
USER_PLUGINS_DIR_NAME,
CRONTABS_DIR_NAME,
))
DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
DATA_FILE_NAMES: frozenset[str] = frozenset((
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount / sonic FTS process
".git",
".svn",
### Files:
CONFIG_FILENAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
@@ -188,43 +185,37 @@ class ConstantsDict(Mapping):
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
f".{CONFIG_FILENAME}.bak",
"static_index.json",
))
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*INGORED_PATHS,
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
*DATA_DIR_NAMES,
*DATA_FILE_NAMES,
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
".collection_id",
"Dockerfile",
))
CODE_LOCATIONS = benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': (PACKAGE_DIR / '__main__.py').exists(),
'is_valid': (PACKAGE_DIR / '__main__.py').exists(), # read + list
},
'TEMPLATES_DIR': {
'path': TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': STATIC_DIR.exists(),
'is_valid': STATIC_DIR.exists() and os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK), # read + list
},
'LIB_DIR': {
'path': LIB_DIR.resolve(),
'enabled': True,
'is_valid': LIB_DIR.is_dir(),
'is_valid': LIB_DIR.is_dir() and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.X_OK) and os.access(LIB_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': TMP_DIR.resolve(),
'enabled': True,
'is_valid': TMP_DIR.is_dir(),
'is_valid': TMP_DIR.is_dir() and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.X_OK) and os.access(TMP_DIR, os.W_OK), # read + write
},
})
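# Illustration (not part of the diff): each is_valid check above repeats the same
# os.access() pattern; a hypothetical helper expressing it:
import os
from pathlib import Path

def can_use_dir(path: Path, write: bool = False) -> bool:
    """True if path is a dir that is readable + listable (R_OK|X_OK), and writable if requested."""
    mode = os.R_OK | os.X_OK | (os.W_OK if write else 0)
    return path.is_dir() and os.access(path, mode)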
@@ -232,61 +223,61 @@ class ConstantsDict(Mapping):
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),
"is_valid": DATABASE_FILE.exists() and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK) and os.access(DATA_DIR, os.X_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": CONFIG_FILE.exists(),
"is_valid": CONFIG_FILE.exists() and os.access(CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),
"is_valid": DATABASE_FILE.exists() and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": QUEUE_DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": QUEUE_DATABASE_FILE.exists(),
"is_valid": QUEUE_DATABASE_FILE.exists() and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": ARCHIVE_DIR.exists(),
"is_valid": ARCHIVE_DIR.exists() and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK) and os.access(ARCHIVE_DIR, os.X_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": SOURCES_DIR.exists(),
"is_valid": SOURCES_DIR.exists() and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK) and os.access(SOURCES_DIR, os.X_OK),
},
"LOGS_DIR": {
"path": LOGS_DIR.resolve(),
"enabled": True,
"is_valid": LOGS_DIR.is_dir(),
"is_valid": LOGS_DIR.is_dir() and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK) and os.access(LOGS_DIR, os.X_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": CACHE_DIR.is_dir(),
# "is_valid": CACHE_DIR.is_dir() and os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK) and os.access(CACHE_DIR, os.X_OK), # read + write
# },
"PERSONAS_DIR": {
"path": PERSONAS_DIR.resolve(),
"enabled": PERSONAS_DIR.exists(),
"is_valid": PERSONAS_DIR.is_dir(),
"is_valid": PERSONAS_DIR.is_dir() and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK) and os.access(PERSONAS_DIR, os.X_OK), # read + write
},
'CUSTOM_TEMPLATES_DIR': {
'path': CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': CUSTOM_TEMPLATES_DIR.exists(),
'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
'is_valid': CUSTOM_TEMPLATES_DIR.is_dir() and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK) and os.access(CUSTOM_TEMPLATES_DIR, os.X_OK), # read
},
'USER_PLUGINS_DIR': {
'path': USER_PLUGINS_DIR.resolve(),
'enabled': USER_PLUGINS_DIR.exists(),
'is_valid': USER_PLUGINS_DIR.is_dir(),
'is_valid': USER_PLUGINS_DIR.is_dir() and os.access(USER_PLUGINS_DIR, os.R_OK) and os.access(USER_PLUGINS_DIR, os.X_OK), # read
},
})
@@ -314,5 +305,6 @@ globals().update(CONSTANTS)
# these need to always exist as we need them to run almost everything
# TODO: figure out a better time to make these than import-time
CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)


@@ -22,41 +22,34 @@ Documentation:
__package__ = 'archivebox.config'
import os
import io
import re
import sys
import json
import shutil
from hashlib import md5
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from typing import Optional, Type, Tuple, Dict, Any
from subprocess import run, DEVNULL
from configparser import ConfigParser
from rich.progress import Progress
from rich.console import Console
from benedict import benedict
from pydantic_pkgr import SemVer
import django
from django.db.backends.sqlite3.base import Database as sqlite3
from .constants import CONSTANTS, TIMEZONE
from .constants import CONSTANTS
from .constants import *
from .config_stubs import (
ConfigValue,
ConfigDefaultValue,
ConfigDefaultDict,
)
from ..misc.logging import (
stderr,
hint, # noqa
)
from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
@@ -67,7 +60,7 @@ LDAP = LDAP_CONFIG.LDAP_ENABLED
############################### Config Schema ##################################
CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),
'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
@@ -194,7 +187,7 @@ def get_real_name(key: str) -> str:
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
@@ -209,12 +202,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
def load_config_val(key: str,
default: ConfigDefaultValue=None,
default: Any=None,
type: Optional[Type]=None,
aliases: Optional[Tuple[str, ...]]=None,
config: Optional[benedict]=None,
env_vars: Optional[os._Environ]=None,
config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
config_file_vars: Optional[Dict[str, str]]=None) -> Any:
"""parse bool, int, and str key=value pairs from env"""
assert isinstance(config, dict)
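# Illustration (not part of the diff): the kind of coercion load_config_val
# performs on raw env/config-file strings, e.g. `TIMEOUT=60 SAVE_WGET=false archivebox add ...`
# (simplified sketch, not the exact implementation):
def _coerce(value: str, to_type: type):
    if to_type is bool:
        if value.lower() not in ('true', 'false', '1', '0', 'yes', 'no'):
            raise ValueError(f'Invalid boolean: {value!r}')
        return value.lower() in ('true', '1', 'yes')
    return to_type(value)   # e.g. int('60') -> 60, str passes through

assert _coerce('60', int) == 60
assert _coerce('false', bool) is False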
@@ -372,7 +365,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA
def load_config(defaults: ConfigDefaultDict,
def load_config(defaults: Dict[str, Any],
config: Optional[benedict]=None,
out_dir: Optional[str]=None,
env_vars: Optional[os._Environ]=None,
@@ -505,7 +498,7 @@ def load_all_config():
# add all final config values in CONFIG to globals in this file
CONFIG: benedict = load_all_config()
globals().update(CONFIG)
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...
# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
@@ -521,8 +514,8 @@ globals().update(CONFIG)
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
assert TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {TIMEZONE})' # noqa: F821
os.environ["TZ"] = TIMEZONE # noqa: F821
assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})' # noqa: F821
os.environ["TZ"] = CONSTANTS.TIMEZONE # noqa: F821
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
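# Worked example (illustration, not part of the diff): the umask is the octal
# complement of the configured permissions, so newly created dirs come out with
# the desired mode, e.g. DIR_OUTPUT_PERMISSIONS='755' -> umask 0o022:
assert 0o777 - int('755', base=8) == 0o022
assert 0o777 & ~0o022 == 0o755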
########################### Config Validity Checkers ###########################
@@ -533,7 +526,8 @@ if not SHELL_CONFIG.SHOW_PROGRESS:
os.environ['TERM'] = 'dumb'
# recreate rich console obj based on new config values
CONSOLE = Console()
STDOUT = CONSOLE = Console()
STDERR = Console(stderr=True)
from ..misc import logging
logging.CONSOLE = CONSOLE
@@ -541,11 +535,11 @@ logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
def bump_startup_progress_bar():
def bump_startup_progress_bar(advance=1):
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
if INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore
def setup_django_minimal():
@@ -559,6 +553,8 @@ DJANGO_SET_UP = False
def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
from rich.panel import Panel
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
global DJANGO_SET_UP
@@ -568,7 +564,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
output_dir = out_dir or CONSTANTS.DATA_DIR
@@ -595,7 +591,14 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
django.setup()
try:
django.setup()
except Exception as e:
bump_startup_progress_bar(advance=1000)
STDERR.print()
STDERR.print(Panel(f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', title='\n\n[red][X] Error while trying to load database!', subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', expand=False, style='bold red'))
STDERR.print()
return
bump_startup_progress_bar()
@@ -608,6 +611,17 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print()
raise SystemExit(9)
# Create cache table in DB if needed
try:
from django.core.cache import cache

archivebox/config/paths.py (new file, 152 lines)

@@ -0,0 +1,152 @@
__package__ = 'archivebox.config'
import os
import tempfile
import hashlib
from pathlib import Path
from functools import cache
from platformdirs import PlatformDirs
from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
#############################################################################################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
#############################################################################################
@cache
def get_collection_id(DATA_DIR=DATA_DIR):
"""Get a short, stable, unique ID for the current collection"""
collection_id_file = DATA_DIR / '.collection_id'
try:
return collection_id_file.read_text().strip()
except (OSError, FileNotFoundError, PermissionError):
pass
hash_key = str(DATA_DIR.resolve()).encode()
collection_id = hashlib.sha256(hash_key).hexdigest()[:8]
try:
collection_id_file.write_text(collection_id)
except (OSError, FileNotFoundError, PermissionError):
pass
return collection_id
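# Worked example (illustration, not part of this file): the ID is just the first
# 8 hex chars of the sha256 of the resolved data dir path -- stable across runs,
# unique per collection location, and cached in .collection_id when writable.
_example_dir = str(Path('/home/user/archivebox-data'))   # hypothetical data dir
_example_id = hashlib.sha256(_example_dir.encode()).hexdigest()[:8]  # e.g. '1a2b3c4d'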
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
"""Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
current_uid, current_gid = os.geteuid(), os.getegid()
uid, gid = uid or current_uid, gid or current_gid
test_file = dir_path / '.permissions_test'
try:
with SudoPermission(uid=uid, fallback=fallback):
test_file.exists()
test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
test_file.unlink()
return True
except (IOError, OSError, PermissionError):
pass
return False
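# Usage sketch (illustration, not part of this file): verify the data dir is
# writable by the detected archivebox user before doing any real work:
def _example_check_data_dir() -> None:
    if not dir_is_writable(DATA_DIR, uid=ARCHIVEBOX_USER, gid=ARCHIVEBOX_GROUP):
        raise PermissionError(f'{DATA_DIR} is not writable by {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')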
@cache
def get_LIB_DIR():
"""
- should be shared with other collections on the same host
- must be scoped by CPU architecture, OS family, and archivebox version
- should not be shared with other hosts/archivebox versions
- must be writable by any archivebox user
- should be persistent across reboots
- can be on a docker bind mount but probably shouldn't be
- ok to have a long path (doesnt contain SOCKETS)
"""
from .version import detect_installed_version
HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
if 'SYSTEM_LIB_DIR' in os.environ:
lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
else:
with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
lib_dir = HOST_DIRS.site_data_path
# Docker: /usr/local/share/archivebox/0.8.5
# Ubuntu: /usr/local/share/archivebox/0.8.5
# macOS: /Library/Application Support/archivebox
try:
with SudoPermission(uid=0, fallback=True):
lib_dir.mkdir(parents=True, exist_ok=True)
except PermissionError:
# our user cannot write to the system-wide dir, fall back to a per-user lib dir
lib_dir = HOST_DIRS.user_data_path
lib_dir.mkdir(parents=True, exist_ok=True)
if not dir_is_writable(lib_dir):
if IS_ROOT:
# make sure lib dir is owned by the archivebox user, not root
with SudoPermission(uid=0):
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
else:
raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
return lib_dir
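# Usage sketch (illustration, not part of this file): SYSTEM_LIB_DIR pins the lib
# dir explicitly, matching the new Dockerfile default SYSTEM_LIB_DIR=/usr/share/archivebox:
def _example_pinned_lib_dir() -> Path:
    os.environ['SYSTEM_LIB_DIR'] = '/usr/share/archivebox'
    get_LIB_DIR.cache_clear()    # @cache memoizes the result, so clear it when overriding
    return get_LIB_DIR()         # -> Path('/usr/share/archivebox'), ownership verified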
@cache
def get_TMP_DIR():
"""
- must NOT be inside DATA_DIR / inside a docker volume bind mount
- must NOT have a long PATH (UNIX socket path length restrictions)
- must NOT be shared with other collections/hosts
- must be writable by archivebox user & root
- must be cleared on every boot / not persisted
- must be cleared on every archivebox version upgrade
"""
from .version import detect_installed_version
HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
# print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
# print('RUNNING AS:', self.PUID, self.PGID)
if 'SYSTEM_TMP_DIR' in os.environ:
run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
with SudoPermission(uid=0, fallback=True):
run_dir.mkdir(parents=True, exist_ok=True)
if not dir_is_writable(run_dir):
if IS_ROOT:
with SudoPermission(uid=0, fallback=False):
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
else:
raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
return run_dir
run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
try:
assert len(str(run_dir)) + len('/supervisord.sock') < 95
except AssertionError:
run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
with SudoPermission(uid=0, fallback=True):
run_dir.mkdir(parents=True, exist_ok=True)
if not dir_is_writable(run_dir):
if IS_ROOT:
with SudoPermission(uid=0):
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
else:
raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
# Docker: /tmp/archivebox/0.8.5/abc324235
# Ubuntu: /tmp/archivebox/0.8.5/abc324235
# macOS: /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/0.8.5/abc324235
return run_dir
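# Worked example (illustration, not part of this file) of the AF_UNIX constraint
# behind the asserts above: unix socket paths must stay under ~108 chars, so the
# code budgets len(run_dir) + len('/supervisord.sock') < 95 to leave headroom.
_candidate = '/tmp/archivebox/0.8.5/abc324235'        # hypothetical run_dir
assert len(_candidate + '/supervisord.sock') == 48    # comfortably under the limit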

archivebox/config/permissions.py (new file, 70 lines)

@@ -0,0 +1,70 @@
__package__ = 'archivebox.config'
import os
from pathlib import Path
from contextlib import contextmanager
#############################################################################################
DATA_DIR = Path(os.getcwd())
DATA_DIR_STAT = Path(DATA_DIR).stat()
DATA_DIR_UID = DATA_DIR_STAT.st_uid
DATA_DIR_GID = DATA_DIR_STAT.st_gid
DEFAULT_PUID = 911
DEFAULT_PGID = 911
RUNNING_AS_UID = os.getuid()
RUNNING_AS_GID = os.getgid()
EUID = os.geteuid()
EGID = os.getegid()
USER: str = Path('~').expanduser().resolve().name
IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
os.environ.setdefault('PUID', str(DATA_DIR_UID or RUNNING_AS_UID or DEFAULT_PUID))
os.environ.setdefault('PGID', str(DATA_DIR_GID or RUNNING_AS_GID or DEFAULT_PGID))
ARCHIVEBOX_USER = int(os.environ['PUID'])
ARCHIVEBOX_GROUP = int(os.environ['PGID'])
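# Worked example (illustration, not part of this file) of the PUID cascade above:
# the first non-zero value wins -- data dir owner, else the invoking user, else 911.
def _resolve_puid(data_dir_uid: int, running_as_uid: int) -> int:
    return data_dir_uid or running_as_uid or DEFAULT_PUID

assert _resolve_puid(1000, 0) == 1000        # root running against a user-owned data dir
assert _resolve_puid(0, 1000) == 1000        # root-owned data dir, non-root invoker
assert _resolve_puid(0, 0) == DEFAULT_PUID   # everything root -> fall back to 911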
#############################################################################################
def drop_privileges():
"""If running as root, drop privileges to the user that owns the data dir (or PUID, or default=911)"""
# always run archivebox as the user that owns the data dir, never as root
if os.getuid() == 0:
# drop permissions to the user that owns the data dir / provided PUID
if os.geteuid() != ARCHIVEBOX_USER:
os.seteuid(ARCHIVEBOX_USER)
# if we need sudo (e.g. for installing dependencies) code should use the SudoPermission() context manager to regain root
@contextmanager
def SudoPermission(uid=0, fallback=False):
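    """Temporarily switch the effective UID (body truncated in this excerpt).

    A plausible sketch reconstructed from how callers above use it -- the real
    implementation may differ: raise euid to `uid` (default root), yield, then
    restore the previous euid; on failure, either fall back to running as the
    current user or re-raise, depending on `fallback`.
    """
    previous_euid = os.geteuid()
    try:
        try:
            os.seteuid(uid)
        except PermissionError:
            if not fallback:
                raise  # caller explicitly required the privileged uid
        yield
    finally:
        os.seteuid(previous_euid)  # restore whoever we were before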