diff --git a/Dockerfile b/Dockerfile
index 82647329..fbb56a78 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,7 @@
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
# docker buildx create --use
-# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
+# docker buildx build . --platform=linux/amd64,linux/arm64 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
@@ -194,10 +194,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& playwright install --with-deps chromium \
&& export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
else \
- # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
- apt-get install -qq -y -t bookworm-backports --no-install-recommends \
- chromium \
- && export CHROME_BINARY="$(which chromium)"; \
+ # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
+ # apt-get install -qq -y -t bookworm-backports --no-install-recommends \
+ # chromium \
+ # && export CHROME_BINARY="$(which chromium)"; \
+ echo 'armv7 no longer supported in versions after v0.7.3'; \
+ exit 1; \
fi \
&& rm -rf /var/lib/apt/lists/* \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
@@ -275,7 +277,6 @@ ENV IN_DOCKER=True \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
- # CHROME_SANDBOX=False \
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
diff --git a/README.md b/README.md
index 27a84956..4d1bcf0d 100644
--- a/README.md
+++ b/README.md
@@ -1076,7 +1076,7 @@ Because ArchiveBox is designed to ingest a large volume of URLs with multiple co
 Don't store large collections on older filesystems like EXT3/FAT as they may not be able to handle more than 50k directory entries in the data/archive/ folder.
 Try to keep the data/index.sqlite3 file on local drive (not a network mount) or SSD for maximum performance, however the data/archive/ folder can be on a network mount or slower HDD.
-If using Docker or NFS/SMB/FUSE for the `data/archive/` folder, you may need to set PUID & PGID and disable root_squash on your fileshare server.
+If using Docker or NFS/SMB/FUSE for the data/archive/ folder, you may need to set PUID & PGID and disable root_squash on your fileshare server.
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 9912b4c7..fb3688f3 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -250,7 +250,7 @@ def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
from core.models import Snapshot
try:
- return Snapshot.objects.all()
+ return Snapshot.objects.all().only('id')
except (KeyboardInterrupt, SystemExit):
raise SystemExit(0)
diff --git a/docker-compose.yml b/docker-compose.yml
index ea3d3ab7..a8293705 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,32 +8,26 @@
# Documentation:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
-version: '3.9'
services:
archivebox:
- #image: ${DOCKER_IMAGE:-archivebox/archivebox:dev}
- image: archivebox/archivebox:dev
- command: server --quick-init 0.0.0.0:8000
+ image: archivebox/archivebox
ports:
- 8000:8000
volumes:
- ./data:/data
- # - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
- # - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
- # build: . # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
environment:
- ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name
- # - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
- # - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content
- # - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
# - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo
# - ADMIN_PASSWORD=SomeSecretPassword
# - PUID=911 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=911
- # - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
- # - SEARCH_BACKEND_HOST_NAME=sonic
- # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
+ # - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
+ # - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content
+ # - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
+ - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
+ - SEARCH_BACKEND_HOST_NAME=sonic
+ - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
# - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files
# - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out
# - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
@@ -42,7 +36,7 @@ services:
# add further configuration options from archivebox/config.py as needed (to apply them only to this container)
# or set using `docker compose run archivebox config --set SOME_KEY=someval` (to persist config across all containers)
- # For ad-blocking during archiving, uncomment this section and pihole service section below
+ # For ad-blocking during archiving, uncomment this section and pihole service section below
# networks:
# - dns
# dns:
@@ -51,22 +45,26 @@ services:
######## Optional Addons: tweak examples below as needed for your specific use case ########
- ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
- # $ curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
- # After starting, backfill any existing Snapshots into the full-text index:
+ ### Runs the Sonic full-text search backend, config file is auto-downloaded into sonic.cfg:
+ # After starting, backfill any existing Snapshots into the full-text index:
# $ docker-compose run archivebox update --index-only
- # sonic:
- # image: valeriansaliou/sonic:latest
- # expose:
- # - 1491
- # environment:
- # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
- # volumes:
- # - ./sonic.cfg:/etc/sonic.cfg:ro
- # - ./data/sonic:/var/lib/sonic/store
-
-
+ sonic:
+ image: valeriansaliou/sonic
+ build:
+ dockerfile_inline: |
+ FROM quay.io/curl/curl:latest AS setup
+ RUN curl -fsSL 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/main/etc/sonic.cfg' > /tmp/sonic.cfg
+ FROM valeriansaliou/sonic:latest
+ COPY --from=setup /tmp/sonic.cfg /etc/sonic.cfg
+ expose:
+ - 1491
+ environment:
+ - SEARCH_BACKEND_PASSWORD=SomeSecretPassword
+ volumes:
+ - ./etc/sonic.cfg:/etc/sonic.cfg
+ - ./data/sonic:/var/lib/sonic/store
+
### Example: To run pihole in order to block ad/tracker requests during archiving,
# uncomment this block and set up pihole using its admin interface
diff --git a/package.json b/package.json
index 1377ef99..3c42a8b9 100644
--- a/package.json
+++ b/package.json
@@ -8,6 +8,6 @@
"dependencies": {
"@postlight/parser": "^2.2.3",
"readability-extractor": "github:ArchiveBox/readability-extractor",
- "single-file-cli": "^1.1.46"
+ "single-file-cli": "^1.1.54"
}
}
diff --git a/pyproject.toml b/pyproject.toml
index 969b6318..98a1a055 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,15 +15,16 @@ dependencies = [
"dateparser>=1.0.0",
"django-extensions>=3.2.3",
"django>=4.2.0,<5.0",
+ "setuptools>=69.0.3",
"feedparser>=6.0.11",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
"requests>=2.24.0",
"w3lib>=1.22.0",
- "yt-dlp>=2023.10.13",
+ "yt-dlp>=2024.3.10",
# dont add playwright becuase packages without sdists cause trouble on many build systems that refuse to install wheel-only packages
- # "playwright>=1.39.0; platform_machine != 'armv7l'",
+ "playwright>=1.39.0; platform_machine != 'armv7l'",
]
classifiers = [
@@ -64,11 +65,11 @@ classifiers = [
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
+ # apt install sonic
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev python3-ldap
- "setuptools>=69.0.3",
"python-ldap>=3.4.3",
"django-auth-ldap>=4.1.0",
]
@@ -83,7 +84,6 @@ ldap = [
[tool.pdm.dev-dependencies]
dev = [
# building
- "setuptools>=69.0.3",
"wheel",
"pdm",
"homebrew-pypi-poet>=0.10.0",