From 09553d83402e468cc8c600c630295e35788d3e3f Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 22 Aug 2024 15:36:02 -0700 Subject: [PATCH] hardcode EXTRACTOR_CHOICES to prevent nondeterministic migrations --- archivebox/core/admin.py | 6 +--- ...ult_options_archiveresult_abid_and_more.py | 17 ++++++++++- archivebox/core/models.py | 30 ++++++++++++++----- archivebox/index/html.py | 6 ++-- archivebox/logging_util.py | 4 +-- 5 files changed, 44 insertions(+), 19 deletions(-) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index f94cd68a..b87f6874 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -180,12 +180,8 @@ class SnapshotActionForm(ActionForm): ) # TODO: allow selecting actions for specific extractors? is this useful? - # EXTRACTOR_CHOICES = [ - # (name, name.title()) - # for name, _, _ in get_default_archive_methods() - # ] # extractor = forms.ChoiceField( - # choices=EXTRACTOR_CHOICES, + # choices=ArchiveResult.EXTRACTOR_CHOICES, # required=False, # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) # ) diff --git a/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py b/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py index 39d3d570..438f455e 100644 --- a/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py +++ b/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py @@ -38,6 +38,21 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='extractor', - field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32), + field=models.CharField(choices=( + ('htmltotext', 'htmltotext'), + ('git', 'git'), + ('singlefile', 'singlefile'), + ('media', 'media'), + ('archive_org', 'archive_org'), + ('readability', 'readability'), + ('mercury', 'mercury'), + ('favicon', 'favicon'), + ('pdf', 'pdf'), + ('headers', 'headers'), + ('screenshot', 'screenshot'), + ('dom', 'dom'), + ('title', 'title'), + ('wget', 'wget'), + ), max_length=32), ), ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index a8a2522c..c9266bd9 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -28,13 +28,6 @@ from ..index.html import snapshot_icons from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS -EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()] -STATUS_CHOICES = [ - ("succeeded", "succeeded"), - ("failed", "failed"), - ("skipped", "skipped") -] - def rand_int_id(): return random.getrandbits(32) @@ -376,7 +369,28 @@ class ArchiveResult(ABIDModel): abid_uri_src = 'self.snapshot.url' abid_subtype_src = 'self.extractor' abid_rand_src = 'self.old_id' - EXTRACTOR_CHOICES = EXTRACTOR_CHOICES + + EXTRACTOR_CHOICES = ( + ('htmltotext', 'htmltotext'), + ('git', 'git'), + ('singlefile', 'singlefile'), + ('media', 'media'), + ('archive_org', 'archive_org'), + ('readability', 'readability'), + ('mercury', 'mercury'), + ('favicon', 'favicon'), + ('pdf', 'pdf'), + ('headers', 'headers'), + ('screenshot', 'screenshot'), + ('dom', 'dom'), + ('title', 'title'), + ('wget', 'wget'), + ) + STATUS_CHOICES = [ + ("succeeded", "succeeded"), + ("failed", "failed"), + ("skipped", "skipped") + ] old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID') diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 2a891d7d..339f9429 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str: cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' def calc_snapshot_icons(): - from core.models import EXTRACTOR_CHOICES + from core.models import ArchiveResult # start = datetime.now(timezone.utc) archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) @@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str: # Missing specific entry for WARC extractor_outputs = defaultdict(lambda: None) - for extractor, _ in EXTRACTOR_CHOICES: + for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES: for result in archive_results: if result.extractor == extractor and result: extractor_outputs[extractor] = result - for extractor, _ in EXTRACTOR_CHOICES: + for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES: if extractor not in exclude: existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching) diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index de7c4474..09f52c72 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -529,8 +529,8 @@ def log_shell_welcome_msg(): from .cli import list_subcommands print('{green}# ArchiveBox Imports{reset}'.format(**ANSI)) - print('{green}from archivebox.core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI)) - print('{green}from archivebox.cli import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) + print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI)) + print('{green}from cli import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI)) print() print('[i] Welcome to the ArchiveBox Shell!') print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')