hardcode EXTRACTOR_CHOICES to prevent nondeterministic migrations

2024-09-19 07:28:49 -04:00 · 2024-08-22 15:36:02 -07:00 · 2024-08-22 15:36:02 -07:00 · 09553d8340
commit 09553d8340
parent 0a5b22700c
5 changed files with 44 additions and 19 deletions
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -180,12 +180,8 @@ class SnapshotActionForm(ActionForm):
    )

    # TODO: allow selecting actions for specific extractors? is this useful?
-    # EXTRACTOR_CHOICES = [
-    #     (name, name.title())
-    #     for name, _, _ in get_default_archive_methods()
-    # ]
    # extractor = forms.ChoiceField(
-    #     choices=EXTRACTOR_CHOICES,
+    #     choices=ArchiveResult.EXTRACTOR_CHOICES,
    #     required=False,
    #     widget=forms.MultileChoiceField(attrs={'class': "form-control"})
    # )
--- a/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py
+++ b/archivebox/core/migrations/0023_alter_archiveresult_options_archiveresult_abid_and_more.py
@@ -38,6 +38,21 @@ class Migration(migrations.Migration):
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
-            field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
+            field=models.CharField(choices=(
+                ('htmltotext', 'htmltotext'),
+                ('git', 'git'),
+                ('singlefile', 'singlefile'),
+                ('media', 'media'),
+                ('archive_org', 'archive_org'),
+                ('readability', 'readability'),
+                ('mercury', 'mercury'),
+                ('favicon', 'favicon'),
+                ('pdf', 'pdf'),
+                ('headers', 'headers'),
+                ('screenshot', 'screenshot'),
+                ('dom', 'dom'),
+                ('title', 'title'),
+                ('wget', 'wget'),
+            ), max_length=32),
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -28,13 +28,6 @@ from ..index.html import snapshot_icons
 from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS


-EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
-STATUS_CHOICES = [
-    ("succeeded", "succeeded"),
-    ("failed", "failed"),
-    ("skipped", "skipped")
-]
-
 def rand_int_id():
    return random.getrandbits(32)

@@ -376,7 +369,28 @@ class ArchiveResult(ABIDModel):
    abid_uri_src = 'self.snapshot.url'
    abid_subtype_src = 'self.extractor'
    abid_rand_src = 'self.old_id'
-    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
+    EXTRACTOR_CHOICES = (
+        ('htmltotext', 'htmltotext'),
+        ('git', 'git'),
+        ('singlefile', 'singlefile'),
+        ('media', 'media'),
+        ('archive_org', 'archive_org'),
+        ('readability', 'readability'),
+        ('mercury', 'mercury'),
+        ('favicon', 'favicon'),
+        ('pdf', 'pdf'),
+        ('headers', 'headers'),
+        ('screenshot', 'screenshot'),
+        ('dom', 'dom'),
+        ('title', 'title'),
+        ('wget', 'wget'),
+    )
+    STATUS_CHOICES = [
+        ("succeeded", "succeeded"),
+        ("failed", "failed"),
+        ("skipped", "skipped")
+    ]

    old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')

--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
    cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
    
    def calc_snapshot_icons():
-        from core.models import EXTRACTOR_CHOICES
+        from core.models import ArchiveResult
        # start = datetime.now(timezone.utc)

        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
        # Missing specific entry for WARC

        extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTOR_CHOICES:
+        for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
            for result in archive_results:
                if result.extractor == extractor and result:
                    extractor_outputs[extractor] = result

-        for extractor, _ in EXTRACTOR_CHOICES:
+        for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
            if extractor not in exclude:
                existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -529,8 +529,8 @@ def log_shell_welcome_msg():
    from .cli import list_subcommands

    print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
-    print('{green}from archivebox.core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
-    print('{green}from archivebox.cli import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
+    print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
+    print('{green}from cli import *\n    {}{reset}'.format("\n    ".join(list_subcommands().keys()), **ANSI))
    print()
    print('[i] Welcome to the ArchiveBox Shell!')
    print('    https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')