hardcode EXTRACTOR_CHOICES to prevent nondeterministic migrations

This commit is contained in:
Nick Sweeting 2024-08-22 15:36:02 -07:00
parent 0a5b22700c
commit 09553d8340
No known key found for this signature in database
5 changed files with 44 additions and 19 deletions

View file

@ -180,12 +180,8 @@ class SnapshotActionForm(ActionForm):
)
# TODO: allow selecting actions for specific extractors? is this useful?
# EXTRACTOR_CHOICES = [
# (name, name.title())
# for name, _, _ in get_default_archive_methods()
# ]
# extractor = forms.ChoiceField(
# choices=EXTRACTOR_CHOICES,
# choices=ArchiveResult.EXTRACTOR_CHOICES,
# required=False,
# widget=forms.MultipleChoiceField(attrs={'class': "form-control"})
# )

View file

@ -38,6 +38,21 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
field=models.CharField(choices=(
('htmltotext', 'htmltotext'),
('git', 'git'),
('singlefile', 'singlefile'),
('media', 'media'),
('archive_org', 'archive_org'),
('readability', 'readability'),
('mercury', 'mercury'),
('favicon', 'favicon'),
('pdf', 'pdf'),
('headers', 'headers'),
('screenshot', 'screenshot'),
('dom', 'dom'),
('title', 'title'),
('wget', 'wget'),
), max_length=32),
),
]

View file

@ -28,13 +28,6 @@ from ..index.html import snapshot_icons
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
("skipped", "skipped")
]
def rand_int_id():
    """Return a random non-negative integer built from 32 random bits.

    The result always lies in the range ``0 <= value < 2**32``.
    """
    id_bits = 32  # width of the generated ID, in bits
    return random.getrandbits(id_bits)
@ -376,7 +369,28 @@ class ArchiveResult(ABIDModel):
abid_uri_src = 'self.snapshot.url'
abid_subtype_src = 'self.extractor'
abid_rand_src = 'self.old_id'
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'),
('git', 'git'),
('singlefile', 'singlefile'),
('media', 'media'),
('archive_org', 'archive_org'),
('readability', 'readability'),
('mercury', 'mercury'),
('favicon', 'favicon'),
('pdf', 'pdf'),
('headers', 'headers'),
('screenshot', 'screenshot'),
('dom', 'dom'),
('title', 'title'),
('wget', 'wget'),
)
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
("skipped", "skipped")
]
old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')

View file

@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
def calc_snapshot_icons():
from core.models import EXTRACTOR_CHOICES
from core.models import ArchiveResult
# start = datetime.now(timezone.utc)
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
# Missing specific entry for WARC
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTOR_CHOICES:
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
for result in archive_results:
if result.extractor == extractor and result:
extractor_outputs[extractor] = result
for extractor, _ in EXTRACTOR_CHOICES:
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# Check filesystem to see if anything is actually present (too slow, needs optimization/caching)

View file

@ -529,8 +529,8 @@ def log_shell_welcome_msg():
from .cli import list_subcommands
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
print('{green}from archivebox.core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
print('{green}from archivebox.cli import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
print('{green}from core.models import Snapshot, ArchiveResult, Tag, User{reset}'.format(**ANSI))
print('{green}from cli import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell!')
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')