massively improve Snapshot admin list view query performance

This commit is contained in:
Nick Sweeting 2024-08-26 20:16:43 -07:00
parent 6c4f3fc83a
commit 24fe958ff3
No known key found for this signature in database
5 changed files with 194 additions and 39 deletions

View file

@ -103,7 +103,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True}, 'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
'PUBLIC_ADD_VIEW': {'type': bool, 'default': False}, 'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'}, 'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40}, 'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 100},
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None}, 'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
'TIME_ZONE': {'type': str, 'default': 'UTC'}, 'TIME_ZONE': {'type': str, 'default': 'UTC'},
'TIMEZONE': {'type': str, 'default': 'UTC'}, 'TIMEZONE': {'type': str, 'default': 'UTC'},

View file

@ -10,12 +10,15 @@ from datetime import datetime, timezone
from typing import Dict, Any from typing import Dict, Any
from django.contrib import admin from django.contrib import admin
from django.db.models import Count, Q from django.db.models import Count, Q, Prefetch
from django.urls import path, reverse from django.urls import path, reverse, resolve
from django.utils import timezone
from django.utils.functional import cached_property
from django.utils.html import format_html from django.utils.html import format_html
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
from django.shortcuts import render, redirect from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model from django.contrib.auth import get_user_model
from django.core.paginator import Paginator
from django.core.exceptions import ValidationError from django.core.exceptions import ValidationError
from django.conf import settings from django.conf import settings
from django import forms from django import forms
@ -126,23 +129,100 @@ archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_ad
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin) archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
class AccelleratedPaginator(Paginator):
    """
    Accelerated Paginator that ignores DISTINCT when counting the total number of rows.
    Speeds up the SELECT COUNT(*) on admin list views by >20x.
    https://hakibenita.com/optimizing-the-django-admin-paginator
    """

    @cached_property
    def count(self):
        # NOTE(review): _has_filters() is a private QuerySet API — confirm it
        # still exists on the Django version in use before upgrading.
        if self.object_list._has_filters():
            # fallback to normal count method on filtered queryset
            return super().count
        else:
            # otherwise count total rows in a separate fast query
            # (unfiltered COUNT(*) on the model's table, skipping any
            # DISTINCT/JOIN the admin queryset may carry)
            return self.object_list.model.objects.count()

        # Alternative approach for PostgreSQL: fallback count takes > 200ms
        # from django.db import connection, transaction, OperationalError
        # with transaction.atomic(), connection.cursor() as cursor:
        #     cursor.execute('SET LOCAL statement_timeout TO 200;')
        #     try:
        #         return super().count
        #     except OperationalError:
        #         return 9999999999999
class ArchiveResultInline(admin.TabularInline): class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log' name = 'Archive Results Log'
model = ArchiveResult model = ArchiveResult
parent_model = Snapshot
# fk_name = 'snapshot' # fk_name = 'snapshot'
extra = 1 extra = 0
readonly_fields = ('result_id', 'start_ts', 'end_ts', 'extractor', 'command', 'cmd_version') sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
fields = ('id', *readonly_fields, 'status', 'output') readonly_fields = ('result_id', 'completed', 'extractor', 'command', 'version')
fields = ('id', 'start_ts', 'end_ts', *readonly_fields, 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True show_change_link = True
# # classes = ['collapse'] # # classes = ['collapse']
# # list_display_links = ['abid'] # # list_display_links = ['abid']
    def get_parent_object_from_request(self, request):
        # Resolve the current admin URL back to the parent Snapshot being edited.
        # NOTE(review): assumes 'object_id' is present in the resolved URL kwargs —
        # true on the change view, but the Snapshot *add* view has no object_id,
        # so calling this there raises KeyError. Verify all call sites.
        resolved = resolve(request.path_info)
        return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
@admin.display(
description='Completed',
ordering='end_ts',
)
def completed(self, obj):
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
def result_id(self, obj): def result_id(self, obj):
return format_html('<a href="{}"><small><code>[{}]</code></small></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
def command(self, obj): def command(self, obj):
return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or [])) return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
def version(self, obj):
return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
def get_formset(self, request, obj=None, **kwargs):
formset = super().get_formset(request, obj, **kwargs)
snapshot = self.get_parent_object_from_request(request)
# import ipdb; ipdb.set_trace()
formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
# default values for new entries
formset.form.base_fields['status'].initial = 'succeeded'
formset.form.base_fields['start_ts'].initial = timezone.now()
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
if obj is not None:
# hidden values for existing entries and new entries
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
return formset
def get_readonly_fields(self, request, obj=None):
if obj is not None:
return self.readonly_fields
else:
return []
class TagInline(admin.TabularInline): class TagInline(admin.TabularInline):
model = Tag.snapshot_set.through # type: ignore model = Tag.snapshot_set.through # type: ignore
@ -222,25 +302,22 @@ def get_abid_info(self, obj):
@admin.register(Snapshot, site=archivebox_admin) @admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
class Meta:
model = Snapshot
list_display = ('added', 'title_str', 'files', 'size', 'url_str') list_display = ('added', 'title_str', 'files', 'size', 'url_str')
# list_editable = ('title',)
sort_fields = ('title_str', 'url_str', 'added', 'files') sort_fields = ('title_str', 'url_str', 'added', 'files')
readonly_fields = ('tags', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir')
search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name')
list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags') list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'created_by', 'title', *readonly_fields) fields = ('url', 'created_by', 'title', *readonly_fields)
ordering = ['-added'] ordering = ['-added']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
autocomplete_fields = ['tags']
inlines = [TagInline, ArchiveResultInline] inlines = [TagInline, ArchiveResultInline]
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm action_form = SnapshotActionForm
paginator = AccelleratedPaginator
save_on_top = True save_on_top = True
show_full_result_count = False
def changelist_view(self, request, extra_context=None): def changelist_view(self, request, extra_context=None):
extra_context = extra_context or {} extra_context = extra_context or {}
@ -286,12 +363,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
] ]
return custom_urls + urls return custom_urls + urls
def get_queryset(self, request): # def get_queryset(self, request):
self.request = request # # tags_qs = SnapshotTag.objects.all().select_related('tag')
return super().get_queryset(request).prefetch_related('tags', 'archiveresult_set').annotate(archiveresult_count=Count('archiveresult')) # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
def tag_list(self, obj): def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True)) return ', '.join(tag.name for tag in obj.tags.all())
# TODO: figure out a different way to do this, you cant nest forms so this doenst work # TODO: figure out a different way to do this, you cant nest forms so this doenst work
# def action(self, obj): # def action(self, obj):
@ -360,21 +440,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
ordering='title', ordering='title',
) )
def title_str(self, obj): def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join( tags = ''.join(
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag) format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
for tag in obj.tags.all() for tag in obj.tags.all()
if str(tag).strip() if str(tag.name).strip()
) )
return format_html( return format_html(
'<a href="/{}">' '<a href="/{}">'
'<img src="/{}/{}" class="favicon" onerror="this.remove()">' '<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
'</a>' '</a>'
'<a href="/{}/index.html">' '<a href="/{}/index.html">'
'<b class="status-{}">{}</b>' '<b class="status-{}">{}</b>'
'</a>', '</a>',
obj.archive_path, obj.archive_path,
obj.archive_path, canon['favicon_path'], obj.archive_path,
obj.archive_path, obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending', 'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
@ -382,14 +461,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
@admin.display( @admin.display(
description='Files Saved', description='Files Saved',
ordering='archiveresult_count', # ordering='archiveresult_count',
) )
def files(self, obj): def files(self, obj):
return snapshot_icons(obj) return snapshot_icons(obj)
@admin.display( @admin.display(
ordering='archiveresult_count' # ordering='archiveresult_count'
) )
def size(self, obj): def size(self, obj):
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
@ -536,6 +615,8 @@ class TagAdmin(ABIDModelAdmin):
actions = ['delete_selected'] actions = ['delete_selected']
ordering = ['-created'] ordering = ['-created']
paginator = AccelleratedPaginator
def API(self, obj): def API(self, obj):
try: try:
return get_abid_info(self, obj) return get_abid_info(self, obj)
@ -575,6 +656,8 @@ class ArchiveResultAdmin(ABIDModelAdmin):
ordering = ['-start_ts'] ordering = ['-start_ts']
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
@admin.display( @admin.display(
description='Snapshot Info' description='Snapshot Info'
) )

View file

@ -125,6 +125,12 @@ class SnapshotTag(models.Model):
db_table = 'core_snapshot_tags' db_table = 'core_snapshot_tags'
unique_together = [('snapshot', 'tag')] unique_together = [('snapshot', 'tag')]
class SnapshotManager(models.Manager):
    """Default Snapshot manager.

    Always prefetches tags and archive results so that list views (admin,
    index) iterate rows without firing an extra query per Snapshot.
    """

    def get_queryset(self):
        base_qs = super().get_queryset()
        return base_qs.prefetch_related('tags', 'archiveresult_set')
class Snapshot(ABIDModel): class Snapshot(ABIDModel):
abid_prefix = 'snp_' abid_prefix = 'snp_'
abid_ts_src = 'self.added' abid_ts_src = 'self.added'
@ -150,6 +156,8 @@ class Snapshot(ABIDModel):
archiveresult_set: models.Manager['ArchiveResult'] archiveresult_set: models.Manager['ArchiveResult']
objects = SnapshotManager()
@property @property
def uuid(self): def uuid(self):
return self.id return self.id
@ -177,8 +185,7 @@ class Snapshot(ABIDModel):
def as_json(self, *args) -> dict: def as_json(self, *args) -> dict:
args = args or self.keys args = args or self.keys
return { return {
key: getattr(self, key) key: getattr(self, key) if key != 'tags' else self.tags_str(nocache=False)
if key != 'tags' else self.tags_str()
for key in args for key in args
} }
@ -190,8 +197,14 @@ class Snapshot(ABIDModel):
return load_link_details(self.as_link()) return load_link_details(self.as_link())
def tags_str(self, nocache=True) -> str | None: def tags_str(self, nocache=True) -> str | None:
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags' cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache:
# tags are pre-fetched already, use them directly (best because db is always freshest)
tags_str = calc_tags_str()
return tags_str
if nocache: if nocache:
tags_str = calc_tags_str() tags_str = calc_tags_str()
cache.set(cache_key, tags_str) cache.set(cache_key, tags_str)
@ -234,7 +247,10 @@ class Snapshot(ABIDModel):
@cached_property @cached_property
def num_outputs(self) -> int: def num_outputs(self) -> int:
return self.archiveresult_set.filter(status='succeeded').count() # DONT DO THIS: it will trigger a separate query for every snapshot
# return self.archiveresult_set.filter(status='succeeded').count()
# this is better:
return sum((1 for result in self.archiveresult_set.all() if result.status == 'succeeded'))
@cached_property @cached_property
def base_url(self): def base_url(self):
@ -262,10 +278,21 @@ class Snapshot(ABIDModel):
@cached_property @cached_property
def thumbnail_url(self) -> Optional[str]: def thumbnail_url(self) -> Optional[str]:
result = self.archiveresult_set.filter( if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
extractor='screenshot', result = (sorted(
status='succeeded' (
).only('output').last() result
for result in self.archiveresult_set.all()
if result.extractor == 'screenshot' and result.status =='succeeded' and result.output
),
key=lambda result: result.created,
) or [None])[-1]
else:
result = self.archiveresult_set.filter(
extractor='screenshot',
status='succeeded'
).only('output').last()
if result: if result:
return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}']) return reverse('Snapshot', args=[f'{str(self.timestamp)}/{result.output}'])
return None return None
@ -292,6 +319,21 @@ class Snapshot(ABIDModel):
if self.title: if self.title:
return self.title # whoopdedoo that was easy return self.title # whoopdedoo that was easy
# check if ArchiveResult set has already been prefetched, if so use it instead of fetching it from db again
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
try:
return (sorted(
(
result.output.strip()
for result in self.archiveresult_set.all()
if result.extractor == 'title' and result.status =='succeeded' and result.output
),
key=lambda title: len(title),
) or [None])[-1]
except IndexError:
pass
try: try:
# take longest successful title from ArchiveResult db history # take longest successful title from ArchiveResult db history
return sorted( return sorted(
@ -355,12 +397,23 @@ class Snapshot(ABIDModel):
class ArchiveResultManager(models.Manager): class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True): def indexable(self, sorted: bool = True):
"""Return only ArchiveResults containing text suitable for full-text search (sorted in order of typical result quality)"""
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS,status='succeeded') qs = self.get_queryset().filter(extractor__in=INDEXABLE_METHODS, status='succeeded')
if sorted: if sorted:
precedence = [ When(extractor=method, then=Value(precedence)) for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] precedence = [
qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence') When(extractor=method, then=Value(precedence))
for method, precedence in ARCHIVE_METHODS_INDEXING_PRECEDENCE
]
qs = qs.annotate(
indexing_precedence=Case(
*precedence,
default=Value(1000),
output_field=IntegerField()
)
).order_by('indexing_precedence')
return qs return qs
class ArchiveResult(ABIDModel): class ArchiveResult(ABIDModel):

View file

@ -197,7 +197,7 @@ def unsafe_wget_output_path(link: Link) -> Optional[str]:
@enforce_types @enforce_types
def wget_output_path(link: Link) -> Optional[str]: def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may """calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path. adjust some paths to be different than the base_url path.
@ -245,6 +245,15 @@ def wget_output_path(link: Link) -> Optional[str]:
# https://example.com/abc/test/?v=zzVa_tX1OiI # https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html@v=zzVa_tX1OiI.html # > example.com/abc/test/index.html@v=zzVa_tX1OiI.html
cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path'
if not nocache:
from django.core.cache import cache
cached_result = cache.get(cache_key)
if cached_result:
return cached_result
# There's also lots of complexity around how the urlencoding and renaming # There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments, extensions like shtml / htm / php / etc, # is done for pages with query and hash fragments, extensions like shtml / htm / php / etc,
# unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than # unicode escape sequences, punycode domain names, unicode double-width characters, extensions longer than
@ -271,6 +280,8 @@ def wget_output_path(link: Link) -> Optional[str]:
output_path = None output_path = None
if output_path: if output_path:
if not nocache:
cache.set(cache_key, output_path)
return output_path return output_path
# fallback to just the domain dir # fallback to just the domain dir

View file

@ -124,7 +124,15 @@ def snapshot_icons(snapshot) -> str:
from core.models import ArchiveResult from core.models import ArchiveResult
# start = datetime.now(timezone.utc) # start = datetime.now(timezone.utc)
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False) if hasattr(snapshot, '_prefetched_objects_cache') and 'archiveresult_set' in snapshot._prefetched_objects_cache:
archive_results = [
result
for result in snapshot.archiveresult_set.all()
if result.status == "succeeded" and result.output
]
else:
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
link = snapshot.as_link() link = snapshot.as_link()
path = link.archive_path path = link.archive_path
canon = link.canonical_outputs() canon = link.canonical_outputs()