From de489d3c604727946202d49a3960c83cd3961193 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 3 Jun 2024 04:00:18 -0700 Subject: [PATCH] minor snapshot details ui fixes and migrations log msg improvements --- .../core/migrations/0024_auto_20240513_1143.py | 3 +++ archivebox/core/views.py | 14 ++++++++------ archivebox/templates/core/snapshot_live.html | 14 +++++++------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/archivebox/core/migrations/0024_auto_20240513_1143.py b/archivebox/core/migrations/0024_auto_20240513_1143.py index 31f1e773..95652a07 100644 --- a/archivebox/core/migrations/0024_auto_20240513_1143.py +++ b/archivebox/core/migrations/0024_auto_20240513_1143.py @@ -47,12 +47,14 @@ def calculate_abid(self): def copy_snapshot_uuids(apps, schema_editor): + print(' Copying snapshot.id -> snapshot.uuid...') Snapshot = apps.get_model("core", "Snapshot") for snapshot in Snapshot.objects.all(): snapshot.uuid = snapshot.id snapshot.save(update_fields=["uuid"]) def generate_snapshot_abids(apps, schema_editor): + print(' Generating snapshot.abid values...') Snapshot = apps.get_model("core", "Snapshot") for snapshot in Snapshot.objects.all(): snapshot.abid_prefix = 'snp_' @@ -65,6 +67,7 @@ def generate_snapshot_abids(apps, schema_editor): snapshot.save(update_fields=["abid"]) def generate_archiveresult_abids(apps, schema_editor): + print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)') ArchiveResult = apps.get_model("core", "ArchiveResult") Snapshot = apps.get_model("core", "Snapshot") for result in ArchiveResult.objects.all(): diff --git a/archivebox/core/views.py b/archivebox/core/views.py index efaca2f5..3b491b8e 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -90,7 +90,7 @@ class SnapshotView(View): archiveresults[result.extractor] = result_info existing_files = {result['path'] for result in archiveresults.values()} - min_size_threshold = 128 # bytes + min_size_threshold = 10_000 # bytes allowed_extensions = { 'txt', 'html', @@ -108,12 +108,14 @@ class SnapshotView(View): 'md', } + # iterate through all the files in the snapshot dir and add the biggest ones to the result list - for result_file in Path(snapshot.link_dir).glob('*/*/*'): + snap_dir = Path(snapshot.link_dir) + for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')): extension = result_file.suffix.lstrip('.').lower() if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions: continue - if result_file.name in existing_files: + if result_file.name in existing_files or result_file.name == 'index.html': continue file_size = result_file.stat().st_size or 0 @@ -121,7 +123,7 @@ class SnapshotView(View): if file_size > min_size_threshold: archiveresults[result_file.name] = { 'name': result_file.stem, - 'path': result_file.relative_to(snapshot.link_dir), + 'path': result_file.relative_to(snap_dir), 'ts': ts_to_date_str(result_file.stat().st_mtime or 0), 'size': file_size, } @@ -140,7 +142,7 @@ class SnapshotView(View): link_info = link._asdict(extended=True) try: - warc_path = 'warc/' + list(Path(snapshot.link_dir).glob('warc/*.warc.*'))[0].name + warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name except IndexError: warc_path = 'warc/' @@ -160,7 +162,7 @@ class SnapshotView(View): 'warc_path': warc_path, 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, 'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, - 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name'])), + 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']), 'best_result': best_result, # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234', } diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index 32957516..73af92a5 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -401,13 +401,13 @@
{% endfor %} @@ -419,7 +419,7 @@

Headers, JSON, etc.

- + @@ -430,7 +430,7 @@ - + @@ -444,9 +444,9 @@ this.src = this.src + '#toolbar=0' } this.onload = function() { - if (this.src.endsWith('.pdf')) { + if (this.src.includes('.pdf')) { this.removeAttribute('sandbox') - this.src = this.src + '#toolbar=0' + this.src = this.src.split('?autoplay=')[0] + '#toolbar=0' } try { // doesnt work if frame origin rules prevent accessing its DOM via JS