def _load_history(index_path):
    """Return the ``history`` mapping stored in a snapshot's ``index.json``.

    Args:
        index_path: Path of the ``index.json`` file to read.

    Returns:
        The dict found under the top-level ``"history"`` key, or an empty
        dict when the file cannot be read, is not valid JSON, or does not
        contain a ``history`` mapping.
    """
    try:
        with open(index_path, "r", encoding="utf-8") as f:
            fs_index = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file.
        # ValueError: json.JSONDecodeError and UnicodeDecodeError.
        return {}

    if not isinstance(fs_index, dict):
        return {}

    history = fs_index.get("history")
    return history if isinstance(history, dict) else {}


def _blank_if_none(result, key):
    """Return ``result[key]``, or ``''`` when the key is absent or ``None``.

    ``''`` is the default declared for the corresponding CharFields in this
    migration, so a missing value is stored the same way as an unset one.
    """
    value = result.get(key)
    return "" if value is None else value


def forwards_func(apps, schema_editor):
    """Create ``ArchiveResult`` rows from each snapshot's on-disk index.

    For every ``Snapshot`` the file ``<ARCHIVE_DIR>/<timestamp>/index.json``
    is read, and one ``ArchiveResult`` is created per entry of its
    ``history`` mapping (extractor name -> list of result dicts).
    Snapshots whose index is missing, unreadable, or has no history are
    skipped.

    Args:
        apps: Historical app registry supplied by ``RunPython``.
        schema_editor: Supplied by ``RunPython``; not used.

    Raises:
        KeyError: If a result dict lacks ``start_ts``, ``end_ts`` or
            ``status``, which have no default on the model.
    """
    # Historical models only: the live ``core.models`` module may no longer
    # match the schema at this point of the migration history.
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")

    archive_dir = Path(CONFIG['ARCHIVE_DIR'])

    for snapshot in Snapshot.objects.all():
        history = _load_history(archive_dir / snapshot.timestamp / "index.json")

        for extractor, results in history.items():
            for result in results:
                ArchiveResult.objects.create(
                    extractor=extractor,
                    snapshot=snapshot,
                    # The command is stored as its JSON serialization.
                    cmd=json.dumps(result.get("cmd")),
                    cmd_version=_blank_if_none(result, "cmd_version"),
                    pwd=_blank_if_none(result, "pwd"),
                    output=_blank_if_none(result, "output"),
                    start_ts=result["start_ts"],
                    end_ts=result["end_ts"],
                    status=result["status"],
                )


def reverse_func(apps, schema_editor):
    """Undo ``forwards_func`` by deleting every ``ArchiveResult`` row.

    Args:
        apps: Historical app registry supplied by ``RunPython``.
        schema_editor: Supplied by ``RunPython``; not used.
    """
    ArchiveResult = apps.get_model("core", "ArchiveResult")
    ArchiveResult.objects.all().delete()
# Glyph rendered for each extractor; extractors without an entry fall back
# to "?" in get_icons().
# TODO: there is no dedicated glyph for WARC yet.
_EXTRACTOR_ICONS = {
    "singlefile": "❶",
    "wget": "🆆",
    "dom": "🅷",
    "pdf": "📄",
    "screenshot": "💻",
    "media": "📼",
    "git": "🅶",
    "archive_org": "🏛",
    "readability": "🆁",
    "mercury": "🅼",
}

# Extractors for which no icon is rendered at all.
_ICONLESS_EXTRACTORS = frozenset({"favicon"})


def get_icons(snapshot: Snapshot) -> str:
    """Render one icon entry per extractor for ``snapshot``.

    Each entry is built from the link's archive path, the extractor's
    canonical output path, whether the snapshot has at least one stored
    ``ArchiveResult`` for that extractor, the extractor name, and its glyph.
    Extractors listed in ``_ICONLESS_EXTRACTORS`` and extractors without a
    ``"<name>_path"`` entry in the canonical outputs are left out.

    Args:
        snapshot: The snapshot whose archive results are summarized.

    Returns:
        A safe string with every interpolated value HTML-escaped.
    """
    # Local import: the module only imports ``format_html`` at file level.
    from django.utils.html import format_html_join

    link = snapshot.as_link()
    canon = link.canonical_outputs()

    # One query for all extractors that have a stored result, instead of one
    # EXISTS query per extractor.
    finished = set(
        snapshot.archiveresult_set.values_list("extractor", flat=True)
    )

    # NOTE(review): this template has a single placeholder although five
    # values are supplied per entry; str.format ignores the surplus ones.
    # Confirm the template text is complete.
    output_template = '{} '

    rows = []
    for extractor in EXTRACTORS:
        name = extractor[0]  # EXTRACTORS entries are indexable; [0] is the name
        if name in _ICONLESS_EXTRACTORS:
            continue

        try:
            output_path = canon[f"{name}_path"]
        except KeyError:
            # No canonical output is defined for this extractor: no icon.
            continue

        rows.append((
            link.archive_path,
            output_path,
            name in finished,
            name,
            _EXTRACTOR_ICONS.get(name, "?"),
        ))

    # format_html_join escapes every argument and marks the joined result
    # safe, so the assembled markup is never itself used as a format string.
    return format_html_join("", output_template, rows)