diff --git a/archivebox/core/migrations/0007_archiveresult.py b/archivebox/core/migrations/0007_archiveresult.py
index 1d0da342..c0e1393b 100644
--- a/archivebox/core/migrations/0007_archiveresult.py
+++ b/archivebox/core/migrations/0007_archiveresult.py
@@ -1,8 +1,43 @@
# Generated by Django 3.0.8 on 2020-11-04 12:25
+import json
+from pathlib import Path
+
from django.db import migrations, models
import django.db.models.deletion
+from config import CONFIG
+
+
+def forwards_func(apps, schema_editor):
+    """Backfill ArchiveResult rows from the history in each snapshot's index.json.
+
+    Snapshots whose index file cannot be opened or parsed are skipped; a missing
+    or null cmd_version/output falls back to '' (the default of those columns).
+    """
+    Snapshot = apps.get_model("core", "Snapshot")
+    ArchiveResult = apps.get_model("core", "ArchiveResult")
+    for snapshot in Snapshot.objects.all():
+        index_path = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp / "index.json"
+        try:
+            with open(index_path, "r") as f:
+                fs_index = json.load(f)
+        except (OSError, ValueError):  # unreadable file or malformed JSON
+            continue
+
+        # An index without a "history" entry simply contributes no rows.
+        history = fs_index.get("history") or {}
+        for extractor, results in history.items():
+            for result in results:
+                ArchiveResult.objects.create(extractor=extractor, snapshot=snapshot, cmd=json.dumps(result["cmd"]),
+                    cmd_version=result.get("cmd_version") or "", start_ts=result["start_ts"], end_ts=result["end_ts"],
+                    status=result["status"], pwd=result["pwd"], output=result.get("output") or "")
+
+
+def reverse_func(apps, schema_editor):
+    """Undo the backfill by deleting every ArchiveResult row."""
+    apps.get_model("core", "ArchiveResult").objects.all().delete()
+
class Migration(migrations.Migration):
@@ -18,6 +53,7 @@ class Migration(migrations.Migration):
('cmd', models.CharField(default='', max_length=500)),
('pwd', models.CharField(default='', max_length=200)),
('cmd_version', models.CharField(default='', max_length=20)),
+ ('status', models.CharField(max_length=10)),
('output', models.CharField(default='', max_length=500)),
('start_ts', models.DateTimeField()),
('end_ts', models.DateTimeField()),
@@ -25,4 +61,5 @@ class Migration(migrations.Migration):
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
],
),
+ migrations.RunPython(forwards_func, reverse_func),
]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 944d8612..41976348 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -161,4 +161,8 @@ class ArchiveResult(models.Model):
output = models.CharField(max_length=500, default="")
start_ts = models.DateTimeField()
end_ts = models.DateTimeField()
- extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
\ No newline at end of file
+ status = models.CharField(max_length=10)
+ extractor = models.CharField(choices=EXTRACTORS, blank=False, max_length=20)
+
+ def __str__(self):  # an ArchiveResult is displayed as its extractor name
+ return self.extractor
diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 0bb8fceb..56c74b5c 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -2,38 +2,72 @@ from pathlib import Path
from django.utils.html import format_html
-from core.models import Snapshot
+from core.models import Snapshot, ArchiveResult, EXTRACTORS
def get_icons(snapshot: Snapshot) -> str:
+    """Render the per-extractor status icons for a snapshot as one safe HTML string.
+
+    Extractors listed in `exclude` or without a "<name>_path" entry in canon get no icon.
+    """
+    from django.utils.html import format_html_join  # local import: module only imports format_html
+    archive_results = snapshot.archiveresult_set
link = snapshot.as_link()
canon = link.canonical_outputs()
- out_dir = Path(link.link_dir)
+    output_template = '{} '
+    icons = {
+        "singlefile": "❶",
+        "wget": "🆆",
+        "dom": "🅷",
+        "pdf": "📄",
+        "screenshot": "💻",
+        "media": "📼",
+        "git": "🅶",
+        "archive_org": "🏛",
+        "readability": "🆁",
+        "mercury": "🅼",
+    }
+    exclude = ["favicon"]
+    # TODO: no dedicated icon for WARC yet; extractors missing from `icons` render as "?".
- # slow version: highlights icons based on whether files exist or not for that output
- # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
- # fast version: all icons are highlighted without checking for outputs in filesystem
- link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
- return format_html(
- ''
- '❶ '
- '🆆 '
- '🅷 '
- '📄 '
- '💻 '
- '📦 '
- '📼 '
- '🅶 '
- '🏛 '
- '',
- *link_tuple(link, 'singlefile_path'),
- *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
- *link_tuple(link, 'pdf_path'),
- *link_tuple(link, 'screenshot_path'),
- *link_tuple(link, 'dom_path'),
- *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
- *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
- *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
- canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
- )
+    # One query for all extractors instead of one .exists() query per extractor.
+    done = set(archive_results.values_list("extractor", flat=True))
+    names = [name for name, *_ in EXTRACTORS if name not in exclude and f"{name}_path" in canon]
+    rows = ((link.archive_path, canon[f"{name}_path"], name in done, name, icons.get(name, "?")) for name in names)
+    # format_html_join escapes every interpolated value, unlike str.format + format_html(str).
+    return format_html_join("", output_template, rows)
+
+#def get_icons(snapshot: Snapshot) -> str:
+# link = snapshot.as_link()
+# canon = link.canonical_outputs()
+# out_dir = Path(link.link_dir)
+#
+# # slow version: highlights icons based on whether files exist or not for that output
+# # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+# # fast version: all icons are highlighted without checking for outputs in filesystem
+# link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
+#
+# return format_html(
+# ''
+# '❶ '
+# '🆆 '
+# '🅷 '
+# '📄 '
+# '💻 '
+# '📦 '
+# '📼 '
+# '🅶 '
+# '🏛 '
+# '',
+# *link_tuple(link, 'singlefile_path'),
+# *link_tuple(link, 'wget_path')[:2], any((out_dir / link.domain).glob('*')),
+# *link_tuple(link, 'pdf_path'),
+# *link_tuple(link, 'screenshot_path'),
+# *link_tuple(link, 'dom_path'),
+# *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
+# *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
+# *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
+# canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
+# )
+#
\ No newline at end of file