From cbf2a8fdc3f3932514ab311f66976e476038514a Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 4 Sep 2024 23:42:36 -0700 Subject: [PATCH] rename datetime fields to _at, massively improve ABID generation safety and determinism --- archivebox/abid_utils/abid.py | 1 + archivebox/abid_utils/admin.py | 73 +++--- archivebox/abid_utils/models.py | 242 ++++++++++++------ archivebox/api/models.py | 14 +- archivebox/api/v1_cli.py | 2 +- archivebox/api/v1_core.py | 104 ++++---- archivebox/core/admin.py | 98 ++++--- .../migrations/0032_alter_archiveresult_id.py | 6 +- ...eresult_old_id_alter_archiveresult_uuid.py | 7 +- ...iveresult_id_alter_archiveresult_old_id.py | 6 +- .../core/migrations/0058_alter_tag_old_id.py | 7 +- ...ottag_tag_alter_tag_id_alter_tag_old_id.py | 5 +- archivebox/core/models.py | 57 +++-- archivebox/core/views.py | 20 +- archivebox/extractors/__init__.py | 8 +- archivebox/extractors/wget.py | 2 +- archivebox/index/html.py | 2 +- archivebox/index/schema.py | 14 +- archivebox/main.py | 12 +- archivebox/plugantic/models.py | 49 ---- .../templates/admin/snapshots_grid.html | 2 +- archivebox/templates/core/index_row.html | 4 +- archivebox/templates/core/snapshot.html | 2 +- archivebox/templates/core/snapshot_live.html | 4 +- archivebox/util.py | 2 +- 25 files changed, 408 insertions(+), 335 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index c7fe8fb9..8863e61c 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -208,6 +208,7 @@ def abid_hashes_from_values(prefix: str, ts: datetime, uri: str, subtype: str | 'uri': abid_part_from_uri(uri, salt=salt), 'subtype': abid_part_from_subtype(subtype), 'rand': abid_part_from_rand(rand), + # 'salt': don't add this, salt combined with uri above to form a single hash } @enforce_types diff --git a/archivebox/abid_utils/admin.py b/archivebox/abid_utils/admin.py index 3adf4b34..66e53bc7 100644 --- a/archivebox/abid_utils/admin.py +++ b/archivebox/abid_utils/admin.py @@ -1,58 +1,61 @@ __package__ = 'archivebox.abid_utils' -from django.contrib import admin + +from typing import Any from datetime import datetime + +from django.contrib import admin, messages +from django.core.exceptions import ValidationError from django.utils.html import format_html from django.utils.safestring import mark_safe +from django.shortcuts import redirect -from abid_utils.abid import abid_part_from_ts, abid_part_from_uri, abid_part_from_rand, abid_part_from_subtype +from abid_utils.abid import ABID, abid_part_from_ts, abid_part_from_uri, abid_part_from_rand, abid_part_from_subtype from api.auth import get_or_create_api_token from ..util import parse_date -def highlight_diff(display_val, compare_val): +def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None): """highlight each character in red that differs with the char at the same index in compare_val""" display_val = str(display_val) compare_val = str(compare_val) + if len(compare_val) < len(display_val): + compare_val += ' ' * (len(display_val) - len(compare_val)) + + similar_color, highlighted_color = color_same or 'inherit', color_diff or 'red' + if invert: + similar_color, highlighted_color = color_same or 'green', color_diff or 'inherit' + return mark_safe(''.join( - format_html('{}', display_val[i]) + format_html('{}', highlighted_color, display_val[i]) if display_val[i] != compare_val[i] else - format_html('{}', display_val[i]) + format_html('{}', similar_color, display_val[i]) for i in range(len(display_val)) )) def get_abid_info(self, obj, request=None): try: - abid_diff = f' != obj.ABID: {highlight_diff(obj.ABID, obj.abid)} ❌' if str(obj.ABID) != str(obj.abid) else ' == .ABID ✅' + #abid_diff = f' != obj.ABID: {highlight_diff(obj.ABID, obj.abid)} ❌' if str(obj.ABID) != str(obj.abid) else ' == .ABID ✅' - fresh_abid = obj.ABID - fresh_abid_diff = f' !=   .fresh_abid: {highlight_diff(fresh_abid, obj.ABID)} ❌' if str(fresh_abid) != str(obj.ABID) else '✅' - fresh_uuid_diff = f' !=   .fresh_uuid: {highlight_diff(fresh_abid.uuid, obj.ABID.uuid)} ❌' if str(fresh_abid.uuid) != str(obj.ABID.uuid) else '✅' + fresh_abid = ABID(**obj.ABID_FRESH_HASHES) + fresh_abid_diff = f'❌ !=   .fresh_abid: {highlight_diff(fresh_abid, obj.ABID)}' if str(fresh_abid) != str(obj.ABID) else '✅' + fresh_uuid_diff = f'❌ !=   .fresh_uuid: {highlight_diff(fresh_abid.uuid, obj.ABID.uuid)}' if str(fresh_abid.uuid) != str(obj.ABID.uuid) else '✅' - id_fresh_abid_diff = f' != .fresh_abid ❌' if str(fresh_abid.uuid) != str(obj.id) else ' == .fresh_abid ✅' - id_abid_diff = f' != .abid.uuid: {highlight_diff(obj.ABID.uuid, obj.id)} ❌' if str(obj.id) != str(obj.ABID.uuid) else ' == .abid ✅' - id_pk_diff = f' != .pk: {highlight_diff(obj.pk, obj.id)} ❌' if str(obj.pk) != str(obj.id) else ' == .pk ✅' + id_pk_diff = f'❌ != .pk: {highlight_diff(obj.pk, obj.id)}' if str(obj.pk) != str(obj.id) else '✅' fresh_ts = parse_date(obj.ABID_FRESH_VALUES['ts']) or None - derived_ts = abid_part_from_ts(fresh_ts) if fresh_ts else None - ts_diff = f'!= {highlight_diff(derived_ts, obj.ABID.ts)} ❌' if derived_ts != obj.ABID.ts else '✅' + ts_diff = f'❌ != {highlight_diff( obj.ABID_FRESH_HASHES["ts"], obj.ABID.ts)}' if obj.ABID_FRESH_HASHES["ts"] != obj.ABID.ts else '✅' - derived_uri = abid_part_from_uri(obj.ABID_FRESH_VALUES['uri']) - uri_diff = f'!= {highlight_diff(derived_uri, obj.ABID.uri)} ❌' if derived_uri != obj.ABID.uri else '✅' + derived_uri = obj.ABID_FRESH_HASHES['uri'] + uri_diff = f'❌ != {highlight_diff(derived_uri, obj.ABID.uri)}' if derived_uri != obj.ABID.uri else '✅' - derived_subtype = abid_part_from_subtype(obj.ABID_FRESH_VALUES['subtype']) - subtype_diff = f'!= {highlight_diff(derived_subtype, obj.ABID.subtype)} ❌' if derived_subtype != obj.ABID.subtype else '✅' + derived_subtype = obj.ABID_FRESH_HASHES['subtype'] + subtype_diff = f'❌ != {highlight_diff(derived_subtype, obj.ABID.subtype)}' if derived_subtype != obj.ABID.subtype else '✅' - derived_rand = abid_part_from_rand(obj.ABID_FRESH_VALUES['rand']) - rand_diff = f'!= {highlight_diff(derived_rand, obj.ABID.rand)} ❌' if derived_rand != obj.ABID.rand else '✅' - - # any_abid_discrepancies = any( - # '❌' in diff or '!=' in diff - # for diff in (abid_diff, fresh_abid_diff, id_abid_diff, id_pk_diff, ts_diff, uri_diff, subtype_diff, rand_diff) - # ) - # total_diff = f' != .generate_abid() -> {fresh_abid} ❌' if any_abid_discrepancies else '✅' + derived_rand = obj.ABID_FRESH_HASHES['rand'] + rand_diff = f'❌ != {highlight_diff(derived_rand, obj.ABID.rand)}' if derived_rand != obj.ABID.rand else '✅' return format_html( # URL Hash: {}
@@ -69,30 +72,34 @@ def get_abid_info(self, obj, request=None):     SUBTYPE:       {}           {}                           {} {}: {}
    RAND:             {}       {}                 {} {}: {}

+ {} {} ''', obj.api_url + (f'?api_key={get_or_create_api_token(request.user)}' if request and request.user else ''), obj.api_url, obj.api_docs_url, - highlight_diff(obj.id, obj.ABID.uuid), mark_safe(id_pk_diff + id_abid_diff), - highlight_diff(obj.ABID.uuid, obj.id), mark_safe(fresh_uuid_diff), + highlight_diff(obj.id, obj.ABID.uuid, invert=True), mark_safe(id_pk_diff), + highlight_diff(obj.ABID.uuid, obj.id, invert=True), mark_safe(fresh_uuid_diff), highlight_diff(obj.abid, fresh_abid), mark_safe(fresh_abid_diff), # str(fresh_abid.uuid), mark_safe(fresh_uuid_diff), # str(fresh_abid), mark_safe(fresh_abid_diff), - highlight_diff(obj.ABID.ts, derived_ts), highlight_diff(str(obj.ABID.uuid)[0:14], str(fresh_abid.uuid)[0:14]), mark_safe(ts_diff), obj.abid_ts_src, fresh_ts and fresh_ts.isoformat(), + highlight_diff(obj.ABID.ts, obj.ABID_FRESH_HASHES['ts']), highlight_diff(str(obj.ABID.uuid)[0:14], str(fresh_abid.uuid)[0:14]), mark_safe(ts_diff), obj.abid_ts_src, fresh_ts and fresh_ts.isoformat(), highlight_diff(obj.ABID.uri, derived_uri), highlight_diff(str(obj.ABID.uuid)[14:26], str(fresh_abid.uuid)[14:26]), mark_safe(uri_diff), obj.abid_uri_src, str(obj.ABID_FRESH_VALUES['uri']), highlight_diff(obj.ABID.subtype, derived_subtype), highlight_diff(str(obj.ABID.uuid)[26:28], str(fresh_abid.uuid)[26:28]), mark_safe(subtype_diff), obj.abid_subtype_src, str(obj.ABID_FRESH_VALUES['subtype']), highlight_diff(obj.ABID.rand, derived_rand), highlight_diff(str(obj.ABID.uuid)[28:36], str(fresh_abid.uuid)[28:36]), mark_safe(rand_diff), obj.abid_rand_src, str(obj.ABID_FRESH_VALUES['rand'])[-7:], + f'Some values the ABID depends on have changed since the ABID was issued:' if obj.ABID_FRESH_DIFFS else '', + ", ".join(diff['abid_src'] for diff in obj.ABID_FRESH_DIFFS.values()), ) except Exception as e: + # import ipdb; ipdb.set_trace() return str(e) class ABIDModelAdmin(admin.ModelAdmin): - list_display = ('created', 'created_by', 'abid', '__str__') - sort_fields = ('created', 'created_by', 'abid', '__str__') - readonly_fields = ('created', 'modified', '__str__', 'API') + list_display = ('created_at', 'created_by', 'abid', '__str__') + sort_fields = ('created_at', 'created_by', 'abid', '__str__') + readonly_fields = ('created_at', 'modified_at', '__str__', 'abid_info') @admin.display(description='API Identifiers') - def API(self, obj): + def abid_info(self, obj): return get_abid_info(self, obj, request=self.request) def queryset(self, request): diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index a860c69d..38ad57f7 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -11,7 +11,7 @@ from datetime import datetime, timedelta from functools import partial from charidfield import CharIDField # type: ignore[import-untyped] -from django.conf import settings +from django.core.exceptions import ValidationError from django.db import models from django.utils import timezone from django.db.utils import OperationalError @@ -59,7 +59,7 @@ def get_or_create_system_user_pk(username='system'): return User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0] # otherwise, create a dedicated "system" user - user, created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''}) + user, _was_created = User.objects.get_or_create(username=username, is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''}) return user.pk @@ -68,69 +68,166 @@ class AutoDateTimeField(models.DateTimeField): # return timezone.now() pass +class ABIDError(Exception): + pass + +class ABIDFieldsCannotBeChanged(ValidationError, ABIDError): + """ + Properties used as unique identifiers (to generate ABID) cannot be edited after an object is created. + Create a new object instead with your desired changes (and it will be issued a new ABID). + """ + def __init__(self, ABID_FRESH_DIFFS, obj): + self.ABID_FRESH_DIFFS = ABID_FRESH_DIFFS + self.obj = obj + + def __str__(self): + keys_changed = ', '.join(diff['abid_src'] for diff in self.ABID_FRESH_DIFFS.values()) + return ( + f"This {self.obj.__class__.__name__}(abid={str(self.obj.ABID)}) was assigned a fixed, unique ID (ABID) based on its contents when it was created. " + + f'\nThe following changes cannot be made because they would alter the ABID:' + + '\n ' + "\n ".join(f' - {diff["summary"]}' for diff in self.ABID_FRESH_DIFFS.values()) + + f"\nYou must reduce your changes to not affect these fields, or create a new {self.obj.__class__.__name__} object instead." + ) + class ABIDModel(models.Model): """ Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface. """ abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_' - abid_ts_src = 'self.created' # e.g. 'self.created' - abid_uri_src = 'None' # e.g. 'self.uri' + abid_ts_src = 'self.created_at' # e.g. 'self.created_at' + abid_uri_src = 'None' # e.g. 'self.uri' (MUST BE SET) abid_subtype_src = 'self.__class__.__name__' # e.g. 'self.extractor' abid_rand_src = 'self.id' # e.g. 'self.uuid' or 'self.id' - abid_salt: str = DEFAULT_ABID_URI_SALT + abid_salt: str = DEFAULT_ABID_URI_SALT # combined with self.uri to anonymize hashes on a per-install basis (default is shared globally with all users, means everyone will hash ABC to -> 123 the same around the world, makes it easy to share ABIDs across installs and see if they are for the same URI. Change this if you dont want your hashes to be guessable / in the same hash space as all other users) + abid_drift_allowed: bool = False # set to True to allow abid_field values to change after a fixed ABID has been issued (NOT RECOMMENDED: means values can drift out of sync from original ABID) # id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') # abid = ABIDField(prefix=abid_prefix) # created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) - # created = AutoDateTimeField(default=None, null=False, db_index=True) - # modified = models.DateTimeField(auto_now=True) + # created_at = AutoDateTimeField(default=None, null=False, db_index=True) + # modified_at = models.DateTimeField(auto_now=True) class Meta(TypedModelMeta): abstract = True - def save(self, *args: Any, **kwargs: Any) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: + """Overriden __init__ method ensures we have a stable creation timestamp that fields can use within initialization code pre-saving to DB.""" + super().__init__(*args, **kwargs) + # pre-compute a stable timestamp of the obj init time (with abid.ts precision limit applied) for use when object is first created, + # some other fields depend on a timestamp at creation time, and it's nice to have one common timestamp they can all share. + # Used as an alternative to auto_now_add=True + auto_now=True which can produce two different times & requires saving to DB to get the TS. + # (ordinarily fields cant depend on other fields until the obj is saved to db and recalled) + self._init_timestamp = ts_from_abid(abid_part_from_ts(timezone.now())) + + def save(self, *args: Any, abid_drift_allowed: bool | None=None, **kwargs: Any) -> None: + """Overriden save method ensures new ABID is generated while a new object is first saving.""" + if self._state.adding: - self.pk = self.id = self.id or uuid4() - self.created = ts_from_abid(abid_part_from_ts(timezone.now())) # cut off precision to match precision of TS component - self.modified = self.created - self.created_by = self.created_by or get_or_create_system_user_pk() + # only runs once when a new object is first saved to the DB + # sets self.id, self.pk, self.created_by, self.created_at, self.modified_at self.abid = str(self.issue_new_abid()) + + else: + # otherwise if updating, make sure none of the field changes would invalidate existing ABID + if self.ABID_FRESH_DIFFS: + ovewrite_abid = self.abid_drift_allowed if (abid_drift_allowed is None) else abid_drift_allowed + + change_error = ABIDFieldsCannotBeChanged(self.ABID_FRESH_DIFFS, obj=self) + if ovewrite_abid: + print(f'#### DANGER: Changing ABID of existing record ({self.__class__.__name__}.abid_drift_allowed={abid_drift_allowed}), this will break any references to its previous ABID!') + print(change_error) + self.abid = str(self.issue_new_abid(force_new=True)) + print(f'#### DANGER: OVERWROTE OLD ABID. NEW ABID=', self.abid) + else: + raise change_error + return super().save(*args, **kwargs) - # assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}' - # assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}' - # assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})' + @property + def ABID_SOURCES(self) -> Dict[str, str]: + """"Get the dict of fresh ABID component values based on the live object's properties.""" + assert self.abid_prefix + return { + 'prefix': 'self.abid_prefix', # defined as static class vars at build time + 'ts': self.abid_ts_src, + 'uri': self.abid_uri_src, + 'subtype': self.abid_subtype_src, + 'rand': self.abid_rand_src, + 'salt': 'self.abid_salt', # defined as static class vars at build time + } @property def ABID_FRESH_VALUES(self) -> Dict[str, Any]: - assert self.abid_ts_src != 'None' - assert self.abid_uri_src != 'None' - assert self.abid_rand_src != 'None' - assert self.abid_subtype_src != 'None' + """"Get the dict of fresh ABID component values based on the live object's properties.""" + abid_sources = self.ABID_SOURCES + assert all(src != 'None' for src in abid_sources.values()) return { - 'prefix': self.abid_prefix, - 'ts': eval(self.abid_ts_src), - 'uri': eval(self.abid_uri_src), - 'subtype': eval(self.abid_subtype_src), - 'rand': eval(self.abid_rand_src), - 'salt': self.abid_salt, + 'prefix': eval(abid_sources['prefix']), + 'ts': eval(abid_sources['ts']), + 'uri': eval(abid_sources['uri']), + 'subtype': eval(abid_sources['subtype']), + 'rand': eval(abid_sources['rand']), + 'salt': eval(abid_sources['salt']), } @property def ABID_FRESH_HASHES(self) -> Dict[str, str]: - return abid_hashes_from_values(**self.ABID_FRESH_VALUES) + """"Get the dict of fresh ABID component hashes based on the live object's properties.""" + abid_values = self.ABID_FRESH_VALUES + assert all(val for val in abid_values.values()) + return abid_hashes_from_values( + prefix=abid_values['prefix'], + ts=abid_values['ts'], + uri=abid_values['uri'], + subtype=abid_values['subtype'], + rand=abid_values['rand'], + salt=abid_values['salt'], + ) + + @property + def ABID_FRESH_DIFFS(self) -> Dict[str, Dict[str, Any]]: + """Get the dict of discrepancies between the existing saved ABID and a new fresh ABID computed based on the live object.""" + existing_abid = self.ABID + existing_values = {} if self._state.adding else self.__class__.objects.get(pk=self.pk).ABID_FRESH_VALUES + abid_sources = self.ABID_SOURCES + fresh_values = self.ABID_FRESH_VALUES + fresh_hashes = self.ABID_FRESH_HASHES + return { + key: { + 'model': self.__class__.__name__, + 'pk': self.pk, + 'abid_src': abid_sources[key], + 'abid_section': key, + 'old_val': existing_values.get(key, None), + 'old_hash': getattr(existing_abid, key), + 'new_val': fresh_values[key], + 'new_hash': new_hash, + 'summary': f'{abid_sources[key]}= "{existing_values.get(key, None)}" -> "{fresh_values[key]}" (would alter {self.__class__.__name__.lower()}.ABID.{key}={getattr(existing_abid, key)} to {new_hash})', + } + for key, new_hash in fresh_hashes.items() + if getattr(existing_abid, key) != new_hash + } - def issue_new_abid(self): - assert self.abid is None, f'Can only issue new ABID for new objects that dont already have one {self.abid}' - assert self._state.adding, 'Can only issue new ABID when model._state.adding is True' + def issue_new_abid(self, force_new=False) -> ABID: + """ + Issue a new ABID based on the current object's properties, can only be called once on new objects (before they are saved to DB). + """ + if not force_new: + assert self.abid is None, f'Can only issue new ABID for new objects that dont already have one {self.abid}' + assert self._state.adding, 'Can only issue new ABID when model._state.adding is True' assert eval(self.abid_uri_src), f'Can only issue new ABID if self.abid_uri_src is defined ({self.abid_uri_src}={eval(self.abid_uri_src)})' + # Setup Field defaults to be ready for ABID generation self.abid = None - self.pk = self.id = self.id or uuid4() - self.created = ts_from_abid(abid_part_from_ts(timezone.now())) # cut off precision to match precision of TS component + self.id = self.id or uuid4() + self.pk = self.id + self.created_at = self.created_at or self._init_timestamp # cut off precision to match precision of TS component + self.modified_at = self.modified_at or self.created_at + self.created_by = self.created_by or get_or_create_system_user_pk() + # Compute fresh ABID values & hashes based on object's live properties abid_fresh_values = self.ABID_FRESH_VALUES assert all(abid_fresh_values.values()), f'All ABID_FRESH_VALUES must be set {abid_fresh_values}' abid_fresh_hashes = self.ABID_FRESH_HASHES @@ -140,64 +237,63 @@ class ABIDModel(models.Model): assert new_abid.ulid and new_abid.uuid and new_abid.typeid, f'Failed to calculate {abid_fresh_values["prefix"]}_ABID for {self.__class__.__name__}' - # store stable ABID on local fields, overwrite them because we are adding a new entry and existing defaults havent touched db yet - self.abid = str(new_abid) - assert str(self.ABID.uuid) == str(new_abid.uuid) return new_abid - @property def ABID(self) -> ABID: """ - aka get_or_generate_abid -> ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') + Get the object's existing ABID (from self.abid if it's already saved to DB, otherwise generated fresh) + e.g. -> ABID(ts='01HX9FPYTR', uri='E4A5CCD9', subtype='00', rand='ZYEBQE') """ - if not self.abid: - pre_save_abid = self.issue_new_abid() - self.abid = str(pre_save_abid) - return pre_save_abid + if self.abid: + return ABID.parse(cast(str, self.abid)) + + return self.issue_new_abid() - return ABID.parse(cast(str, self.abid)) - - @property - def ULID(self) -> ULID: - """ - Get a ulid.ULID representation of the object's ABID. - """ - return self.ABID.ulid - - @property - def UUID(self) -> UUID: - """ - Get a uuid.UUID (v4) representation of the object's ABID. - """ - return self.ABID.uuid + # These are all example helpers to make it easy to access alternate formats of the ABID.*, only add them if you actually need them + # @property + # def UUID(self) -> UUID: + # """ + # Get a uuid.UUID (v4) representation of the object's ABID. + # """ + # return self.ABID.uuid - @property - def uuid(self) -> str: - """ - Get a str uuid.UUID (v4) representation of the object's ABID. - """ - return str(self.ABID.uuid) - - @property - def TypeID(self) -> TypeID: - """ - Get a typeid.TypeID (stripe-style) representation of the object's ABID. - """ - return self.ABID.typeid + # @property + # def uuid(self) -> str: + # """ + # Get a str uuid.UUID (v4) representation of the object's ABID. + # """ + # return str(self.ABID.uuid) - @property - def abid_uri(self) -> str: - return eval(self.abid_uri_src) + # @property + # def ULID(self) -> ULID: + # """ + # Get a ulid.ULID representation of the object's ABID. + # """ + # return self.ABID.ulid + + # @property + # def TypeID(self) -> TypeID: + # """ + # Get a typeid.TypeID (stripe-style) representation of the object's ABID. + # """ + # return self.ABID.typeid @property def api_url(self) -> str: - # /api/v1/core/any/{abid} + """ + Compute the REST API URL to access this object. + e.g. /api/v1/core/snapshot/snp_01BJQMF54D093DXEAWZ6JYRP + """ return reverse_lazy('api-1:get_any', args=[self.abid]) @property def api_docs_url(self) -> str: + """ + Compute the REST API Documentation URL to learn about accessing this object. + e.g. /api/v1/docs#/Core%20Models/api_v1_core_get_snapshots + """ return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}' @@ -311,7 +407,7 @@ def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDMode ) for obj in qs: - if abid in (str(obj.ABID_FRESH), str(obj.id), str(obj.abid)): + if abid in (str(obj.ABID), str(obj.id), str(obj.pk), str(obj.abid)): # found exact match, no need to keep iterating return [obj] partial_matches.append(obj) diff --git a/archivebox/api/models.py b/archivebox/api/models.py index fe5e0750..9f6b8395 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -27,7 +27,7 @@ class APIToken(ABIDModel): """ # ABID: apt____ abid_prefix = 'apt_' - abid_ts_src = 'self.created' + abid_ts_src = 'self.created_at' abid_uri_src = 'self.token' abid_subtype_src = 'self.created_by_id' abid_rand_src = 'self.id' @@ -36,8 +36,8 @@ class APIToken(ABIDModel): abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) - created = AutoDateTimeField(default=None, null=False, db_index=True) - modified = models.DateTimeField(auto_now=True) + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) token = models.CharField(max_length=32, default=generate_secret_token, unique=True) expires = models.DateTimeField(null=True, blank=True) @@ -59,7 +59,7 @@ class APIToken(ABIDModel): "abid": str(self.ABID), "created_by_id": str(self.created_by_id), "token": self.token, - "created": self.created.isoformat(), + "created_at": self.created_at.isoformat(), "expires": self.expires_as_iso8601, } @@ -95,7 +95,7 @@ class OutboundWebhook(ABIDModel, WebhookBase): settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook' """ abid_prefix = 'whk_' - abid_ts_src = 'self.created' + abid_ts_src = 'self.created_at' abid_uri_src = 'self.endpoint' abid_subtype_src = 'self.ref' abid_rand_src = 'self.id' @@ -104,8 +104,8 @@ class OutboundWebhook(ABIDModel, WebhookBase): abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) - created = AutoDateTimeField(default=None, null=False, db_index=True) - modified = models.DateTimeField(auto_now=True) + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) # More fields here: WebhookBase... diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 23f9a93a..cb0cc561 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -94,7 +94,7 @@ class ListCommandSchema(Schema): status: Optional[StatusChoices] = StatusChoices.indexed after: Optional[float] = 0 before: Optional[float] = 999999999999999 - sort: str = 'added' + sort: str = 'bookmarked_at' as_json: bool = True as_html: bool = False as_csv: str | bool = 'timestamp,url' diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index a103f354..cc13b203 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -60,22 +60,17 @@ class CustomPagination(PaginationBase): ### ArchiveResult ######################################################################### -class ArchiveResultSchema(Schema): +class MinimalArchiveResultSchema(Schema): TYPE: str = 'core.models.ArchiveResult' id: UUID abid: str - modified: datetime - created: datetime + modified_at: datetime + created_at: datetime created_by_id: str created_by_username: str - snapshot_abid: str - snapshot_timestamp: str - snapshot_url: str - snapshot_tags: str - extractor: str cmd_version: Optional[str] cmd: List[str] @@ -92,20 +87,12 @@ class ArchiveResultSchema(Schema): User = get_user_model() return User.objects.get(id=obj.created_by_id).username - @staticmethod - def resolve_pk(obj): - return str(obj.pk) - - @staticmethod - def resolve_uuid(obj): - return str(obj.uuid) - @staticmethod def resolve_abid(obj): return str(obj.ABID) @staticmethod - def resolve_created(obj): + def resolve_created_at(obj): return obj.start_ts @staticmethod @@ -116,13 +103,28 @@ class ArchiveResultSchema(Schema): def resolve_snapshot_url(obj): return obj.snapshot.url + @staticmethod + def resolve_snapshot_id(obj): + return str(obj.snapshot_id) + @staticmethod def resolve_snapshot_abid(obj): return str(obj.snapshot.ABID) @staticmethod def resolve_snapshot_tags(obj): - return obj.snapshot.tags_str() + return sorted(tag.name for tag in obj.snapshot.tags.all()) + +class ArchiveResultSchema(MinimalArchiveResultSchema): + TYPE: str = 'core.models.ArchiveResult' + + # ... Extends MinimalArchiveResultSchema fields ... + + snapshot_id: UUID + snapshot_abid: str + snapshot_timestamp: str + snapshot_url: str + snapshot_tags: List[str] class ArchiveResultFilterSchema(FilterSchema): @@ -140,9 +142,9 @@ class ArchiveResultFilterSchema(FilterSchema): pwd: Optional[str] = Field(None, q='pwd__icontains') cmd_version: Optional[str] = Field(None, q='cmd_version') - created: Optional[datetime] = Field(None, q='updated') - created__gte: Optional[datetime] = Field(None, q='updated__gte') - created__lt: Optional[datetime] = Field(None, q='updated__lt') + created_at: Optional[datetime] = Field(None, q='created_at') + created_at__gte: Optional[datetime] = Field(None, q='created_at__gte') + created_at__lt: Optional[datetime] = Field(None, q='created_at__lt') @router.get("/archiveresults", response=List[ArchiveResultSchema], url_name="get_archiveresult") @@ -194,23 +196,25 @@ class SnapshotSchema(Schema): id: UUID abid: str - modified: datetime - created: datetime created_by_id: str created_by_username: str + created_at: datetime + modified_at: datetime + + bookmarked_at: datetime + downloaded_at: Optional[datetime] url: str - tags: str + tags: List[str] title: Optional[str] timestamp: str archive_path: str - bookmarked: datetime - added: datetime - updated: Optional[datetime] + # url_for_admin: str + # url_for_view: str num_archiveresults: int - archiveresults: List[ArchiveResultSchema] + archiveresults: List[MinimalArchiveResultSchema] @staticmethod def resolve_created_by_id(obj): @@ -221,21 +225,21 @@ class SnapshotSchema(Schema): User = get_user_model() return User.objects.get(id=obj.created_by_id).username - @staticmethod - def resolve_pk(obj): - return str(obj.pk) - - @staticmethod - def resolve_uuid(obj): - return str(obj.uuid) - @staticmethod def resolve_abid(obj): return str(obj.ABID) @staticmethod def resolve_tags(obj): - return obj.tags_str() + return sorted(tag.name for tag in obj.tags.all()) + + # @staticmethod + # def resolve_url_for_admin(obj): + # return f"/admin/core/snapshot/{obj.id}/change/" + + # @staticmethod + # def resolve_url_for_view(obj): + # return f"/{obj.archive_path}" @staticmethod def resolve_num_archiveresults(obj, context): @@ -255,12 +259,12 @@ class SnapshotFilterSchema(FilterSchema): created_by_id: str = Field(None, q='created_by_id') created_by_username: str = Field(None, q='created_by__username__icontains') - created__gte: datetime = Field(None, q='created__gte') - created__lt: datetime = Field(None, q='created__lt') - created: datetime = Field(None, q='created') - modified: datetime = Field(None, q='modified') - modified__gte: datetime = Field(None, q='modified__gte') - modified__lt: datetime = Field(None, q='modified__lt') + created_at__gte: datetime = Field(None, q='created_at__gte') + created_at__lt: datetime = Field(None, q='created_at__lt') + created_at: datetime = Field(None, q='created_at') + modified_at: datetime = Field(None, q='modified_at') + modified_at__gte: datetime = Field(None, q='modified_at__gte') + modified_at__lt: datetime = Field(None, q='modified_at__lt') search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'id__icontains', 'abid__icontains', 'timestamp__startswith']) url: Optional[str] = Field(None, q='url') @@ -268,8 +272,8 @@ class SnapshotFilterSchema(FilterSchema): title: Optional[str] = Field(None, q='title__icontains') timestamp: Optional[str] = Field(None, q='timestamp__startswith') - added__gte: Optional[datetime] = Field(None, q='added__gte') - added__lt: Optional[datetime] = Field(None, q='added__lt') + bookmarked_at__gte: Optional[datetime] = Field(None, q='bookmarked_at__gte') + bookmarked_at__lt: Optional[datetime] = Field(None, q='bookmarked_at__lt') @@ -285,7 +289,7 @@ def get_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_arch @router.get("/snapshot/{snapshot_id}", response=SnapshotSchema, url_name="get_snapshot") def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): - """Get a specific Snapshot by abid, uuid, or pk.""" + """Get a specific Snapshot by abid or id.""" request.with_archiveresults = with_archiveresults snapshot = None try: @@ -311,7 +315,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # # @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema) # def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema): -# snapshot = get_object_or_404(Snapshot, uuid=snapshot_id) +# snapshot = get_object_or_404(Snapshot, id=snapshot_id) # # for attr, value in payload.dict().items(): # setattr(snapshot, attr, value) @@ -321,7 +325,7 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True): # # @router.delete("/snapshot/{snapshot_id}") # def delete_snapshot(request, snapshot_id: str): -# snapshot = get_object_or_404(Snapshot, uuid=snapshot_id) +# snapshot = get_object_or_404(Snapshot, id=snapshot_id) # snapshot.delete() # return {"success": True} @@ -336,8 +340,8 @@ class TagSchema(Schema): id: UUID abid: str - modified: datetime - created: datetime + modified_at: datetime + created_at: datetime created_by_id: str created_by_username: str diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index fbc4494c..ca1adac4 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -127,10 +127,10 @@ class CustomUserAdmin(UserAdmin): '[{}] 📅 {} {}', snap.pk, snap.abid, - snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', + snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', snap.url[:64], ) - for snap in obj.snapshot_set.order_by('-modified')[:10] + for snap in obj.snapshot_set.order_by('-modified_at')[:10] ) + f'
{total_count} total records...') @admin.display(description='Archive Result Logs') @@ -141,11 +141,11 @@ class CustomUserAdmin(UserAdmin): '[{}] 📅 {} 📄 {} {}', result.pk, result.abid, - result.snapshot.updated.strftime('%Y-%m-%d %H:%M') if result.snapshot.updated else 'pending...', + result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...', result.extractor, result.snapshot.url[:64], ) - for result in obj.archiveresult_set.order_by('-modified')[:10] + for result in obj.archiveresult_set.order_by('-modified_at')[:10] ) + f'
{total_count} total records...') @admin.display(description='Tags') @@ -157,7 +157,7 @@ class CustomUserAdmin(UserAdmin): tag.pk, tag.name, ) - for tag in obj.tag_set.order_by('-modified')[:10] + for tag in obj.tag_set.order_by('-modified_at')[:10] ) + f'
{total_count} total records...') @admin.display(description='API Tokens') @@ -171,7 +171,7 @@ class CustomUserAdmin(UserAdmin): apitoken.token_redacted[:64], apitoken.expires, ) - for apitoken in obj.apitoken_set.order_by('-modified')[:10] + for apitoken in obj.apitoken_set.order_by('-modified_at')[:10] ) + f'
{total_count} total records...') @admin.display(description='API Outbound Webhooks') @@ -185,7 +185,7 @@ class CustomUserAdmin(UserAdmin): outboundwebhook.referenced_model, outboundwebhook.endpoint, ) - for outboundwebhook in obj.outboundwebhook_set.order_by('-modified')[:10] + for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10] ) + f'
{total_count} total records...') @@ -351,13 +351,13 @@ class SnapshotActionForm(ActionForm): @admin.register(Snapshot, site=archivebox_admin) class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): - list_display = ('created', 'title_str', 'files', 'size', 'url_str') - sort_fields = ('title_str', 'url_str', 'created') - readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'created', 'created', 'updated', 'modified', 'API', 'link_dir') + list_display = ('created_at', 'title_str', 'files', 'size', 'url_str') + sort_fields = ('title_str', 'url_str', 'created_at') + readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir') search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name') - list_filter = ('created', 'updated', 'archiveresult__status', 'created_by', 'tags__name') - fields = ('url', 'created_by', 'title',*readonly_fields) - ordering = ['-created'] + list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name') + fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields) + ordering = ['-created_at'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] inlines = [TagInline, ArchiveResultInline] list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000) @@ -377,30 +377,6 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') return super().changelist_view(request, GLOBAL_CONTEXT) - def change_view(self, request, object_id, form_url="", extra_context=None): - self.request = request - snapshot = None - - try: - snapshot = snapshot or Snapshot.objects.get(id=object_id) - except (Snapshot.DoesNotExist, Snapshot.MultipleObjectsReturned, ValidationError): - pass - - try: - snapshot = snapshot or Snapshot.objects.get(abid=Snapshot.abid_prefix + object_id.split('_', 1)[-1]) - except (Snapshot.DoesNotExist, ValidationError): - pass - - if snapshot: - object_id = str(snapshot.id) - - - return super().change_view( - request, - object_id, - form_url, - extra_context=extra_context, - ) def get_urls(self): urls = super().get_urls() @@ -416,8 +392,20 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): # self.request = request # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) - def tag_list(self, obj): - return ', '.join(tag.name for tag in obj.tags.all()) + @admin.action( + description="Imported Timestamp" + ) + def imported_timestamp(self, obj): + context = RequestContext(self.request, { + 'bookmarked_date': obj.bookmarked, + 'timestamp': obj.timestamp, + }) + + html = Template("""{{bookmarked_date}} ({{timestamp}})""") + return mark_safe(html.render(context)) + + # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') + # return f'{pretty_time} ({obj.timestamp})' # TODO: figure out a different way to do this, you cant nest forms so this doenst work # def action(self, obj): @@ -647,14 +635,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): @admin.register(Tag, site=archivebox_admin) class TagAdmin(ABIDModelAdmin): - list_display = ('created', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots') - list_filter = ('created', 'created_by') - sort_fields = ('name', 'slug', 'abid', 'created_by', 'created') - readonly_fields = ('slug', 'abid', 'created', 'modified', 'API', 'snapshots') + list_display = ('created_at', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots') + list_filter = ('created_at', 'created_by') + sort_fields = ('name', 'slug', 'abid', 'created_by', 'created_at') + readonly_fields = ('slug', 'abid', 'created_at', 'modified_at', 'abid_info', 'snapshots') search_fields = ('abid', 'name', 'slug') fields = ('name', 'created_by', *readonly_fields) actions = ['delete_selected'] - ordering = ['-created'] + ordering = ['-created_at'] paginator = AccelleratedPaginator @@ -672,10 +660,10 @@ class TagAdmin(ABIDModelAdmin): format_html( '[{}] {}', snap.pk, - snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...', + snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', snap.url[:64], ) - for snap in tag.snapshot_set.order_by('-updated')[:10] + for snap in tag.snapshot_set.order_by('-downloaded_at')[:10] ) + (f'
{total_count} total snapshots...')) @@ -683,7 +671,7 @@ class TagAdmin(ABIDModelAdmin): class ArchiveResultAdmin(ABIDModelAdmin): list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created', 'modified', 'API', 'output_summary') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'abid_info', 'output_summary') search_fields = ('id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields) autocomplete_fields = ['snapshot'] @@ -706,7 +694,7 @@ class ArchiveResultAdmin(ABIDModelAdmin): '[{}]   {}   {}
', result.snapshot.timestamp, result.snapshot.abid, - result.snapshot.added.strftime('%Y-%m-%d %H:%M'), + result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'), result.snapshot.url[:128], ) @@ -765,18 +753,18 @@ class ArchiveResultAdmin(ABIDModelAdmin): @admin.register(APIToken, site=archivebox_admin) class APITokenAdmin(ABIDModelAdmin): - list_display = ('created', 'abid', 'created_by', 'token_redacted', 'expires') - sort_fields = ('abid', 'created', 'created_by', 'expires') - readonly_fields = ('created', 'modified', 'API') + list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires') + sort_fields = ('abid', 'created_at', 'created_by', 'expires') + readonly_fields = ('created_at', 'modified_at', 'abid_info') search_fields = ('id', 'abid', 'created_by__username', 'token') fields = ('created_by', 'token', 'expires', *readonly_fields) list_filter = ('created_by',) - ordering = ['-created'] + ordering = ['-created_at'] list_per_page = 100 @admin.register(get_webhook_model(), site=archivebox_admin) class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin): - list_display = ('created', 'created_by', 'abid', *WebhookAdmin.list_display) - sort_fields = ('created', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error') - readonly_fields = ('created', 'modified', 'API', *WebhookAdmin.readonly_fields) + list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display) + sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error') + readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields) diff --git a/archivebox/core/migrations/0032_alter_archiveresult_id.py b/archivebox/core/migrations/0032_alter_archiveresult_id.py index 98299a31..9eb9f458 100644 --- a/archivebox/core/migrations/0032_alter_archiveresult_id.py +++ b/archivebox/core/migrations/0032_alter_archiveresult_id.py @@ -1,9 +1,13 @@ # Generated by Django 5.0.6 on 2024-08-18 05:20 import core.models +import random from django.db import migrations, models +def rand_int_id(): + return random.getrandbits(32) + class Migration(migrations.Migration): dependencies = [ @@ -14,6 +18,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='id', - field=models.BigIntegerField(default=core.models.rand_int_id, primary_key=True, serialize=False, verbose_name='ID'), + field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='ID'), ), ] diff --git a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py index dd6da1f5..5109a69b 100644 --- a/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py +++ b/archivebox/core/migrations/0034_alter_archiveresult_old_id_alter_archiveresult_uuid.py @@ -2,11 +2,16 @@ import core.models import uuid +import random from django.db import migrations, models from abid_utils.abid import ABID +def rand_int_id(): + return random.getrandbits(32) + + def update_archiveresult_ids(apps, schema_editor): ArchiveResult = apps.get_model("core", "ArchiveResult") num_total = ArchiveResult.objects.all().count() @@ -30,7 +35,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='old_id', - field=models.BigIntegerField(default=core.models.rand_int_id, serialize=False, verbose_name='ID'), + field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='ID'), ), migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop), migrations.AlterField( diff --git a/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py b/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py index 10b4f9c6..9595eb0d 100644 --- a/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py +++ b/archivebox/core/migrations/0036_alter_archiveresult_id_alter_archiveresult_old_id.py @@ -2,9 +2,13 @@ import core.models import uuid +import random from django.db import migrations, models +def rand_int_id(): + return random.getrandbits(32) + class Migration(migrations.Migration): dependencies = [ @@ -20,6 +24,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='old_id', - field=models.BigIntegerField(default=core.models.rand_int_id, serialize=False, verbose_name='Old ID'), + field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID'), ), ] diff --git a/archivebox/core/migrations/0058_alter_tag_old_id.py b/archivebox/core/migrations/0058_alter_tag_old_id.py index 4cc291c0..39900366 100644 --- a/archivebox/core/migrations/0058_alter_tag_old_id.py +++ b/archivebox/core/migrations/0058_alter_tag_old_id.py @@ -1,9 +1,12 @@ # Generated by Django 5.0.6 on 2024-08-20 03:30 -import core.models +import random from django.db import migrations, models +def rand_int_id(): + return random.getrandbits(32) + class Migration(migrations.Migration): dependencies = [ @@ -14,6 +17,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='tag', name='old_id', - field=models.BigIntegerField(default=core.models.rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'), + field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'), ), ] diff --git a/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py b/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py index e6022eab..9f70a8d0 100644 --- a/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py +++ b/archivebox/core/migrations/0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id.py @@ -3,8 +3,11 @@ import core.models import django.db.models.deletion import uuid +import random from django.db import migrations, models +def rand_int_id(): + return random.getrandbits(32) class Migration(migrations.Migration): @@ -26,6 +29,6 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='tag', name='old_id', - field=models.BigIntegerField(default=core.models.rand_int_id, serialize=False, unique=True, verbose_name='Old ID'), + field=models.BigIntegerField(default=rand_int_id, serialize=False, unique=True, verbose_name='Old ID'), ), ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 5abc8274..aa224e88 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -5,10 +5,7 @@ from typing import Optional, List, Dict, Iterable from django_stubs_ext.db.models import TypedModelMeta import json -import random -import uuid -from uuid import uuid4 from pathlib import Path from django.db import models @@ -18,9 +15,10 @@ from django.utils.text import slugify from django.core.cache import cache from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField +from django.contrib import admin from django.conf import settings -from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, get_or_create_system_user_pk +from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField from ..system import get_dir_size from ..util import parse_date, base_url @@ -29,13 +27,10 @@ from ..index.html import snapshot_icons from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS -def rand_int_id(): - return random.getrandbits(32) - # class BaseModel(models.Model): # # TODO: migrate all models to a shared base class with all our standard fields and helpers: -# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc. +# # ulid/created_at/modified_at/created_by/is_deleted/as_json/from_json/etc. # # # # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True) @@ -51,17 +46,18 @@ class Tag(ABIDModel): Based on django-taggit model + ABID base. """ abid_prefix = 'tag_' - abid_ts_src = 'self.created' + abid_ts_src = 'self.created_at' abid_uri_src = 'self.slug' abid_subtype_src = '"03"' abid_rand_src = 'self.id' + abid_drift_allowed = True id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='tag_set') - created = AutoDateTimeField(default=None, null=False, db_index=True) - modified = models.DateTimeField(auto_now=True) + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) name = models.CharField(unique=True, blank=False, max_length=100) slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False) @@ -131,33 +127,41 @@ class SnapshotManager(models.Manager): class Snapshot(ABIDModel): abid_prefix = 'snp_' - abid_ts_src = 'self.created' + abid_ts_src = 'self.created_at' abid_uri_src = 'self.url' abid_subtype_src = '"01"' abid_rand_src = 'self.id' + abid_drift_allowed = False id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set') - created = AutoDateTimeField(default=None, null=False, db_index=True) - modified = models.DateTimeField(auto_now=True) + created_at = AutoDateTimeField(default=None, null=False, db_index=True) # loaded from self._init_timestamp + modified_at = models.DateTimeField(auto_now=True) # legacy ts fields - added = AutoDateTimeField(default=None, null=False, editable=True, db_index=True) - updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) + bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True) + downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) url = models.URLField(unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) title = models.CharField(max_length=512, null=True, blank=True, db_index=True) - keys = ('url', 'timestamp', 'title', 'tags', 'updated') + keys = ('url', 'timestamp', 'title', 'tags', 'downloaded_at') archiveresult_set: models.Manager['ArchiveResult'] objects = SnapshotManager() + def save(self, *args, **kwargs): + if not self.bookmarked_at: + self.bookmarked_at = self.created_at or self._init_timestamp + + super().save(*args, **kwargs) + + def __repr__(self) -> str: title = (self.title_stripped or '-')[:64] return f'[{self.timestamp}] {self.url[:64]} ({title})' @@ -185,9 +189,10 @@ class Snapshot(ABIDModel): from ..index import load_link_details return load_link_details(self.as_link()) + @admin.display(description='Tags') def tags_str(self, nocache=True) -> str | None: calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all())) - cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags' + cache_key = f'{self.pk}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-tags' if hasattr(self, '_prefetched_objects_cache') and 'tags' in self._prefetched_objects_cache: # tags are pre-fetched already, use them directly (best because db is always freshest) @@ -255,7 +260,7 @@ class Snapshot(ABIDModel): @cached_property def archive_size(self): - cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size' + cache_key = f'{str(self.pk)[:12]}-{(self.downloaded_at or self.bookmarked_at).timestamp()}-size' def calc_dir_size(): try: @@ -274,7 +279,7 @@ class Snapshot(ABIDModel): for result in self.archiveresult_set.all() if result.extractor == 'screenshot' and result.status =='succeeded' and result.output ), - key=lambda result: result.created, + key=lambda result: result.created_at, ) or [None])[-1] else: result = self.archiveresult_set.filter( @@ -359,7 +364,7 @@ class Snapshot(ABIDModel): # def get_storage_dir(self, create=True, symlink=True) -> Path: - # date_str = self.added.strftime('%Y%m%d') + # date_str = self.bookmarked_at.strftime('%Y%m%d') # domain_str = domain(self.url) # abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid) @@ -407,10 +412,11 @@ class ArchiveResultManager(models.Manager): class ArchiveResult(ABIDModel): abid_prefix = 'res_' - abid_ts_src = 'self.snapshot.added' + abid_ts_src = 'self.snapshot.created_at' abid_uri_src = 'self.snapshot.url' abid_subtype_src = 'self.extractor' abid_rand_src = 'self.id' + abid_drift_allowed = True EXTRACTOR_CHOICES = ( ('htmltotext', 'htmltotext'), @@ -438,8 +444,8 @@ class ArchiveResult(ABIDModel): abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='archiveresult_set') - created = AutoDateTimeField(default=None, null=False, db_index=True) - modified = models.DateTimeField(auto_now=True) + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE, to_field='id', db_column='snapshot_id') @@ -460,6 +466,7 @@ class ArchiveResult(ABIDModel): def __str__(self): + # return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}' return self.extractor @cached_property @@ -503,7 +510,7 @@ class ArchiveResult(ABIDModel): # def get_storage_dir(self, create=True, symlink=True): - # date_str = self.snapshot.added.strftime('%Y%m%d') + # date_str = self.snapshot.bookmarked_at.strftime('%Y%m%d') # domain_str = domain(self.snapshot.url) # abs_storage_dir = Path(settings.CONFIG.ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid) diff --git a/archivebox/core/views.py b/archivebox/core/views.py index da09224c..89082ace 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -211,7 +211,7 @@ class SnapshotView(View): format_html( ( '



' - 'No Snapshot directories match the given timestamp or UUID: {}

' + 'No Snapshot directories match the given timestamp/ID/ABID: {}

' 'You can add a new Snapshot, or return to the Main Index' '
' ), @@ -225,18 +225,18 @@ class SnapshotView(View): snapshot_hrefs = mark_safe('
').join( format_html( '{} {} {} {}', - snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'), snap.timestamp, snap.timestamp, snap.url, snap.title_stripped[:64] or '', ) - for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added') + for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at') ) return HttpResponse( format_html( ( - 'Multiple Snapshots match the given timestamp/UUID {}
'
+                            'Multiple Snapshots match the given timestamp/ID/ABID {}
'
                         ),
                         slug,
                     ) + snapshot_hrefs + format_html(
@@ -257,12 +257,12 @@ class SnapshotView(View):
                         (
                             '



' f'Snapshot [{snapshot.timestamp}]: {snapshot.url}
' - f'was queued on {str(snapshot.added).split(".")[0]}, ' + f'was queued on {str(snapshot.bookmarked_at).split(".")[0]}, ' f'but no files have been saved yet in:
{snapshot.timestamp}/' '{}' f'

' 'It\'s possible {} ' - f'during the last capture on {str(snapshot.added).split(".")[0]},
or that the archiving process has not completed yet.
' + f'during the last capture on {str(snapshot.bookmarked_at).split(".")[0]},
or that the archiving process has not completed yet.
' f'
# run this cmd to finish/retry archiving this Snapshot
' f'archivebox update -t timestamp {snapshot.timestamp}


' '
' @@ -270,7 +270,7 @@ class SnapshotView(View): f'- list all the Snapshot files .*
' f'- view the Snapshot ./index.html
' f'- go to the Snapshot admin to edit
' - f'- go to the Snapshot actions to re-archive
' + f'- go to the Snapshot actions to re-archive
' '- or return to the main index...
' '
' ), @@ -343,7 +343,7 @@ class SnapshotView(View): snapshot_hrefs = mark_safe('
').join( format_html( '{} {} {} {} {}', - snap.added.strftime('%Y-%m-%d %H:%M:%S'), + snap.bookmarked_at.strftime('%Y-%m-%d %H:%M:%S'), snap.abid, snap.timestamp, snap.timestamp, @@ -353,7 +353,7 @@ class SnapshotView(View): for snap in Snapshot.objects.filter( Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path)) | Q(abid__icontains=path) | Q(id__icontains=path) - ).only('url', 'timestamp', 'title', 'added').order_by('-added') + ).only('url', 'timestamp', 'title', 'bookmarked_at').order_by('-bookmarked_at') ) return HttpResponse( format_html( @@ -376,7 +376,7 @@ class PublicIndexView(ListView): template_name = 'public_index.html' model = Snapshot paginate_by = SNAPSHOTS_PER_PAGE - ordering = ['-added'] + ordering = ['-bookmarked_at', '-created_at'] def get_context_data(self, **kwargs): return { diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py index 22d6a405..1432b271 100644 --- a/archivebox/extractors/__init__.py +++ b/archivebox/extractors/__init__.py @@ -134,7 +134,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s link = load_link_details(link, out_dir=out_dir) write_link_details(link, out_dir=out_dir, skip_sql_index=False) log_link_archiving_started(link, str(out_dir), is_new) - link = link.overwrite(updated=datetime.now(timezone.utc)) + link = link.overwrite(downloaded_at=datetime.now(timezone.utc)) stats = {'skipped': 0, 'succeeded': 0, 'failed': 0} start_ts = datetime.now(timezone.utc) @@ -157,11 +157,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status, created_by_id=snapshot.created_by_id) - # bump the updated time on the main Snapshot here, this is critical + # bump the downloaded_at time on the main Snapshot here, this is critical # to be able to cache summaries of the ArchiveResults for a given # snapshot without having to load all the results from the DB each time. - # (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume - # ArchiveResults are unchanged as long as the updated timestamp is unchanged) + # (we use {Snapshot.pk}-{Snapshot.downloaded_at} as the cache key and assume + # ArchiveResults are unchanged as long as the downloaded_at timestamp is unchanged) snapshot.save() else: # print('{black} X {}{reset}'.format(method_name, **ANSI)) diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index c97b2f28..c4cb6d44 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -245,7 +245,7 @@ def wget_output_path(link: Link, nocache: bool=False) -> Optional[str]: # https://example.com/abc/test/?v=zzVa_tX1OiI # > example.com/abc/test/index.html@v=zzVa_tX1OiI.html - cache_key = f'{link.url_hash}:{link.timestamp}-{link.updated and link.updated.timestamp()}-wget-output-path' + cache_key = f'{link.url_hash}:{link.timestamp}-{link.downloaded_at and link.downloaded_at.timestamp()}-wget-output-path' if not nocache: from django.core.cache import cache diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 504385b2..8ea32446 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -118,7 +118,7 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str: def snapshot_icons(snapshot) -> str: - cache_key = f'result_icons:{snapshot.pk}:{(snapshot.modified or snapshot.created or snapshot.added).timestamp()}' + cache_key = f'result_icons:{snapshot.pk}:{(snapshot.downloaded_at or snapshot.modified_at or snapshot.created_at or snapshot.bookmarked_at).timestamp()}' def calc_snapshot_icons(): from core.models import ArchiveResult diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index b01b6ae5..bcf48fc9 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -132,7 +132,7 @@ class Link: tags: Optional[str] sources: List[str] history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {}) - updated: Optional[datetime] = None + downloaded_at: Optional[datetime] = None schema: str = 'Link' def __str__(self) -> str: @@ -164,7 +164,7 @@ class Link: assert isinstance(self.timestamp, str) and self.timestamp assert self.timestamp.replace('.', '').isdigit() assert isinstance(self.url, str) and '://' in self.url - assert self.updated is None or isinstance(self.updated, datetime) + assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime) assert self.title is None or (isinstance(self.title, str) and self.title) assert self.tags is None or isinstance(self.tags, str) assert isinstance(self.sources, list) @@ -184,7 +184,7 @@ class Link: 'url': self.url, 'title': self.title or None, 'timestamp': self.timestamp, - 'updated': self.updated or None, + 'downloaded_at': self.downloaded_at or None, 'tags': self.tags or None, 'sources': self.sources or [], 'history': self.history or {}, @@ -210,7 +210,7 @@ class Link: 'icons': None, # only used to render static index in index/html.py, remove if no longer needed there 'bookmarked_date': self.bookmarked_date, - 'updated_date': self.updated_date, + 'downloaded_datestr': self.downloaded_datestr, 'oldest_archive_date': self.oldest_archive_date, 'newest_archive_date': self.newest_archive_date, @@ -236,7 +236,7 @@ class Link: for key, val in json_info.items() if key in cls.field_names() } - info['updated'] = parse_date(info.get('updated')) + info['downloaded_at'] = parse_date(info.get('updated') or info.get('downloaded_at')) info['sources'] = info.get('sources') or [] json_history = info.get('history') or {} @@ -347,8 +347,8 @@ class Link: @property - def updated_date(self) -> Optional[str]: - return ts_to_date_str(self.updated) if self.updated else None + def downloaded_datestr(self) -> Optional[str]: + return ts_to_date_str(self.downloaded_at) if self.downloaded_at else None @property def archive_dates(self) -> List[datetime]: diff --git a/archivebox/main.py b/archivebox/main.py index 02d377b1..a070ddb3 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -540,9 +540,9 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: last_login = User.objects.order_by('last_login').last() if last_login: print(f' Last UI login: {last_login.username} @ {str(last_login.last_login)[:16]}') - last_updated = Snapshot.objects.order_by('updated').last() - if last_updated: - print(f' Last changes: {str(last_updated.updated)[:16]}') + last_downloaded = Snapshot.objects.order_by('downloaded_at').last() + if last_downloaded: + print(f' Last changes: {str(last_downloaded.downloaded_at)[:16]}') if not users: print() @@ -550,13 +550,13 @@ def status(out_dir: Path=OUTPUT_DIR) -> None: print(' archivebox manage createsuperuser') print() - for snapshot in links.order_by('-updated')[:10]: - if not snapshot.updated: + for snapshot in links.order_by('-downloaded_at')[:10]: + if not snapshot.downloaded_at: continue print( ANSI['black'], ( - f' > {str(snapshot.updated)[:16]} ' + f' > {str(snapshot.downloaded_at)[:16]} ' f'[{snapshot.num_outputs} {("X", "√")[snapshot.is_archived]} {printable_filesize(snapshot.archive_size)}] ' f'"{snapshot.title}": {snapshot.url}' )[:TERM_WIDTH()], diff --git a/archivebox/plugantic/models.py b/archivebox/plugantic/models.py index 7ef226ec..98372eb4 100644 --- a/archivebox/plugantic/models.py +++ b/archivebox/plugantic/models.py @@ -1,50 +1 @@ __package__ = 'archivebox.plugantic' - - -# import uuid -# from django.db import models -# from typing_extensions import Self - -# from django_pydantic_field import SchemaField -# from django.conf import settings - -# from abid_utils.models import ABIDModel, ABIDField - -# # from .plugins import Plugin as PluginSchema, CORE_PLUGIN -# from .binproviders import BinProvider -# from .binaries import Binary -# from .configs import WgetOptionsConfig -# from .extractors import Extractor -# from .replayers import Replayer - - -# PLUGINS_ROOT = settings.CONFIG['OUTPUT_DIR'] / 'plugins' -# PLUGINS_ROOT.mkdir(exist_ok=True) - - -# class CustomPlugin(ABIDModel): -# abid_prefix = 'plg_' -# abid_ts_src = 'self.added' -# abid_uri_src = 'self.name' -# abid_subtype_src = '"09"' -# abid_rand_src = 'self.id' - -# id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk -# uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) -# abid = ABIDField(prefix=abid_prefix) - -# name = models.CharField(max_length=64, blank=False, unique=True) - -# path = models.FilePathField(path=str(PLUGINS_ROOT), match='*', recursive=True, allow_folders=True, allow_files=False) - -# # replayers: list[Replayer] = SchemaField() -# # binaries: list[Replayer] = SchemaField() -# # extractors: list[Replayer] = SchemaField() - - -# # @classmethod -# # def from_loaded_plugin(cls, plugin: PluginSchema) -> Self: -# # new_obj = cls( -# # schema=plugin, -# # ) -# # return new_obj diff --git a/archivebox/templates/admin/snapshots_grid.html b/archivebox/templates/admin/snapshots_grid.html index a500b07b..dbb19a41 100644 --- a/archivebox/templates/admin/snapshots_grid.html +++ b/archivebox/templates/admin/snapshots_grid.html @@ -148,7 +148,7 @@
- {{obj.added}} + {{obj.bookmarked_at}}
Last Checked
- {{updated_date}} + {{downloaded_datestr}}
diff --git a/archivebox/templates/core/snapshot_live.html b/archivebox/templates/core/snapshot_live.html index 4b219c29..fcdf04c0 100644 --- a/archivebox/templates/core/snapshot_live.html +++ b/archivebox/templates/core/snapshot_live.html @@ -379,8 +379,8 @@
- - {{oldest_archive_date|default:updated_date|default:bookmarked_date}} + + {{oldest_archive_date|default:downloaded_datestr|default:bookmarked_date}}