From 4427869ae84928dfa7c38a19429f7ad5cd252637 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Wed, 4 Sep 2024 02:02:29 -0700 Subject: [PATCH] fix ABID generation by chopping ts_src precision to consistent length --- archivebox/abid_utils/abid.py | 11 +- archivebox/abid_utils/admin.py | 29 +++--- archivebox/abid_utils/models.py | 174 +++++++++++++++++--------------- archivebox/core/admin.py | 4 +- archivebox/core/models.py | 13 +-- 5 files changed, 122 insertions(+), 109 deletions(-) diff --git a/archivebox/abid_utils/abid.py b/archivebox/abid_utils/abid.py index e294e6a5..c7fe8fb9 100644 --- a/archivebox/abid_utils/abid.py +++ b/archivebox/abid_utils/abid.py @@ -114,7 +114,7 @@ class ABID(NamedTuple): @enforce_types def uri_hash(uri: Union[str, bytes], salt: str=DEFAULT_ABID_URI_SALT) -> str: """ - 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25' + https://example.com -> 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25' (example.com) """ if isinstance(uri, bytes): uri_str: str = uri.decode() @@ -130,6 +130,7 @@ def uri_hash(uri: Union[str, bytes], salt: str=DEFAULT_ABID_URI_SALT) -> str: except AttributeError: pass + # the uri hash is the sha256 of the domain + salt uri_bytes = uri_str.encode('utf-8') + salt.encode('utf-8') return hashlib.sha256(uri_bytes).hexdigest().upper() @@ -162,7 +163,11 @@ def abid_part_from_ts(ts: datetime) -> str: return str(ulid.from_timestamp(ts))[:ABID_TS_LEN] @enforce_types -def abid_part_from_subtype(subtype: str) -> str: +def ts_from_abid(abid: str) -> datetime: + return ulid.parse(abid.split('_', 1)[-1]).timestamp().datetime + +@enforce_types +def abid_part_from_subtype(subtype: str | int) -> str: """ Snapshots have 01 type, other objects have other subtypes like wget/media/etc. Also allows us to change the ulid spec later by putting special sigil values here. @@ -196,7 +201,7 @@ def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str: @enforce_types -def abid_hashes_from_values(prefix: str, ts: datetime, uri: str, subtype: str, rand: Union[str, UUID, None, int], salt: str=DEFAULT_ABID_URI_SALT) -> Dict[str, str]: +def abid_hashes_from_values(prefix: str, ts: datetime, uri: str, subtype: str | int, rand: Union[str, UUID, None, int], salt: str=DEFAULT_ABID_URI_SALT) -> Dict[str, str]: return { 'prefix': abid_part_from_prefix(prefix), 'ts': abid_part_from_ts(ts), diff --git a/archivebox/abid_utils/admin.py b/archivebox/abid_utils/admin.py index 46adf3f7..95e48641 100644 --- a/archivebox/abid_utils/admin.py +++ b/archivebox/abid_utils/admin.py @@ -27,7 +27,7 @@ def get_abid_info(self, obj, request=None): try: abid_diff = f' != obj.ABID: {highlight_diff(obj.ABID, obj.abid)} ❌' if str(obj.ABID) != str(obj.abid) else ' == .ABID ✅' - fresh_abid = obj.ABID_FRESH + fresh_abid = obj.ABID fresh_abid_diff = f' !=   .fresh_abid: {highlight_diff(fresh_abid, obj.ABID)} ❌' if str(fresh_abid) != str(obj.ABID) else '✅' fresh_uuid_diff = f' !=   .fresh_uuid: {highlight_diff(fresh_abid.uuid, obj.ABID.uuid)} ❌' if str(fresh_abid.uuid) != str(obj.ABID.uuid) else '✅' @@ -35,17 +35,17 @@ def get_abid_info(self, obj, request=None): id_abid_diff = f' != .abid.uuid: {highlight_diff(obj.ABID.uuid, obj.id)} ❌' if str(obj.id) != str(obj.ABID.uuid) else ' == .abid ✅' id_pk_diff = f' != .pk: {highlight_diff(obj.pk, obj.id)} ❌' if str(obj.pk) != str(obj.id) else ' == .pk ✅' - source_ts_val = parse_date(obj.abid_values['ts']) or None - derived_ts = abid_part_from_ts(source_ts_val) if source_ts_val else None + fresh_ts = parse_date(obj.ABID_FRESH_VALUES['ts']) or None + derived_ts = abid_part_from_ts(fresh_ts) if fresh_ts else None ts_diff = f'!= {highlight_diff(derived_ts, obj.ABID.ts)} ❌' if derived_ts != obj.ABID.ts else '✅' - derived_uri = abid_part_from_uri(obj.abid_values['uri']) + derived_uri = abid_part_from_uri(obj.ABID_FRESH_VALUES['uri']) uri_diff = f'!= {highlight_diff(derived_uri, obj.ABID.uri)} ❌' if derived_uri != obj.ABID.uri else '✅' - derived_subtype = abid_part_from_subtype(obj.abid_values['subtype']) + derived_subtype = abid_part_from_subtype(obj.ABID_FRESH_VALUES['subtype']) subtype_diff = f'!= {highlight_diff(derived_subtype, obj.ABID.subtype)} ❌' if derived_subtype != obj.ABID.subtype else '✅' - derived_rand = abid_part_from_rand(obj.abid_values['rand']) + derived_rand = abid_part_from_rand(obj.ABID_FRESH_VALUES['rand']) rand_diff = f'!= {highlight_diff(derived_rand, obj.ABID.rand)} ❌' if derived_rand != obj.ABID.rand else '✅' # any_abid_discrepancies = any( @@ -60,9 +60,9 @@ def get_abid_info(self, obj, request=None): {}     📖 API DOCS

-     .abid:                   {}                 {}
-     .abid.uuid:           {}     {}
    .id:                       {}     {}
+     .abid.uuid:           {}     {}
+     .abid:                   {}                 {}

    TS:                  {}   {}        {} {}: {}
    URI:                 {}     {}           {} {}: {}
@@ -73,15 +73,15 @@ def get_abid_info(self, obj, request=None):
''', obj.api_url + (f'?api_key={get_or_create_api_token(request.user)}' if request and request.user else ''), obj.api_url, obj.api_docs_url, + highlight_diff(obj.id, obj.ABID.uuid), mark_safe(id_pk_diff + id_abid_diff), + highlight_diff(obj.ABID.uuid, obj.id), mark_safe(fresh_uuid_diff), highlight_diff(obj.abid, fresh_abid), mark_safe(fresh_abid_diff), - highlight_diff(obj.ABID.uuid, fresh_abid.uuid), mark_safe(fresh_uuid_diff), - str(obj.id), mark_safe(id_pk_diff + id_abid_diff + id_fresh_abid_diff), # str(fresh_abid.uuid), mark_safe(fresh_uuid_diff), # str(fresh_abid), mark_safe(fresh_abid_diff), - highlight_diff(obj.ABID.ts, derived_ts), highlight_diff(str(obj.ABID.uuid)[0:14], str(fresh_abid.uuid)[0:14]), mark_safe(ts_diff), obj.abid_ts_src, source_ts_val and source_ts_val.isoformat(), - highlight_diff(obj.ABID.uri, derived_uri), highlight_diff(str(obj.ABID.uuid)[14:26], str(fresh_abid.uuid)[14:26]), mark_safe(uri_diff), obj.abid_uri_src, str(obj.abid_values['uri']), - highlight_diff(obj.ABID.subtype, derived_subtype), highlight_diff(str(obj.ABID.uuid)[26:28], str(fresh_abid.uuid)[26:28]), mark_safe(subtype_diff), obj.abid_subtype_src, str(obj.abid_values['subtype']), - highlight_diff(obj.ABID.rand, derived_rand), highlight_diff(str(obj.ABID.uuid)[28:36], str(fresh_abid.uuid)[28:36]), mark_safe(rand_diff), obj.abid_rand_src, str(obj.abid_values['rand'])[-7:], + highlight_diff(obj.ABID.ts, derived_ts), highlight_diff(str(obj.ABID.uuid)[0:14], str(fresh_abid.uuid)[0:14]), mark_safe(ts_diff), obj.abid_ts_src, fresh_ts and fresh_ts.isoformat(), + highlight_diff(obj.ABID.uri, derived_uri), highlight_diff(str(obj.ABID.uuid)[14:26], str(fresh_abid.uuid)[14:26]), mark_safe(uri_diff), obj.abid_uri_src, str(obj.ABID_FRESH_VALUES['uri']), + highlight_diff(obj.ABID.subtype, derived_subtype), highlight_diff(str(obj.ABID.uuid)[26:28], str(fresh_abid.uuid)[26:28]), mark_safe(subtype_diff), obj.abid_subtype_src, str(obj.ABID_FRESH_VALUES['subtype']), + highlight_diff(obj.ABID.rand, derived_rand), highlight_diff(str(obj.ABID.uuid)[28:36], str(fresh_abid.uuid)[28:36]), mark_safe(rand_diff), obj.abid_rand_src, str(obj.ABID_FRESH_VALUES['rand'])[-7:], highlight_diff(getattr(obj, 'old_id', ''), obj.pk), ) except Exception as e: @@ -93,6 +93,7 @@ class ABIDModelAdmin(admin.ModelAdmin): sort_fields = ('created', 'created_by', 'abid', '__str__') readonly_fields = ('created', 'modified', '__str__', 'API') + @admin.display(description='API Identifiers') def API(self, obj): return get_abid_info(self, obj, request=self.request) diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index 7bdd89a8..93ce69ab 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -1,7 +1,5 @@ """ This file provides the Django ABIDField and ABIDModel base model to inherit from. - -It implements the ArchiveBox ID (ABID) interfaces including abid_values, generate_abid, .abid, .uuid, .id. """ from typing import Any, Dict, Union, List, Set, NamedTuple, cast @@ -9,7 +7,7 @@ from typing import Any, Dict, Union, List, Set, NamedTuple, cast from ulid import ULID from uuid import uuid4, UUID from typeid import TypeID # type: ignore[import-untyped] -from datetime import datetime +from datetime import datetime, timedelta from functools import partial from charidfield import CharIDField # type: ignore[import-untyped] @@ -30,7 +28,10 @@ from .abid import ( DEFAULT_ABID_PREFIX, DEFAULT_ABID_URI_SALT, abid_part_from_prefix, - abid_from_values + abid_hashes_from_values, + abid_from_values, + ts_from_abid, + abid_part_from_ts, ) #################################################### @@ -63,134 +64,141 @@ def get_or_create_system_user_pk(username='system'): class AutoDateTimeField(models.DateTimeField): - def pre_save(self, model_instance, add): - return timezone.now() + # def pre_save(self, model_instance, add): + # return timezone.now() + pass class ABIDModel(models.Model): """ Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface. """ - abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_' + abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_' abid_ts_src = 'None' # e.g. 'self.created' abid_uri_src = 'None' # e.g. 'self.uri' abid_subtype_src = 'None' # e.g. 'self.extractor' abid_rand_src = 'None' # e.g. 'self.uuid' or 'self.id' + abid_salt: str = DEFAULT_ABID_URI_SALT # id = models.UUIDField(primary_key=True, default=uuid4, editable=True) # uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk) - created = AutoDateTimeField(default=timezone.now, db_index=True) + created = AutoDateTimeField(default=None, null=False, db_index=True) modified = models.DateTimeField(auto_now=True) class Meta(TypedModelMeta): abstract = True def save(self, *args: Any, **kwargs: Any) -> None: - self.created = self.created or timezone.now() - - assert all(val for val in self.abid_values.values()), f'All ABID src values must be set: {self.abid_values}' - if self._state.adding: - self.id = self.ABID.uuid - self.abid = str(self.ABID) - else: - assert self.id, 'id must be set when object exists in DB' - if not self.abid: - self.abid = str(self.ABID) - # assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}' - - # fresh_abid = self.generate_abid() - # if str(fresh_abid) != str(self.abid): - # self.abid = str(fresh_abid) - + self.issue_new_abid() return super().save(*args, **kwargs) - assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}' - assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}' - assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})' + # assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}' + # assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}' + # assert str(self.uuid) == str(self.ABID.uuid), f'self.uuid ({self.uuid}) does not match .ABID.uuid ({self.ABID.uuid})' @property - def abid_values(self) -> Dict[str, Any]: + def ABID_FRESH_VALUES(self) -> Dict[str, Any]: + assert self.abid_ts_src != 'None' + assert self.abid_uri_src != 'None' + assert self.abid_rand_src != 'None' + assert self.abid_subtype_src != 'None' return { 'prefix': self.abid_prefix, 'ts': eval(self.abid_ts_src), 'uri': eval(self.abid_uri_src), 'subtype': eval(self.abid_subtype_src), 'rand': eval(self.abid_rand_src), + 'salt': self.abid_salt, } + + @property + def ABID_FRESH_HASHES(self) -> Dict[str, str]: + return abid_hashes_from_values(**self.ABID_FRESH_VALUES) - def generate_abid(self) -> ABID: + + @property + def ABID_FRESH(self) -> ABID: """ - Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). + Return a pure freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). """ - prefix, ts, uri, subtype, rand = self.abid_values.values() - if (not prefix) or prefix == DEFAULT_ABID_PREFIX: - suggested_abid = self.__class__.__name__[:3].lower() - raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})') - - if not ts: - # default to unix epoch with 00:00:00 UTC - ts = datetime.fromtimestamp(0, timezone.utc) # equivalent to: ts = datetime.utcfromtimestamp(0) - print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat()) - - if not uri: - uri = str(self) - print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri) - - if not subtype: - subtype = self.__class__.__name__ - print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype) - - if not rand: - rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk') - print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand) - - abid = abid_from_values( - prefix=prefix, - ts=ts, - uri=uri, - subtype=subtype, - rand=rand, - salt=DEFAULT_ABID_URI_SALT, - ) - assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}' + abid_fresh_values = self.ABID_FRESH_VALUES + assert all(abid_fresh_values.values()), f'All ABID_FRESH_VALUES must be set {abid_fresh_values}' + abid_fresh_hashes = self.ABID_FRESH_HASHES + assert all(abid_fresh_hashes.values()), f'All ABID_FRESH_HASHES must be able to be generated {abid_fresh_hashes}' + + abid = ABID(**abid_fresh_hashes) + + assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {abid_fresh_values["prefix"]}_ABID for {self.__class__.__name__}' return abid + + def issue_new_abid(self): + assert self.abid is None, f'Can only issue new ABID for new objects that dont already have one {self.abid}' + assert self._state.adding, 'Can only issue new ABID when model._state.adding is True' + assert eval(self.abid_uri_src), f'Can only issue new ABID if self.abid_uri_src is defined ({self.abid_uri_src}={eval(self.abid_uri_src)})' + + self.old_id = getattr(self, 'old_id', None) or self.id or uuid4() + self.abid = None + self.created = ts_from_abid(abid_part_from_ts(timezone.now())) # cut off precision to match precision of TS component + self.added = getattr(self, 'added', None) or self.created + self.modified = self.created + abid_ts_src_attr = self.abid_ts_src.split('self.', 1)[-1] # e.g. 'self.added' -> 'added' + if abid_ts_src_attr and abid_ts_src_attr != 'created' and hasattr(self, abid_ts_src_attr): + # self.added = self.created + existing_abid_ts = getattr(self, abid_ts_src_attr, None) + created_and_abid_ts_are_same = existing_abid_ts and (existing_abid_ts - self.created) < timedelta(seconds=5) + if created_and_abid_ts_are_same: + setattr(self, abid_ts_src_attr, self.created) + assert getattr(self, abid_ts_src_attr) == self.created + + assert all(self.ABID_FRESH_VALUES.values()), f'Can only issue new ABID if all self.ABID_FRESH_VALUES are defined {self.ABID_FRESH_VALUES}' + + new_abid = self.ABID_FRESH + + # store stable ABID on local fields, overwrite them because we are adding a new entry and existing defaults havent touched db yet + self.abid = str(new_abid) + self.id = new_abid.uuid + self.pk = new_abid.uuid + + assert self.ABID == new_abid + assert str(self.ABID.uuid) == str(self.id) == str(self.pk) == str(ABID.parse(self.abid).uuid) + + self._ready_to_save_as_new = True + + @property def ABID(self) -> ABID: """ - ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') + aka get_or_generate_abid -> ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') """ - - # if object is not yet saved to DB, always generate fresh ABID from values - if self._state.adding: - return self.generate_abid() - + # otherwise DB is single source of truth, load ABID from existing db pk abid: ABID | None = None - try: - abid = abid or ABID.parse(self.pk) - except Exception: - pass - - try: - abid = abid or ABID.parse(self.id) - except Exception: - pass - try: abid = abid or ABID.parse(cast(str, self.abid)) except Exception: pass - abid = abid or self.generate_abid() + try: + abid = abid or ABID.parse(cast(str, self.id)) + except Exception: + pass + + try: + abid = abid or ABID.parse(cast(str, self.pk)) + except Exception: + pass + + abid = abid or self.ABID_FRESH return abid + @property def ULID(self) -> ULID: """ @@ -210,8 +218,7 @@ class ABIDModel(models.Model): """ Get a str uuid.UUID (v4) representation of the object's ABID. """ - assert str(self.id) == str(self.ABID.uuid) - return str(self.id) + return str(self.ABID.uuid) @property def TypeID(self) -> TypeID: @@ -220,6 +227,10 @@ class ABIDModel(models.Model): """ return self.ABID.typeid + @property + def abid_uri(self) -> str: + return eval(self.abid_uri_src) + @property def api_url(self) -> str: # /api/v1/core/any/{abid} @@ -290,6 +301,7 @@ def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDMode """ Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow). e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ') + Honestly should only be used for debugging, no reason to expose this ability to users. """ # convert str to ABID if necessary @@ -339,7 +351,7 @@ def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDMode ) for obj in qs: - if obj.generate_abid() == abid: + if abid in (str(obj.ABID_FRESH), str(obj.id), str(obj.abid)): # found exact match, no need to keep iterating return [obj] partial_matches.append(obj) diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 832a9348..d77449d0 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -353,10 +353,10 @@ class SnapshotActionForm(ActionForm): class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): list_display = ('added', 'title_str', 'files', 'size', 'url_str') sort_fields = ('title_str', 'url_str', 'added') - readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'API', 'link_dir') + readonly_fields = ('tags_str', 'timestamp', 'admin_actions', 'status_info', 'bookmarked', 'updated', 'created', 'modified', 'API', 'link_dir') search_fields = ('id', 'url', 'abid', 'old_id', 'timestamp', 'title', 'tags__name') list_filter = ('added', 'updated', 'archiveresult__status', 'created_by', 'tags__name') - fields = ('url', 'created_by', 'title', *readonly_fields) + fields = ('url', 'created_by', 'title', 'added', *readonly_fields) ordering = ['-added'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] inlines = [TagInline, ArchiveResultInline] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 20c70797..a76a86c9 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -138,16 +138,16 @@ class Snapshot(ABIDModel): abid_subtype_src = '"01"' abid_rand_src = 'self.old_id' - old_id = models.UUIDField(default=uuid.uuid4, editable=False, unique=True) # legacy pk - id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True, unique=True) + old_id = models.UUIDField(default=None, null=False, editable=False, unique=True) # legacy pk + id = models.UUIDField(default=None, null=False, primary_key=True, editable=True, unique=True) abid = ABIDField(prefix=abid_prefix) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, related_name='snapshot_set') - created = AutoDateTimeField(default=timezone.now, db_index=True) + created = AutoDateTimeField(default=None, null=False, db_index=True) modified = models.DateTimeField(auto_now=True) # legacy ts fields - added = AutoDateTimeField(default=timezone.now, db_index=True) + added = AutoDateTimeField(default=None, null=False, editable=True, db_index=True) updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True) url = models.URLField(unique=True, db_index=True) @@ -161,11 +161,6 @@ class Snapshot(ABIDModel): objects = SnapshotManager() - def save(self, *args, **kwargs): - # make sure self.added is seeded with a value before calculating ABID using it - if self._state.adding or not self.added: - self.added = self.added or timezone.now() - return super().save(*args, **kwargs) def __repr__(self) -> str: title = (self.title_stripped or '-')[:64]