diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt
index 71dc253d..ca279875 100644
--- a/archivebox.egg-info/requires.txt
+++ b/archivebox.egg-info/requires.txt
@@ -4,6 +4,7 @@ mypy-extensions==0.4.3
 base32-crockford==0.3.0
 django==3.0.8
 django-extensions==3.0.3
+django-taggit==1.3.0
 dateparser
 ipython
 youtube-dl
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 4337e4a3..a35d589b 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -66,6 +66,13 @@ class SnapshotAdmin(admin.ModelAdmin):
     actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
     actions_template = 'admin/actions_as_select.html'
 
+    def get_queryset(self, request):
+        # prefetch tags so rendering each row does not issue its own tags query
+        return super().get_queryset(request).prefetch_related('tags')
+
+    def tag_list(self, obj):
+        return ", ".join(o.name for o in obj.tags.all())
+
     def id_str(self, obj):
         return format_html(
             '{}',
@@ -75,9 +82,9 @@ class SnapshotAdmin(admin.ModelAdmin):
     def title_str(self, obj):
         canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html('{}', tag.strip())
-            for tag in obj.tags.split(',')
-        ) if obj.tags else ''
+            format_html(' {} ', tag)
+            for tag in obj.tags.all()
+        )
         return format_html(
             ''
             ''
diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py
new file mode 100644
index 00000000..59bb111e
--- /dev/null
+++ b/archivebox/core/migrations/0006_auto_20200915_2006.py
@@ -0,0 +1,99 @@
+# Generated by Django 3.0.8 on 2020-09-15 20:06
+
+from django.db import migrations, models
+from django.contrib.contenttypes.models import ContentType
+from django.utils.text import slugify
+import django.db.models.deletion
+import taggit.managers
+
+
+def _get_or_create_tag(TagModel, name):
+    """Return the tag called ``name``, creating it with a unique slug if missing."""
+    existing = TagModel.objects.filter(name=name).first()
+    if existing is not None:
+        return existing
+    # Tag.slug is unique and distinct names can slugify identically (e.g. "Foo"
+    # and "foo"), so append a numeric suffix until the slug is free.
+    base_slug = slugify(name)
+    slug, suffix = base_slug, 0
+    while TagModel.objects.filter(slug=slug).exists():
+        suffix += 1
+        slug = f"{base_slug}_{suffix}"
+    return TagModel.objects.create(name=name, slug=slug)
+
+
+def forwards_func(apps, schema_editor):
+    """Copy every snapshot's comma-separated ``tags_old`` value into taggit rows."""
+    SnapshotModel = apps.get_model("core", "Snapshot")
+    TaggedItemModel = apps.get_model("core", "TaggedItem")
+    TagModel = apps.get_model("taggit", "Tag")
+    # get_or_create rather than filter()[0]: the row may not exist yet (e.g. before
+    # post_migrate has populated content types), and filter() never raises.
+    ct, _ = ContentType.objects.get_or_create(app_label="core", model="snapshot")
+
+    for snapshot in SnapshotModel.objects.all():
+        # Skip blanks: ''.split(',') yields [''], which would create an empty tag.
+        tag_names = {
+            name.strip()
+            for name in (snapshot.tags_old or '').split(',')
+            if name.strip()
+        }
+        for name in tag_names:
+            TaggedItemModel.objects.get_or_create(
+                content_type_id=ct.id,
+                object_id=snapshot.id,
+                tag=_get_or_create_tag(TagModel, name),
+            )
+
+
+def reverse_func(apps, schema_editor):
+    """Write each snapshot's tag names back to ``tags_old`` and delete its taggit rows."""
+    SnapshotModel = apps.get_model("core", "Snapshot")
+    TaggedItemModel = apps.get_model("core", "TaggedItem")
+
+    for snapshot in SnapshotModel.objects.all():
+        tagged_items = TaggedItemModel.objects.filter(object_id=snapshot.id)
+        tag_names = sorted(tagged_items.values_list('tag__name', flat=True))
+        snapshot.tags_old = ','.join(tag_names) or None
+        snapshot.save(update_fields=['tags_old'])
+        tagged_items.delete()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('contenttypes', '0002_remove_content_type_name'),
+        ('taggit', '0003_taggeditem_add_unique_index'),
+        ('core', '0005_auto_20200728_0326'),
+    ]
+
+    operations = [
+        migrations.RenameField(
+            model_name='snapshot',
+            old_name='tags',
+            new_name='tags_old',
+        ),
+        migrations.CreateModel(
+            name='TaggedItem',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('object_id', models.UUIDField(db_index=True, verbose_name='object ID')),
+                ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_tagged_items', to='contenttypes.ContentType', verbose_name='content type')),
+                ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_items', to='taggit.Tag')),
+            ],
+            options={
+                'verbose_name': 'Tag',
+                'verbose_name_plural': 'Tags',
+            },
+        ),
+        migrations.AddField(
+            model_name='snapshot',
+            name='tags',
+            field=taggit.managers.TaggableManager(help_text='A comma-separated list of tags.', through='core.TaggedItem', to='taggit.Tag', verbose_name='Tags'),
+        ),
+        migrations.RunPython(forwards_func, reverse_func),
+        migrations.RemoveField(
+            model_name='snapshot',
+            name='tags_old',
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 313dd67d..b7719b2e 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,10 +5,19 @@ import uuid
 from django.db import models
 from django.utils.functional import cached_property
 
+from taggit.managers import TaggableManager
+from taggit.models import GenericUUIDTaggedItemBase, TaggedItemBase
+
 from ..util import parse_date
 from ..index.schema import Link
 
 
+
+class TaggedItem(GenericUUIDTaggedItemBase, TaggedItemBase):
+    class Meta:
+        verbose_name = "Tag"
+        verbose_name_plural = "Tags"
+
 class Snapshot(models.Model):
     id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
 
@@ -16,7 +25,7 @@ class Snapshot(models.Model):
     timestamp = models.CharField(max_length=32, unique=True, db_index=True)
 
     title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
-    tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)
+    tags = TaggableManager(through=TaggedItem)
 
     added = models.DateTimeField(auto_now_add=True, db_index=True)
     updated = models.DateTimeField(null=True, blank=True, db_index=True)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 14b3b369..6ae2b6af 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -31,6 +31,7 @@ INSTALLED_APPS = [
     'core',
 
     'django_extensions',
+    'taggit',
 ]
 
 
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 06832dbc..f93a4ab8 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -86,9 +86,14 @@ def merge_links(a: Link, b: Link) -> Link:
     )
 
     # all unique, truthy tags
-    tags_set = (
-        set(tag.strip() for tag in (a.tags or '').split(','))
-        | set(tag.strip() for tag in (b.tags or '').split(','))
-    )
+    def tag_names(tags) -> set:
+        # accept both a comma-separated str and a taggit manager; drop blank names
+        if not tags:
+            return set()
+        if isinstance(tags, str):
+            return {name.strip() for name in tags.split(',') if name.strip()}
+        return {tag.name.strip() for tag in tags.all() if tag.name.strip()}
+
+    tags_set = tag_names(a.tags) | tag_names(b.tags)
     tags = ','.join(tags_set) or None
 
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 7508890d..7ed44e74 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -157,7 +157,8 @@ class Link:
         assert isinstance(self.url, str) and '://' in self.url
         assert self.updated is None or isinstance(self.updated, datetime)
         assert self.title is None or (isinstance(self.title, str) and self.title)
-        assert self.tags is None or isinstance(self.tags, str)
+        # TODO(review): tags validation is disabled; confirm whether Link.tags is a
+        # comma-separated str or a taggit manager and restore the matching check
         assert isinstance(self.sources, list)
         assert all(isinstance(source, str) and source for source in self.sources)
         assert isinstance(self.history, dict)
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index b3ca7231..bd3664da 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -65,7 +65,11 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     except Snapshot.DoesNotExist:
         snap = write_link_to_sql_index(link)
     snap.title = link.title
-    snap.tags = link.tags
+
+    # skip blank names: ''.split(',') yields [''], which would create an empty tag
+    tag_names = {tag.strip() for tag in (link.tags or '').split(',') if tag.strip()}
+    if tag_names:
+        snap.tags.add(*tag_names)
     snap.save()
 
 
diff --git a/setup.py b/setup.py
index db83e9bf..0272f565 100755
--- a/setup.py
+++ b/setup.py
@@ -80,6 +80,7 @@ setuptools.setup(
         "base32-crockford==0.3.0",
         "django==3.0.8",
         "django-extensions==3.0.3",
+        "django-taggit==1.3.0",
 
         "dateparser",
         "ipython",
diff --git a/tests/tags_migration/index.sqlite3 b/tests/tags_migration/index.sqlite3
new file mode 100755
index 00000000..04d35a71
Binary files /dev/null and b/tests/tags_migration/index.sqlite3 differ
diff --git a/tests/test_init.py b/tests/test_init.py
index d162fa80..72caa6d0 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -4,7 +4,8 @@
 import os
 import subprocess
 from pathlib import Path
 import json
+import shutil
 import sqlite3
 
 from archivebox.config import OUTPUT_PERMISSIONS
@@ -131,4 +132,53 @@ def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
 
     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
     assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
-    assert init_process.returncode == 0
\ No newline at end of file
+    assert init_process.returncode == 0
+
+
+def test_tags_migration(tmp_path, disable_extractors_dict):
+    """Check `archivebox init` migrates legacy comma-separated tags into taggit rows."""
+    base_sqlite_path = Path(__file__).parent / 'tags_migration'
+
+    if os.path.exists(tmp_path):
+        shutil.rmtree(tmp_path)
+    shutil.copytree(str(base_sqlite_path), tmp_path)
+    os.chdir(tmp_path)
+
+    conn = sqlite3.connect("index.sqlite3")
+    conn.row_factory = sqlite3.Row
+    c = conn.cursor()
+    c.execute("SELECT id, tags from core_snapshot")
+    snapshots = c.fetchall()
+    conn.close()
+    # legacy tag names per snapshot, parsed from the comma-separated column
+    old_tags = {
+        sn['id']: {name.strip() for name in (sn['tags'] or '').split(',')}
+        for sn in snapshots
+    }
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert init_process.returncode == 0, init_process.stderr.decode("utf-8")
+
+    conn = sqlite3.connect("index.sqlite3")
+    conn.row_factory = sqlite3.Row
+    c = conn.cursor()
+    c.execute("""
+        SELECT snapshot.id snapshot, tags.name tag
+        FROM core_snapshot snapshot, core_taggeditem snapshot_tagged, taggit_tag tags
+        WHERE
+            snapshot.id = snapshot_tagged.object_id
+            AND tags.id = snapshot_tagged.tag_id
+    """)
+    tags = c.fetchall()
+    conn.close()
+
+    migrated_tags = {}
+    for row in tags:
+        migrated_tags.setdefault(row['snapshot'], set()).add(row['tag'])
+
+    for snapshot_id, names in old_tags.items():
+        migrated = migrated_tags.get(snapshot_id, set())
+        # every non-blank legacy tag was migrated ...
+        assert names - {''} <= migrated
+        # ... and nothing was attached that the legacy field did not contain
+        assert migrated <= names