wip attempt to fix timestamp unique constraint errors

Nick Sweeting 2020-08-18 08:30:09 -04:00
parent b0c0a676f8
commit f18d92570e
2 changed files with 25 additions and 16 deletions


@@ -129,7 +129,7 @@ def validate_links(links: Iterable[Link]) -> List[Link]:
     try:
         links = archivable_links(links)     # remove chrome://, about:, mailto: etc.
         links = sorted_links(links)         # deterministically sort the links based on timstamp, url
-        links = uniquefied_links(links)     # merge/dedupe duplicate timestamps & urls
+        links = fix_duplicate_links(links)  # merge/dedupe duplicate timestamps & urls
     finally:
         timer.end()
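
For context, the hunk above only renames the dedupe step; the property that matters is that validate_links filters, then sorts, then dedupes, so duplicate URLs are merged deterministically in a stable order. A self-contained sketch of that filter → sort → dedupe flow (SimpleLink and the merge rule are illustrative stand-ins, not ArchiveBox's real Link or merge_links):

    from collections import OrderedDict
    from typing import Iterable, List, NamedTuple

    class SimpleLink(NamedTuple):      # stand-in for ArchiveBox's Link
        url: str
        timestamp: str
        title: str = ''

    def keep_archivable(links: Iterable[SimpleLink]) -> Iterable[SimpleLink]:
        # drop chrome://, about:, mailto: and anything else that can't be fetched
        return (l for l in links if l.url.split('://', 1)[0] in ('http', 'https', 'ftp'))

    def dedupe_by_url(sorted_links: Iterable[SimpleLink]) -> List[SimpleLink]:
        unique: 'OrderedDict[str, SimpleLink]' = OrderedDict()
        for link in sorted_links:
            if link.url in unique:
                # toy merge rule: keep the earliest timestamp and any non-empty title
                old = unique[link.url]
                link = SimpleLink(link.url, min(old.timestamp, link.timestamp), old.title or link.title)
            unique[link.url] = link
        return list(unique.values())

    raw = [
        SimpleLink('https://example.com', '1597750002', 'Example'),
        SimpleLink('chrome://settings', '1597750001'),
        SimpleLink('https://example.com', '1597750000'),
    ]
    links = sorted(keep_archivable(raw), key=lambda l: (l.timestamp, l.url))
    print(dedupe_by_url(links))   # one example.com entry with timestamp 1597750000; chrome:// dropped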
@@ -144,34 +144,39 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
             urlparse(link.url)
         except ValueError:
             continue
-        scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
-        not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
-        if scheme_is_valid and not_blacklisted:
-            yield link
+        if scheme(link.url) not in ('http', 'https', 'ftp'):
+            continue
+        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
+            continue
+        yield link
 
 
 @enforce_types
-def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
+def fix_duplicate_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
     """
     ensures that all non-duplicate links have monotonically increasing timestamps
     """
+    from core.models import Snapshot
 
     unique_urls: OrderedDict[str, Link] = OrderedDict()
     for link in sorted_links:
-        if link.base_url in unique_urls:
+        if link.url in unique_urls:
             # merge with any other links that share the same url
-            link = merge_links(unique_urls[link.base_url], link)
-        unique_urls[link.base_url] = link
+            link = merge_links(unique_urls[link.url], link)
+        unique_urls[link.url] = link
 
-    unique_timestamps: OrderedDict[str, Link] = OrderedDict()
-    for link in unique_urls.values():
-        new_link = link.overwrite(
-            timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
-        )
-        unique_timestamps[new_link.timestamp] = new_link
-    return unique_timestamps.values()
+    # unique_timestamps: OrderedDict[str, Link] = OrderedDict()
+    # for link in unique_urls.values():
+    #     closest_non_duplicate_ts = lowest_uniq_timestamp(unique_timestamps, link.timestamp)
+    #     if closest_non_duplicate_ts != link.timestamp:
+    #         link = link.overwrite(timestamp=closest_non_duplicate_ts)
+    #         Snapshot.objects.filter(url=link.url).update(timestamp=link.timestamp)
+    #     unique_timestamps[link.timestamp] = link
+    # return unique_timestamps.values()
+    return unique_urls.values()
 
 
 @enforce_types
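
The commented-out block above relied on lowest_uniq_timestamp to nudge colliding timestamps onto the next free value before anything was written out; this commit parks that logic (keeping only URL-level dedup here) and instead preserves timestamps in the SQL writer below. The helper itself isn't part of this diff, but a rough, self-contained sketch of the "find the next free timestamp" idea (the names and the .1/.2 suffix scheme are assumptions, not the real implementation):

    from collections import OrderedDict

    def next_free_timestamp(used: 'OrderedDict[str, str]', timestamp: str) -> str:
        """Return timestamp unchanged if unused, else timestamp.1, timestamp.2, ... until free."""
        if timestamp not in used:
            return timestamp
        suffix = 1
        while f'{timestamp}.{suffix}' in used:
            suffix += 1
        return f'{timestamp}.{suffix}'

    used: 'OrderedDict[str, str]' = OrderedDict()
    for url, ts in [('https://a.example', '1597750000'),
                    ('https://b.example', '1597750000'),
                    ('https://c.example', '1597750000')]:
        ts = next_free_timestamp(used, ts)
        used[ts] = url

    print(list(used))   # ['1597750000', '1597750000.1', '1597750000.2']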


@@ -39,6 +39,10 @@ def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
     with transaction.atomic():
         for link in links:
             info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
+            try:
+                info['timestamp'] = Snapshot.objects.get(url=link.url).timestamp
+            except Snapshot.DoesNotExist:
+                pass
             Snapshot.objects.update_or_create(url=link.url, defaults=info)
 
 
 @enforce_types
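
The added try/except is the piece aimed at the unique constraint error from the commit title: if a Snapshot row already exists for the URL, its stored timestamp overrides whatever the imported Link carries, so update_or_create never tries to move an existing row onto a new (and possibly already-taken) timestamp. The same "keep the stored key if there is one" pattern, sketched outside Django with a plain dict standing in for the Snapshot table (names here are hypothetical):

    def upsert_preserving_timestamp(table: dict, url: str, info: dict) -> None:
        """Insert or update table[url], but never overwrite an already-stored timestamp."""
        existing = table.get(url)
        if existing is not None:
            # the stored timestamp may be referenced elsewhere (e.g. the archive/<timestamp>/
            # folder on disk), so keep it and only update the other fields
            info = {**info, 'timestamp': existing['timestamp']}
        table[url] = {**(existing or {}), **info}

    table: dict = {}
    upsert_preserving_timestamp(table, 'https://example.com', {'timestamp': '1597750000', 'title': 'Old'})
    upsert_preserving_timestamp(table, 'https://example.com', {'timestamp': '1597999999', 'title': 'New'})
    print(table['https://example.com'])   # {'timestamp': '1597750000', 'title': 'New'}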