Partition long strings into chunks for Sonic

This commit is contained in:
JDC 2020-11-23 16:54:27 -05:00 committed by Nick Sweeting
parent caf4660ac8
commit 0acf479b70
2 changed files with 8 additions and 3 deletions

View file

@ -98,4 +98,4 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
if snap := Snapshot.objects.filter(url=link.url).first(): if snap := Snapshot.objects.filter(url=link.url).first():
results = ArchiveResult.objects.indexable().filter(snapshot=snap) results = ArchiveResult.objects.indexable().filter(snapshot=snap)
texts = get_indexable_content(results) texts = get_indexable_content(results)
write_search_index(link,texts,out_dir=out_dir) write_search_index(link, texts, out_dir=out_dir)

View file

@ -5,13 +5,18 @@ from sonic import IngestClient, SearchClient
from archivebox.util import enforce_types from archivebox.util import enforce_types
from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
# Largest number of characters index() sends to sonic in a single push;
# longer texts are split into consecutive chunks of this size.
MAX_SONIC_TEXT_LENGTH = 1000
@enforce_types
def index(snapshot_id: str, texts: List[str]):
    """Push every text in *texts* to the sonic ingest channel for *snapshot_id*.

    Each text is sent with ``IngestClient.push`` together with
    ``SONIC_COLLECTION``, ``SONIC_BUCKET`` and *snapshot_id*. Texts longer
    than ``MAX_SONIC_TEXT_LENGTH`` characters are split into consecutive
    slices of at most that length, and each slice is pushed separately.

    Args:
        snapshot_id: Identifier passed to every push for these texts.
        texts: The text blobs to index.
    """
    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
        for text in texts:
            # Coerce once, so the length check and the slicing operate on
            # exactly the string that is pushed.
            text = str(text)
            # max(..., 1) keeps a single push for an empty string, matching
            # the behaviour of pushing short texts unchanged.
            for start in range(0, max(len(text), 1), MAX_SONIC_TEXT_LENGTH):
                chunk = text[start:start + MAX_SONIC_TEXT_LENGTH]
                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, chunk)
@enforce_types @enforce_types
def search(text: str) -> List[str]: def search(text: str) -> List[str]:
with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl: with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl: