improved doc import and fixed duplication glitch

Firq 2024-12-06 23:40:17 +01:00
parent 8ec5eb69ab
commit ef61b926a1
4 changed files with 21 additions and 28 deletions

BIN
data/init.sql (Stored with Git LFS) Normal file → Executable file

Binary file not shown.

View file

@@ -1,15 +1,13 @@
 import json
 import pathlib
-import config_backend
-if config_backend.needs_torch:
-    import torch
+import torch
 from haystack import Document
 from haystack.utils import ComponentDevice
 from haystack import Pipeline
 from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-from haystack.components.preprocessors.document_splitter import DocumentSplitter
 from haystack.components.writers import DocumentWriter
 from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
@@ -29,14 +27,18 @@ class AIBackend:
     document_store: PgvectorDocumentStore
     documents: list[Document] = []
 
-    def __init__(self):
-        if config_backend.needs_torch:
-            get_torch_info()
+    def __init__(self, load_dataset = False):
+        get_torch_info()
+        try:
             self.gpu = ComponentDevice.from_str("cuda:0")
+        except:
+            self.gpu = None
+            print("No CUDA gpu device found")
 
-        dataset = pathlib.Path(__file__).parents[1] / "data" / "dataset.jsonl"
-        if config_backend.load_dataset:
+        if load_dataset:
+            dataset = pathlib.Path(__file__).parents[1] / "data" / "dataset.jsonl"
             self.documents = [ Document(content=d["text"], meta=d["meta"]) for d in load_data(dataset) ]
         self.document_store = PgvectorDocumentStore(
             embedding_dimension=768,
             vector_function="cosine_similarity",
@@ -50,40 +52,32 @@ class AIBackend:
     def warmup(self):
         print("Running warmup routine ...")
         print("Launching indexing pipeline to generate document embeddings")
-        res = self.index_pipeline.run({"document_splitter": {"documents": self.documents}})
+        res = self.index_pipeline.run({"document_embedder": {"documents": self.documents}})
         print(f"Finished running indexing pipeline\nDocument Store: Wrote {res['document_writer']['documents_written']} documents")
         self._ready = True
         print("'.query(\"text\")' is now ready to be used")
 
     def _create_indexing_pipeline(self):
         print("Creating indexing pipeline ...")
-        document_splitter = DocumentSplitter(split_by="word", split_length=128, split_overlap=4)
-        if config_backend.needs_torch:
-            document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings, device=self.gpu)
-        else:
-            document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings)
+        document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings, device=self.gpu)
         document_writer = DocumentWriter(document_store=self.document_store)
 
         indexing_pipeline = Pipeline()
-        indexing_pipeline.add_component("document_splitter", document_splitter)
         indexing_pipeline.add_component("document_embedder", document_embedder)
         indexing_pipeline.add_component("document_writer", document_writer)
 
-        indexing_pipeline.connect("document_splitter", "document_embedder")
         indexing_pipeline.connect("document_embedder", "document_writer")
 
         return indexing_pipeline
 
     def _create_query_pipeline(self):
         print("Creating hybrid retrival pipeline ...")
-        if config_backend.needs_torch:
-            text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings, device=self.gpu)
-            ranker = TransformersSimilarityRanker(model=self.model_ranker, device=self.gpu)
-        else:
-            text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings)
-            ranker = TransformersSimilarityRanker(model=self.model_ranker)
+        text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings, device=self.gpu)
+        ranker = TransformersSimilarityRanker(model=self.model_ranker, device=self.gpu)
         embedding_retriever = PgvectorEmbeddingRetriever(document_store=self.document_store)
         keyword_retriever = PgvectorKeywordRetriever(document_store=self.document_store)
         document_joiner = DocumentJoiner()
 
         hybrid_retrieval = Pipeline()
@@ -132,7 +126,8 @@ class AIBackend:
             results.append({
                 "id": x.meta["id"],
                 "title": x.meta["title"],
-                "url": x.meta["url"]
+                "url": x.meta["url"],
+                "image_url": x.meta["image_url"]
             })
         return results

View file

@@ -1,2 +0,0 @@
-needs_torch = True
-load_dataset = True
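
Illustrative usage sketch (not part of the commit): with these changes, torch is imported unconditionally, GPU selection falls back automatically when CUDA is unavailable, and dataset loading is controlled by the new constructor argument instead of the deleted config_backend flags. A minimal example, assuming AIBackend is importable; the import path below is a placeholder.

# Minimal sketch of the changed interface; the module path "backend" is a placeholder.
from backend import AIBackend

# Dataset loading is now opt-in via the constructor rather than config_backend.load_dataset;
# if no CUDA device is found, self.gpu is set to None and a message is printed.
backend = AIBackend(load_dataset=True)

# warmup() now feeds documents directly to the "document_embedder" component
# (the splitter step was removed), so each document is embedded and written once.
backend.warmup()

# .query("text") becomes available after warmup, as printed by the warmup routine.
results = backend.query("example question")
print(results)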