diff --git a/data/init.sql b/data/init.sql old mode 100644 new mode 100755 index 76571af..e4dc0ab --- a/data/init.sql +++ b/data/init.sql @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:458fca4a56476e81656f8e486ee37dee786dc3e6269f11819f38d44ea1ca687e -size 224905425 +oid sha256:9152cd9c36d5b6883f2171d612b378dc88f972bff27d83c786f45cd102b96a21 +size 43633981 diff --git a/server-implementation/__pycache__/backend.cpython-310.pyc b/server-implementation/__pycache__/backend.cpython-310.pyc index bc486c3..aaba666 100644 Binary files a/server-implementation/__pycache__/backend.cpython-310.pyc and b/server-implementation/__pycache__/backend.cpython-310.pyc differ diff --git a/server-implementation/backend.py b/server-implementation/backend.py index ffdbd6d..8032108 100644 --- a/server-implementation/backend.py +++ b/server-implementation/backend.py @@ -1,15 +1,13 @@ import json import pathlib -import config_backend -if config_backend.needs_torch: - import torch +import torch from haystack import Document from haystack.utils import ComponentDevice from haystack import Pipeline + from haystack.components.embedders import SentenceTransformersDocumentEmbedder -from haystack.components.preprocessors.document_splitter import DocumentSplitter from haystack.components.writers import DocumentWriter from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore @@ -29,14 +27,18 @@ class AIBackend: document_store: PgvectorDocumentStore documents: list[Document] = [] - def __init__(self): - if config_backend.needs_torch: - get_torch_info() + def __init__(self, load_dataset = False): + get_torch_info() + try: self.gpu = ComponentDevice.from_str("cuda:0") + except: + self.gpu = None + print("No CUDA gpu device found") - dataset = pathlib.Path(__file__).parents[1] / "data" / "dataset.jsonl" - if config_backend.load_dataset: + if load_dataset: + dataset = pathlib.Path(__file__).parents[1] / "data" / "dataset.jsonl" self.documents = [ Document(content=d["text"], meta=d["meta"]) for d in load_data(dataset) ] + self.document_store = PgvectorDocumentStore( embedding_dimension=768, vector_function="cosine_similarity", @@ -50,40 +52,32 @@ class AIBackend: def warmup(self): print("Running warmup routine ...") print("Launching indexing pipeline to generate document embeddings") - res = self.index_pipeline.run({"document_splitter": {"documents": self.documents}}) + res = self.index_pipeline.run({"document_embedder": {"documents": self.documents}}) print(f"Finished running indexing pipeline\nDocument Store: Wrote {res['document_writer']['documents_written']} documents") self._ready = True print("'.query(\"text\")' is now ready to be used") def _create_indexing_pipeline(self): print("Creating indexing pipeline ...") - document_splitter = DocumentSplitter(split_by="word", split_length=128, split_overlap=4) - if config_backend.needs_torch: - document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings, device=self.gpu) - else: - document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings) + document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings, device=self.gpu) document_writer = DocumentWriter(document_store=self.document_store) indexing_pipeline = Pipeline() - indexing_pipeline.add_component("document_splitter", document_splitter) indexing_pipeline.add_component("document_embedder", document_embedder) indexing_pipeline.add_component("document_writer", document_writer) - indexing_pipeline.connect("document_splitter", "document_embedder") indexing_pipeline.connect("document_embedder", "document_writer") return indexing_pipeline def _create_query_pipeline(self): print("Creating hybrid retrival pipeline ...") - if config_backend.needs_torch: - text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings, device=self.gpu) - ranker = TransformersSimilarityRanker(model=self.model_ranker, device=self.gpu) - else: - text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings) - ranker = TransformersSimilarityRanker(model=self.model_ranker) + text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings, device=self.gpu) + ranker = TransformersSimilarityRanker(model=self.model_ranker, device=self.gpu) + embedding_retriever = PgvectorEmbeddingRetriever(document_store=self.document_store) keyword_retriever = PgvectorKeywordRetriever(document_store=self.document_store) + document_joiner = DocumentJoiner() hybrid_retrieval = Pipeline() @@ -132,7 +126,8 @@ class AIBackend: results.append({ "id": x.meta["id"], "title": x.meta["title"], - "url": x.meta["url"] + "url": x.meta["url"], + "image_url": x.meta["image_url"] }) return results diff --git a/server-implementation/config_backend.py b/server-implementation/config_backend.py deleted file mode 100644 index 7966785..0000000 --- a/server-implementation/config_backend.py +++ /dev/null @@ -1,2 +0,0 @@ -needs_torch = True -load_dataset = True \ No newline at end of file