improved doc import and fixed duplication glitch

parent 8ec5eb69ab
commit ef61b926a1
4 changed files with 21 additions and 28 deletions
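In short: torch is now imported unconditionally rather than behind a config flag, GPU selection falls back to CPU at runtime, dataset loading moves from a module-level flag to a load_dataset constructor argument, the DocumentSplitter stage is removed from the indexing pipeline (apparently the source of the duplicated writes), and query results gain an image_url field.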
data/init.sql (BIN, stored with Git LFS): Normal file → Executable file
Binary file not shown.
@@ -1,15 +1,13 @@
 import json
 import pathlib
-import config_backend
 
-if config_backend.needs_torch:
-    import torch
+import torch
 
 from haystack import Document
 from haystack.utils import ComponentDevice
 from haystack import Pipeline
 
 from haystack.components.embedders import SentenceTransformersDocumentEmbedder
-from haystack.components.preprocessors.document_splitter import DocumentSplitter
 from haystack.components.writers import DocumentWriter
 
 from haystack_integrations.document_stores.pgvector import PgvectorDocumentStore
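With config_backend gone, torch is imported unconditionally and device selection moves from import time to runtime. A minimal sketch of such a probe, assuming torch's standard CUDA check; detect_gpu is a hypothetical helper, not part of this commit:

import torch
from haystack.utils import ComponentDevice

def detect_gpu() -> ComponentDevice | None:
    # Hypothetical helper mirroring the new __init__: prefer the first CUDA
    # device when torch can see one, otherwise return None and let Haystack
    # components pick their own device.
    if torch.cuda.is_available():
        return ComponentDevice.from_str("cuda:0")
    return None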
@@ -29,14 +27,18 @@ class AIBackend:
     document_store: PgvectorDocumentStore
     documents: list[Document] = []
 
-    def __init__(self):
-        if config_backend.needs_torch:
-            get_torch_info()
+    def __init__(self, load_dataset = False):
+        get_torch_info()
+        try:
             self.gpu = ComponentDevice.from_str("cuda:0")
+        except:
+            self.gpu = None
+            print("No CUDA gpu device found")
 
-        dataset = pathlib.Path(__file__).parents[1] / "data" / "dataset.jsonl"
-        if config_backend.load_dataset:
+        if load_dataset:
+            dataset = pathlib.Path(__file__).parents[1] / "data" / "dataset.jsonl"
             self.documents = [ Document(content=d["text"], meta=d["meta"]) for d in load_data(dataset) ]
 
         self.document_store = PgvectorDocumentStore(
             embedding_dimension=768,
             vector_function="cosine_similarity",
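The dataset is now read only when the caller opts in via load_dataset, instead of being controlled by config_backend.load_dataset. load_data itself is not part of this commit; a sketch of what it plausibly looks like given how __init__ consumes it (one JSON object per line, each with "text" and "meta" keys):

import json
import pathlib

def load_data(path: pathlib.Path) -> list[dict]:
    # Assumed shape: dataset.jsonl holds one JSON object per line,
    # each carrying the "text" and "meta" keys used by __init__.
    with path.open(encoding="utf-8") as fh:
        return [json.loads(line) for line in fh if line.strip()]

Hypothetical usage of the new flag: AIBackend() serves queries against already-indexed data, while AIBackend(load_dataset=True) also loads data/dataset.jsonl for (re)indexing.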
@@ -50,40 +52,32 @@ class AIBackend:
     def warmup(self):
         print("Running warmup routine ...")
         print("Launching indexing pipeline to generate document embeddings")
-        res = self.index_pipeline.run({"document_splitter": {"documents": self.documents}})
+        res = self.index_pipeline.run({"document_embedder": {"documents": self.documents}})
         print(f"Finished running indexing pipeline\nDocument Store: Wrote {res['document_writer']['documents_written']} documents")
         self._ready = True
         print("'.query(\"text\")' is now ready to be used")
 
     def _create_indexing_pipeline(self):
         print("Creating indexing pipeline ...")
-        document_splitter = DocumentSplitter(split_by="word", split_length=128, split_overlap=4)
-        if config_backend.needs_torch:
-            document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings, device=self.gpu)
-        else:
-            document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings)
+        document_embedder = SentenceTransformersDocumentEmbedder(model=self.model_embeddings, device=self.gpu)
         document_writer = DocumentWriter(document_store=self.document_store)
 
         indexing_pipeline = Pipeline()
-        indexing_pipeline.add_component("document_splitter", document_splitter)
         indexing_pipeline.add_component("document_embedder", document_embedder)
         indexing_pipeline.add_component("document_writer", document_writer)
 
-        indexing_pipeline.connect("document_splitter", "document_embedder")
         indexing_pipeline.connect("document_embedder", "document_writer")
 
         return indexing_pipeline
 
     def _create_query_pipeline(self):
         print("Creating hybrid retrival pipeline ...")
-        if config_backend.needs_torch:
-            text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings, device=self.gpu)
-            ranker = TransformersSimilarityRanker(model=self.model_ranker, device=self.gpu)
-        else:
-            text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings)
-            ranker = TransformersSimilarityRanker(model=self.model_ranker)
+        text_embedder = SentenceTransformersTextEmbedder(model=self.model_embeddings, device=self.gpu)
+        ranker = TransformersSimilarityRanker(model=self.model_ranker, device=self.gpu)
         embedding_retriever = PgvectorEmbeddingRetriever(document_store=self.document_store)
         keyword_retriever = PgvectorKeywordRetriever(document_store=self.document_store)
 
         document_joiner = DocumentJoiner()
 
         hybrid_retrieval = Pipeline()
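With the splitter gone, documents are embedded and written exactly as loaded, which is plausibly where the duplicated entries came from (each warmup re-splitting and re-writing overlapping chunks). A sketch of the resulting two-stage pipeline; the DuplicatePolicy argument shown is an optional extra guard that Haystack's DocumentWriter accepts, not something this commit adds:

from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy

def build_indexing_pipeline(document_store, model: str, device=None) -> Pipeline:
    # Two stages only: embed, then write. OVERWRITE makes repeated
    # warmup runs idempotent instead of accumulating copies.
    pipeline = Pipeline()
    pipeline.add_component(
        "document_embedder",
        SentenceTransformersDocumentEmbedder(model=model, device=device),
    )
    pipeline.add_component(
        "document_writer",
        DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE),
    )
    pipeline.connect("document_embedder", "document_writer")
    return pipeline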
@@ -132,7 +126,8 @@ class AIBackend:
             results.append({
                 "id": x.meta["id"],
                 "title": x.meta["title"],
-                "url": x.meta["url"]
+                "url": x.meta["url"],
+                "image_url": x.meta["image_url"]
             })
         return results
 
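Each query result now also exposes the document's image URL. An illustrative entry in the returned list (values invented); note this assumes every indexed document's meta carries an image_url key, since x.meta["image_url"] raises KeyError otherwise:

example_result = {
    "id": "doc-42",                                        # x.meta["id"]
    "title": "Example document",                           # x.meta["title"]
    "url": "https://example.com/docs/42",                  # x.meta["url"]
    "image_url": "https://example.com/docs/42/thumb.jpg",  # new in this commit
}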
@@ -1,2 +0,0 @@
-needs_torch = True
-load_dataset = True
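These two flags are evidently the config_backend module the old code imported: with torch now a hard dependency and dataset loading moved to the load_dataset constructor argument, nothing reads needs_torch or load_dataset at module level anymore, so the file is deleted outright.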