448 lines
17 KiB
Text
448 lines
17 KiB
Text
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"metadata": {}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Is CUDA or ROCm available? Yes\n",
|
|
"Available devices:\n",
|
|
"- [0] AMD Radeon RX 7900 XT [ 42 processors, 21.39 GB ]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"import torch\n",
"\n",
"# Report whether a GPU backend is usable, then list every device torch can see.\n",
"print(f\"Is CUDA or ROCm available? { 'Yes' if torch.cuda.is_available() else 'No'}\")\n",
"print(\"Available devices:\")\n",
"device_props = map(torch.cuda.get_device_properties, range(torch.cuda.device_count()))\n",
"for i, dev in enumerate(device_props):\n",
"    # total_memory is divided by 1e9 so the figure is printed in (decimal) GB.\n",
"    print(f\"- [{i}] {dev.name} [ {dev.multi_processor_count} processors, {dev.total_memory / 1_000_000_000:.2f} GB ]\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"import torch\n",
"from haystack.utils import ComponentDevice\n",
"\n",
"# Use the first GPU when one is visible; otherwise fall back to the CPU so the\n",
"# embedder and ranker cells further down still run on machines without CUDA/ROCm.\n",
"# The variable keeps the name `gpu` because later cells pass it as `device=gpu`.\n",
"gpu = ComponentDevice.from_str(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {
|
|
"metadata": {}
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
"import json\n",
"import pathlib\n",
"\n",
"dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
"data = []\n",
"\n",
"# Decode explicitly as UTF-8 so the result does not depend on the platform's\n",
"# default encoding, and stream the file line by line instead of materialising\n",
"# every line with readlines() first.\n",
"with open(dataset, \"r\", encoding=\"utf-8\") as f:\n",
"    for x in f:\n",
"        # Tolerate blank lines (e.g. an empty line at the end of the file),\n",
"        # which json.loads would otherwise reject.\n",
"        if not x.strip():\n",
"            continue\n",
"        j: dict = json.loads(x)\n",
"        # \"text\" joins title, transcript and explanation into one searchable\n",
"        # string; \"meta\" keeps the fields that are shown in the result table.\n",
"        j.update({\n",
"            \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\",\n",
"            \"meta\": {\n",
"                \"title\": j[\"title\"],\n",
"                \"url\": j[\"url\"],\n",
"                \"image_url\": j[\"image_url\"],\n",
"                \"id\": j[\"id\"]\n",
"            }\n",
"        })\n",
"        data.append(j)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>id</th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>image_title</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>image_url</th>\n",
|
|
" <th>explained_url</th>\n",
|
|
" <th>transcript</th>\n",
|
|
" <th>explanation</th>\n",
|
|
" <th>text</th>\n",
|
|
" <th>meta</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>Barrel - Part 1</td>\n",
|
|
" <td>Barrel - Part 1</td>\n",
|
|
" <td>https://www.xkcd.com/1</td>\n",
|
|
" <td>https://imgs.xkcd.com/comics/barrel_cropped_(1...</td>\n",
|
|
" <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n",
|
|
" <td>[A boy sits in a barrel which is floating in a...</td>\n",
|
|
" <td>This was the fifth comic originally posted to ...</td>\n",
|
|
" <td>Barrel - Part 1 | [A boy sits in a barrel whic...</td>\n",
|
|
" <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>Petit Trees (sketch)</td>\n",
|
|
" <td>Petit Trees (sketch)</td>\n",
|
|
" <td>https://www.xkcd.com/2</td>\n",
|
|
" <td>https://imgs.xkcd.com/comics/tree_cropped_(1).jpg</td>\n",
|
|
" <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n",
|
|
" <td>[Two trees are growing on opposite sides of a ...</td>\n",
|
|
" <td>This was the fourth comic originally posted to...</td>\n",
|
|
" <td>Petit Trees (sketch) | [Two trees are growing ...</td>\n",
|
|
" <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>3</td>\n",
|
|
" <td>Island (sketch)</td>\n",
|
|
" <td>Island (sketch)</td>\n",
|
|
" <td>https://www.xkcd.com/3</td>\n",
|
|
" <td>https://imgs.xkcd.com/comics/island_color.jpg</td>\n",
|
|
" <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n",
|
|
" <td>[A green island surrounded by blue water]\\nThi...</td>\n",
|
|
" <td>This was the third comic originally posted to ...</td>\n",
|
|
" <td>Island (sketch) | [A green island surrounded b...</td>\n",
|
|
" <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>4</td>\n",
|
|
" <td>Landscape (sketch)</td>\n",
|
|
" <td>Landscape (sketch)</td>\n",
|
|
" <td>https://www.xkcd.com/4</td>\n",
|
|
" <td>https://imgs.xkcd.com/comics/landscape_cropped...</td>\n",
|
|
" <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n",
|
|
" <td>[A sketch of a landscape with sun on the horiz...</td>\n",
|
|
" <td>This was the second comic originally posted to...</td>\n",
|
|
" <td>Landscape (sketch) | [A sketch of a landscape ...</td>\n",
|
|
" <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>5</td>\n",
|
|
" <td>Blown apart</td>\n",
|
|
" <td>Blown apart</td>\n",
|
|
" <td>https://www.xkcd.com/5</td>\n",
|
|
" <td>https://imgs.xkcd.com/comics/blownapart_color.jpg</td>\n",
|
|
" <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n",
|
|
" <td>[A black number 70 sees a red package with the...</td>\n",
|
|
" <td>This comic is a mathematical and technical jok...</td>\n",
|
|
" <td>Blown apart | [A black number 70 sees a red pa...</td>\n",
|
|
" <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" id title image_title url \\\n",
|
|
"0 1 Barrel - Part 1 Barrel - Part 1 https://www.xkcd.com/1 \n",
|
|
"1 2 Petit Trees (sketch) Petit Trees (sketch) https://www.xkcd.com/2 \n",
|
|
"2 3 Island (sketch) Island (sketch) https://www.xkcd.com/3 \n",
|
|
"3 4 Landscape (sketch) Landscape (sketch) https://www.xkcd.com/4 \n",
|
|
"4 5 Blown apart Blown apart https://www.xkcd.com/5 \n",
|
|
"\n",
|
|
" image_url \\\n",
|
|
"0 https://imgs.xkcd.com/comics/barrel_cropped_(1... \n",
|
|
"1 https://imgs.xkcd.com/comics/tree_cropped_(1).jpg \n",
|
|
"2 https://imgs.xkcd.com/comics/island_color.jpg \n",
|
|
"3 https://imgs.xkcd.com/comics/landscape_cropped... \n",
|
|
"4 https://imgs.xkcd.com/comics/blownapart_color.jpg \n",
|
|
"\n",
|
|
" explained_url \\\n",
|
|
"0 https://www.explainxkcd.com/wiki/index.php/1:_... \n",
|
|
"1 https://www.explainxkcd.com/wiki/index.php/2:_... \n",
|
|
"2 https://www.explainxkcd.com/wiki/index.php/3:_... \n",
|
|
"3 https://www.explainxkcd.com/wiki/index.php/4:_... \n",
|
|
"4 https://www.explainxkcd.com/wiki/index.php/5:_... \n",
|
|
"\n",
|
|
" transcript \\\n",
|
|
"0 [A boy sits in a barrel which is floating in a... \n",
|
|
"1 [Two trees are growing on opposite sides of a ... \n",
|
|
"2 [A green island surrounded by blue water]\\nThi... \n",
|
|
"3 [A sketch of a landscape with sun on the horiz... \n",
|
|
"4 [A black number 70 sees a red package with the... \n",
|
|
"\n",
|
|
" explanation \\\n",
|
|
"0 This was the fifth comic originally posted to ... \n",
|
|
"1 This was the fourth comic originally posted to... \n",
|
|
"2 This was the third comic originally posted to ... \n",
|
|
"3 This was the second comic originally posted to... \n",
|
|
"4 This comic is a mathematical and technical jok... \n",
|
|
"\n",
|
|
" text \\\n",
|
|
"0 Barrel - Part 1 | [A boy sits in a barrel whic... \n",
|
|
"1 Petit Trees (sketch) | [Two trees are growing ... \n",
|
|
"2 Island (sketch) | [A green island surrounded b... \n",
|
|
"3 Landscape (sketch) | [A sketch of a landscape ... \n",
|
|
"4 Blown apart | [A black number 70 sees a red pa... \n",
|
|
"\n",
|
|
" meta \n",
|
|
"0 {'title': 'Barrel - Part 1', 'url': 'https://w... \n",
|
|
"1 {'title': 'Petit Trees (sketch)', 'url': 'http... \n",
|
|
"2 {'title': 'Island (sketch)', 'url': 'https://w... \n",
|
|
"3 {'title': 'Landscape (sketch)', 'url': 'https:... \n",
|
|
"4 {'title': 'Blown apart', 'url': 'https://www.x... "
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"import pandas as pd\n",
"\n",
"# Load the parsed records into a frame and preview the first rows as a sanity check.\n",
"df = pd.DataFrame(data=data)\n",
"df.head(n=5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"from haystack import Document\n",
"\n",
"# Wrap every record as a Haystack Document: the combined text becomes the\n",
"# content and the per-comic fields travel along as metadata.\n",
"documents = []\n",
"for record in data:\n",
"    documents.append(Document(content=record[\"text\"], meta=record[\"meta\"]))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {
|
|
"metadata": {}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"application/vnd.jupyter.widget-view+json": {
|
|
"model_id": "10c0a04365264393851ab80b8bd3d3ed",
|
|
"version_major": 2,
|
|
"version_minor": 0
|
|
},
|
|
"text/plain": [
|
|
"Batches: 0%| | 0/167 [00:00<?, ?it/s]"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Finished running indexing pipeline\n",
|
|
"Result: Wrote 5343 to document store\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"from haystack import Pipeline\n",
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.components.preprocessors.document_splitter import DocumentSplitter\n",
"from haystack.components.writers import DocumentWriter\n",
"from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
"\n",
"# Model names are shared with the retrieval cell further down.\n",
"model_embeddings = \"BAAI/bge-small-en-v1.5\"\n",
"model_ranker = \"BAAI/bge-reranker-base\"\n",
"\n",
"# A fresh store is created on every run of this cell.\n",
"document_store = InMemoryDocumentStore()\n",
"\n",
"# Indexing stages: split by words -> embed each chunk -> write to the store.\n",
"document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n",
"document_embedder = SentenceTransformersDocumentEmbedder(model=model_embeddings, device=gpu)\n",
"document_writer = DocumentWriter(document_store=document_store)\n",
"\n",
"indexing_pipeline = Pipeline()\n",
"\n",
"# Register the stages under the names used for wiring and for the run() input.\n",
"indexing_stages = (\n",
"    (\"document_splitter\", document_splitter),\n",
"    (\"document_embedder\", document_embedder),\n",
"    (\"document_writer\", document_writer),\n",
")\n",
"for stage_name, stage in indexing_stages:\n",
"    indexing_pipeline.add_component(stage_name, stage)\n",
"\n",
"# Chain each stage to the one that follows it.\n",
"for (sender, _), (receiver, _) in zip(indexing_stages, indexing_stages[1:]):\n",
"    indexing_pipeline.connect(sender, receiver)\n",
"\n",
"res = indexing_pipeline.run(data={\"document_splitter\": {\"documents\": documents}})\n",
"\n",
"print(f\"Finished running indexing pipeline\\nResult: Wrote {res['document_writer']['documents_written']} to document store\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<haystack.core.pipeline.pipeline.Pipeline object at 0x7f5dd4730e80>\n",
|
|
"🚅 Components\n",
|
|
" - text_embedder: SentenceTransformersTextEmbedder\n",
|
|
" - embedding_retriever: InMemoryEmbeddingRetriever\n",
|
|
" - bm25_retriever: InMemoryBM25Retriever\n",
|
|
" - document_joiner: DocumentJoiner\n",
|
|
" - ranker: TransformersSimilarityRanker\n",
|
|
"🛤️ Connections\n",
|
|
" - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])\n",
|
|
" - embedding_retriever.documents -> document_joiner.documents (List[Document])\n",
|
|
" - bm25_retriever.documents -> document_joiner.documents (List[Document])\n",
|
|
" - document_joiner.documents -> ranker.documents (List[Document])"
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
"from haystack.components.embedders import SentenceTransformersTextEmbedder\n",
"from haystack.components.joiners import DocumentJoiner\n",
"from haystack.components.rankers import TransformersSimilarityRanker\n",
"from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n",
"\n",
"# Query-side components: an embedding retriever and a BM25 retriever over the\n",
"# same store, a joiner that merges their hits, and a ranker on the merged list.\n",
"text_embedder = SentenceTransformersTextEmbedder(model=model_embeddings, device=gpu, progress_bar=False)\n",
"embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n",
"bm25_retriever = InMemoryBM25Retriever(document_store)\n",
"document_joiner = DocumentJoiner()\n",
"ranker = TransformersSimilarityRanker(model=model_ranker, device=gpu)\n",
"\n",
"hybrid_retrieval = Pipeline()\n",
"\n",
"# Register the components under the names used for wiring and for run() inputs.\n",
"retrieval_components = (\n",
"    (\"text_embedder\", text_embedder),\n",
"    (\"embedding_retriever\", embedding_retriever),\n",
"    (\"bm25_retriever\", bm25_retriever),\n",
"    (\"document_joiner\", document_joiner),\n",
"    (\"ranker\", ranker),\n",
")\n",
"for component_name, component in retrieval_components:\n",
"    hybrid_retrieval.add_component(component_name, component)\n",
"\n",
"# Wire sender -> receiver pairs; both retrievers feed the joiner.\n",
"retrieval_edges = (\n",
"    (\"text_embedder\", \"embedding_retriever\"),\n",
"    (\"bm25_retriever\", \"document_joiner\"),\n",
"    (\"embedding_retriever\", \"document_joiner\"),\n",
"    (\"document_joiner\", \"ranker\"),\n",
")\n",
"for sender, receiver in retrieval_edges:\n",
"    hybrid_retrieval.connect(sender, receiver)\n",
"\n",
"# End on the pipeline itself so the cell displays its components and connections.\n",
"hybrid_retrieval"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
"# Save a diagram of the retrieval pipeline; the relative path resolves against\n",
"# the current working directory.\n",
"diagram_path = \"hybrid-retrieval.png\"\n",
"hybrid_retrieval.draw(diagram_path)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Results for 'dependency'\n",
|
|
"\n",
|
|
"| ID | Title | Link |\n",
|
|
"|----------|--------------------------|---------------------------|\n",
|
|
"| [ 1579 ] | Tech Loops | https://www.xkcd.com/1579 |\n",
|
|
"| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n",
|
|
"| [ 1906 ] | Making Progress | https://www.xkcd.com/1906 |\n",
|
|
"| [ 2347 ] | Dependency | https://www.xkcd.com/2347 |\n",
|
|
"| [ 1654 ] | Universal Install Script | https://www.xkcd.com/1654 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
"from tabulate import tabulate\n",
"\n",
"query = \"dependency\"\n",
"\n",
"# The same query string feeds all three entry points of the pipeline: the\n",
"# embedder (dense search), the BM25 retriever (keyword search) and the ranker.\n",
"result = hybrid_retrieval.run(\n",
"    data={\n",
"        \"text_embedder\": {\"text\": query},\n",
"        \"bm25_retriever\": {\"query\": query},\n",
"        \"ranker\": {\"query\": query, \"top_k\": 5},\n",
"    }\n",
")\n",
"\n",
"headers = [\"ID\", \"Title\", \"Link\"]\n",
"\n",
"# Show the comic number stored in meta[\"id\"]. Document.id is Haystack's own\n",
"# generated identifier for the chunk, not the comic number, so it is not what\n",
"# the ID column is meant to display.\n",
"result_table = [\n",
"    [f\"[ {doc.meta['id']:4} ]\", doc.meta[\"title\"], doc.meta[\"url\"]]\n",
"    for doc in result[\"ranker\"][\"documents\"]\n",
"    if doc is not None\n",
"]\n",
"\n",
"print(f\"Results for '{query}'\\n\")\n",
"print(tabulate(result_table, headers=headers, tablefmt=\"github\"))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|