xkcd-retriever/nlp.ipynb
2024-12-06 20:19:35 +01:00

448 lines
17 KiB
Text

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"metadata": {}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Is CUDA or ROCm available? Yes\n",
"Available devices:\n",
"- [0] AMD Radeon RX 7900 XT [ 42 processors, 21.39 GB ]\n"
]
}
],
"source": [
"import torch\n",
"\n",
"# Report whether torch.cuda sees a usable backend, then list every visible device.\n",
"backend_status = \"Yes\" if torch.cuda.is_available() else \"No\"\n",
"print(f\"Is CUDA or ROCm available? {backend_status}\")\n",
"print(\"Available devices:\")\n",
"for index in range(torch.cuda.device_count()):\n",
"    props = torch.cuda.get_device_properties(index)\n",
"    memory_gb = props.total_memory / 1_000_000_000  # decimal gigabytes (10^9 bytes)\n",
"    print(f\"- [{index}] {props.name} [ {props.multi_processor_count} processors, {memory_gb:.2f} GB ]\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from haystack.utils import ComponentDevice\n",
"\n",
"# Device handle passed to every model-backed component (embedders and ranker) below.\n",
"GPU_DEVICE_STRING = \"cuda:0\"\n",
"gpu = ComponentDevice.from_str(GPU_DEVICE_STRING)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"import json\n",
"import pathlib\n",
"\n",
"dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
"data = []\n",
"\n",
"# Load the JSON Lines dataset: one JSON object (one comic record) per line.\n",
"# UTF-8 is requested explicitly so decoding does not depend on the platform's locale default.\n",
"with open(dataset, \"r\", encoding=\"utf-8\") as f:\n",
"    # Iterate the file lazily instead of materialising every line with readlines().\n",
"    for line in f:\n",
"        if not line.strip():\n",
"            # Tolerate blank lines (e.g. an empty line at the end of the file).\n",
"            continue\n",
"        j: dict = json.loads(line)\n",
"        # \"text\" is the combined searchable content; \"meta\" holds the fields kept alongside it.\n",
"        j[\"text\"] = f\"{j['title']} | {j['transcript']} | {j['explanation']}\"\n",
"        j[\"meta\"] = {\n",
"            \"title\": j[\"title\"],\n",
"            \"url\": j[\"url\"],\n",
"            \"image_url\": j[\"image_url\"],\n",
"            \"id\": j[\"id\"],\n",
"        }\n",
"        data.append(j)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>title</th>\n",
" <th>image_title</th>\n",
" <th>url</th>\n",
" <th>image_url</th>\n",
" <th>explained_url</th>\n",
" <th>transcript</th>\n",
" <th>explanation</th>\n",
" <th>text</th>\n",
" <th>meta</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Barrel - Part 1</td>\n",
" <td>Barrel - Part 1</td>\n",
" <td>https://www.xkcd.com/1</td>\n",
" <td>https://imgs.xkcd.com/comics/barrel_cropped_(1...</td>\n",
" <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n",
" <td>[A boy sits in a barrel which is floating in a...</td>\n",
" <td>This was the fifth comic originally posted to ...</td>\n",
" <td>Barrel - Part 1 | [A boy sits in a barrel whic...</td>\n",
" <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Petit Trees (sketch)</td>\n",
" <td>Petit Trees (sketch)</td>\n",
" <td>https://www.xkcd.com/2</td>\n",
" <td>https://imgs.xkcd.com/comics/tree_cropped_(1).jpg</td>\n",
" <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n",
" <td>[Two trees are growing on opposite sides of a ...</td>\n",
" <td>This was the fourth comic originally posted to...</td>\n",
" <td>Petit Trees (sketch) | [Two trees are growing ...</td>\n",
" <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Island (sketch)</td>\n",
" <td>Island (sketch)</td>\n",
" <td>https://www.xkcd.com/3</td>\n",
" <td>https://imgs.xkcd.com/comics/island_color.jpg</td>\n",
" <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n",
" <td>[A green island surrounded by blue water]\\nThi...</td>\n",
" <td>This was the third comic originally posted to ...</td>\n",
" <td>Island (sketch) | [A green island surrounded b...</td>\n",
" <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Landscape (sketch)</td>\n",
" <td>Landscape (sketch)</td>\n",
" <td>https://www.xkcd.com/4</td>\n",
" <td>https://imgs.xkcd.com/comics/landscape_cropped...</td>\n",
" <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n",
" <td>[A sketch of a landscape with sun on the horiz...</td>\n",
" <td>This was the second comic originally posted to...</td>\n",
" <td>Landscape (sketch) | [A sketch of a landscape ...</td>\n",
" <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Blown apart</td>\n",
" <td>Blown apart</td>\n",
" <td>https://www.xkcd.com/5</td>\n",
" <td>https://imgs.xkcd.com/comics/blownapart_color.jpg</td>\n",
" <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n",
" <td>[A black number 70 sees a red package with the...</td>\n",
" <td>This comic is a mathematical and technical jok...</td>\n",
" <td>Blown apart | [A black number 70 sees a red pa...</td>\n",
" <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id title image_title url \\\n",
"0 1 Barrel - Part 1 Barrel - Part 1 https://www.xkcd.com/1 \n",
"1 2 Petit Trees (sketch) Petit Trees (sketch) https://www.xkcd.com/2 \n",
"2 3 Island (sketch) Island (sketch) https://www.xkcd.com/3 \n",
"3 4 Landscape (sketch) Landscape (sketch) https://www.xkcd.com/4 \n",
"4 5 Blown apart Blown apart https://www.xkcd.com/5 \n",
"\n",
" image_url \\\n",
"0 https://imgs.xkcd.com/comics/barrel_cropped_(1... \n",
"1 https://imgs.xkcd.com/comics/tree_cropped_(1).jpg \n",
"2 https://imgs.xkcd.com/comics/island_color.jpg \n",
"3 https://imgs.xkcd.com/comics/landscape_cropped... \n",
"4 https://imgs.xkcd.com/comics/blownapart_color.jpg \n",
"\n",
" explained_url \\\n",
"0 https://www.explainxkcd.com/wiki/index.php/1:_... \n",
"1 https://www.explainxkcd.com/wiki/index.php/2:_... \n",
"2 https://www.explainxkcd.com/wiki/index.php/3:_... \n",
"3 https://www.explainxkcd.com/wiki/index.php/4:_... \n",
"4 https://www.explainxkcd.com/wiki/index.php/5:_... \n",
"\n",
" transcript \\\n",
"0 [A boy sits in a barrel which is floating in a... \n",
"1 [Two trees are growing on opposite sides of a ... \n",
"2 [A green island surrounded by blue water]\\nThi... \n",
"3 [A sketch of a landscape with sun on the horiz... \n",
"4 [A black number 70 sees a red package with the... \n",
"\n",
" explanation \\\n",
"0 This was the fifth comic originally posted to ... \n",
"1 This was the fourth comic originally posted to... \n",
"2 This was the third comic originally posted to ... \n",
"3 This was the second comic originally posted to... \n",
"4 This comic is a mathematical and technical jok... \n",
"\n",
" text \\\n",
"0 Barrel - Part 1 | [A boy sits in a barrel whic... \n",
"1 Petit Trees (sketch) | [Two trees are growing ... \n",
"2 Island (sketch) | [A green island surrounded b... \n",
"3 Landscape (sketch) | [A sketch of a landscape ... \n",
"4 Blown apart | [A black number 70 sees a red pa... \n",
"\n",
" meta \n",
"0 {'title': 'Barrel - Part 1', 'url': 'https://w... \n",
"1 {'title': 'Petit Trees (sketch)', 'url': 'http... \n",
"2 {'title': 'Island (sketch)', 'url': 'https://w... \n",
"3 {'title': 'Landscape (sketch)', 'url': 'https:... \n",
"4 {'title': 'Blown apart', 'url': 'https://www.x... "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Tabular view of the loaded records; the cell displays the first five rows as a sanity check.\n",
"df = pd.DataFrame(data)\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from haystack import Document\n",
"\n",
"# Wrap every record in a haystack Document: combined text as content, the meta dict as metadata.\n",
"documents = []\n",
"for record in data:\n",
"    documents.append(Document(content=record[\"text\"], meta=record[\"meta\"]))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"metadata": {}
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "10c0a04365264393851ab80b8bd3d3ed",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Batches: 0%| | 0/167 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finished running indexing pipeline\n",
"Result: Wrote 5343 to document store\n"
]
}
],
"source": [
"from haystack import Pipeline\n",
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.components.preprocessors.document_splitter import DocumentSplitter\n",
"from haystack.components.writers import DocumentWriter\n",
"from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
"\n",
"# Model identifiers; the retrieval pipeline cell reuses both names.\n",
"model_embeddings = \"BAAI/bge-small-en-v1.5\"\n",
"model_ranker = \"BAAI/bge-reranker-base\"\n",
"\n",
"document_store = InMemoryDocumentStore()\n",
"\n",
"# Indexing stages, in order: split -> embed -> write.\n",
"# NOTE(review): split_length counts words while the embedding model limits its input in tokens;\n",
"# confirm that 512-word chunks are not truncated at embedding time.\n",
"document_splitter = DocumentSplitter(\n",
"    split_by=\"word\",\n",
"    split_length=512,\n",
"    split_overlap=32,\n",
")\n",
"document_embedder = SentenceTransformersDocumentEmbedder(\n",
"    model=model_embeddings,\n",
"    device=gpu,\n",
")\n",
"document_writer = DocumentWriter(document_store)\n",
"\n",
"indexing_pipeline = Pipeline()\n",
"\n",
"# Register the components under the names used for wiring and for the run() inputs/outputs.\n",
"for component_name, component in (\n",
"    (\"document_splitter\", document_splitter),\n",
"    (\"document_embedder\", document_embedder),\n",
"    (\"document_writer\", document_writer),\n",
"):\n",
"    indexing_pipeline.add_component(component_name, component)\n",
"\n",
"# Chain the stages: splitter output feeds the embedder, embedder output feeds the writer.\n",
"for sender, receiver in (\n",
"    (\"document_splitter\", \"document_embedder\"),\n",
"    (\"document_embedder\", \"document_writer\"),\n",
"):\n",
"    indexing_pipeline.connect(sender, receiver)\n",
"\n",
"res = indexing_pipeline.run({\"document_splitter\": {\"documents\": documents}})\n",
"\n",
"written_count = res[\"document_writer\"][\"documents_written\"]\n",
"print(f\"Finished running indexing pipeline\\nResult: Wrote {written_count} to document store\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<haystack.core.pipeline.pipeline.Pipeline object at 0x7f5dd4730e80>\n",
"🚅 Components\n",
" - text_embedder: SentenceTransformersTextEmbedder\n",
" - embedding_retriever: InMemoryEmbeddingRetriever\n",
" - bm25_retriever: InMemoryBM25Retriever\n",
" - document_joiner: DocumentJoiner\n",
" - ranker: TransformersSimilarityRanker\n",
"🛤️ Connections\n",
" - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])\n",
" - embedding_retriever.documents -> document_joiner.documents (List[Document])\n",
" - bm25_retriever.documents -> document_joiner.documents (List[Document])\n",
" - document_joiner.documents -> ranker.documents (List[Document])"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from haystack.components.embedders import SentenceTransformersTextEmbedder\n",
"from haystack.components.joiners import DocumentJoiner\n",
"from haystack.components.rankers import TransformersSimilarityRanker\n",
"from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n",
"\n",
"# Query-time components: an embedding retriever and a BM25 retriever over the same store,\n",
"# a joiner that merges their results, and a ranker applied to the merged list.\n",
"text_embedder = SentenceTransformersTextEmbedder(\n",
"    model=model_embeddings,\n",
"    device=gpu,\n",
"    progress_bar=False,\n",
")\n",
"embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n",
"bm25_retriever = InMemoryBM25Retriever(document_store)\n",
"document_joiner = DocumentJoiner()\n",
"ranker = TransformersSimilarityRanker(model=model_ranker, device=gpu)\n",
"\n",
"hybrid_retrieval = Pipeline()\n",
"for component_name, component in (\n",
"    (\"text_embedder\", text_embedder),\n",
"    (\"embedding_retriever\", embedding_retriever),\n",
"    (\"bm25_retriever\", bm25_retriever),\n",
"    (\"document_joiner\", document_joiner),\n",
"    (\"ranker\", ranker),\n",
"):\n",
"    hybrid_retrieval.add_component(component_name, component)\n",
"\n",
"# Wiring: the query embedding drives the embedding retriever; both retrievers feed the joiner;\n",
"# the joined documents go to the ranker.\n",
"for sender, receiver in (\n",
"    (\"text_embedder\", \"embedding_retriever\"),\n",
"    (\"bm25_retriever\", \"document_joiner\"),\n",
"    (\"embedding_retriever\", \"document_joiner\"),\n",
"    (\"document_joiner\", \"ranker\"),\n",
"):\n",
"    hybrid_retrieval.connect(sender, receiver)\n",
"\n",
"# Leave the pipeline as the final expression so the cell displays its component/connection summary.\n",
"hybrid_retrieval"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Render the retrieval pipeline graph to an image file (path is relative to the working directory).\n",
"diagram_path = \"hybrid-retrieval.png\"\n",
"hybrid_retrieval.draw(diagram_path)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Results for 'dependency'\n",
"\n",
"| ID | Title | Link |\n",
"|----------|--------------------------|---------------------------|\n",
"| [ 1579 ] | Tech Loops | https://www.xkcd.com/1579 |\n",
"| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n",
"| [ 1906 ] | Making Progress | https://www.xkcd.com/1906 |\n",
"| [ 2347 ] | Dependency | https://www.xkcd.com/2347 |\n",
"| [ 1654 ] | Universal Install Script | https://www.xkcd.com/1654 |\n"
]
}
],
"source": [
"from tabulate import tabulate\n",
"\n",
"query = \"dependency\"\n",
"\n",
"# The same query string is given to the text embedder, the BM25 retriever and the ranker;\n",
"# the ranker is asked for its top 5 documents.\n",
"result = hybrid_retrieval.run(\n",
"    data={\n",
"        \"text_embedder\": {\"text\": query},\n",
"        \"bm25_retriever\": {\"query\": query},\n",
"        \"ranker\": {\"query\": query, \"top_k\": 5},\n",
"    }\n",
")\n",
"\n",
"headers = [\"ID\", \"Title\", \"Link\"]\n",
"result_table = []\n",
"\n",
"x: Document | None\n",
"for x in result[\"ranker\"][\"documents\"]:\n",
"    if x is None:\n",
"        continue\n",
"    # Show the comic number stored in meta[\"id\"]. x.id is haystack's own Document identifier\n",
"    # (the documents here are built without an explicit id), not the comic number.\n",
"    result_table.append([f\"[ {x.meta['id']:4} ]\", x.meta[\"title\"], x.meta[\"url\"]])\n",
"\n",
"print(f\"Results for '{query}'\\n\")\n",
"print(tabulate(result_table, headers=headers, tablefmt=\"github\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}