Initial commit

2024-11-29 14:45:19 +01:00 · 2024-11-29 14:45:19 +01:00 · b135ce28c5
commit b135ce28c5
5 changed files with 439 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+data/**/* filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,7 @@
+# env
+.env
+*venv/
+
+# Binaries for installation
+*.deb
+*.whl
--- a/data/dataset.jsonl
+++ b/data/dataset.jsonl
--- a/hybrid-retrieval.png
+++ b/hybrid-retrieval.png
--- a/nlp.ipynb
+++ b/nlp.ipynb
@@ -0,0 +1,428 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "metadata": {}
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Is CUDA or ROCm available? Yes\n",
+      "Available devices:\n",
+      "- [0] AMD Radeon RX 7900 XT [ 42 processors, 21.39 GB ]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "# Report whether torch.cuda sees a usable GPU backend, then list every visible device.\n",
+    "gpu_status = \"Yes\" if torch.cuda.is_available() else \"No\"\n",
+    "print(f\"Is CUDA or ROCm available? {gpu_status}\")\n",
+    "print(\"Available devices:\")\n",
+    "for index in range(torch.cuda.device_count()):\n",
+    "    props = torch.cuda.get_device_properties(index)\n",
+    "    # Memory is reported in decimal gigabytes (10**9 bytes).\n",
+    "    memory_gb = props.total_memory / 1_000_000_000\n",
+    "    print(f\"- [{index}] {props.name} [ {props.multi_processor_count} processors, {memory_gb:.2f} GB ]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "from haystack.utils import ComponentDevice\n",
+    "\n",
+    "# Prefer the first GPU, but fall back to the CPU so the notebook still runs on\n",
+    "# machines without a CUDA/ROCm device. The name `gpu` is kept because later cells\n",
+    "# pass it as the `device` argument of the embedders and the ranker.\n",
+    "gpu = ComponentDevice.from_str(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "metadata": {}
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>title</th>\n",
+       "      <th>image_title</th>\n",
+       "      <th>url</th>\n",
+       "      <th>image_url</th>\n",
+       "      <th>explained_url</th>\n",
+       "      <th>transcript</th>\n",
+       "      <th>explanation</th>\n",
+       "      <th>text</th>\n",
+       "      <th>meta</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>Barrel - Part 1</td>\n",
+       "      <td>Barrel - Part 1</td>\n",
+       "      <td>https://www.xkcd.com/1</td>\n",
+       "      <td>https://imgs.xkcd.com/comics/barrel_cropped_(1...</td>\n",
+       "      <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n",
+       "      <td>[A boy sits in a barrel which is floating in a...</td>\n",
+       "      <td>This was the fifth comic originally posted to ...</td>\n",
+       "      <td>barrel - part 1 | [a boy sits in a barrel whic...</td>\n",
+       "      <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>Petit Trees (sketch)</td>\n",
+       "      <td>Petit Trees (sketch)</td>\n",
+       "      <td>https://www.xkcd.com/2</td>\n",
+       "      <td>https://imgs.xkcd.com/comics/tree_cropped_(1).jpg</td>\n",
+       "      <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n",
+       "      <td>[Two trees are growing on opposite sides of a ...</td>\n",
+       "      <td>This was the fourth comic originally posted to...</td>\n",
+       "      <td>petit trees (sketch) | [two trees are growing ...</td>\n",
+       "      <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3</td>\n",
+       "      <td>Island (sketch)</td>\n",
+       "      <td>Island (sketch)</td>\n",
+       "      <td>https://www.xkcd.com/3</td>\n",
+       "      <td>https://imgs.xkcd.com/comics/island_color.jpg</td>\n",
+       "      <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n",
+       "      <td>[A green island surrounded by blue water]\\nThi...</td>\n",
+       "      <td>This was the third comic originally posted to ...</td>\n",
+       "      <td>island (sketch) | [a green island surrounded b...</td>\n",
+       "      <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>Landscape (sketch)</td>\n",
+       "      <td>Landscape (sketch)</td>\n",
+       "      <td>https://www.xkcd.com/4</td>\n",
+       "      <td>https://imgs.xkcd.com/comics/landscape_cropped...</td>\n",
+       "      <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n",
+       "      <td>[A sketch of a landscape with sun on the horiz...</td>\n",
+       "      <td>This was the second comic originally posted to...</td>\n",
+       "      <td>landscape (sketch) | [a sketch of a landscape ...</td>\n",
+       "      <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5</td>\n",
+       "      <td>Blown apart</td>\n",
+       "      <td>Blown apart</td>\n",
+       "      <td>https://www.xkcd.com/5</td>\n",
+       "      <td>https://imgs.xkcd.com/comics/blownapart_color.jpg</td>\n",
+       "      <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n",
+       "      <td>[A black number 70 sees a red package with the...</td>\n",
+       "      <td>This comic is a mathematical and technical jok...</td>\n",
+       "      <td>blown apart | [a black number 70 sees a red pa...</td>\n",
+       "      <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id                 title           image_title                     url  \\\n",
+       "0   1       Barrel - Part 1       Barrel - Part 1  https://www.xkcd.com/1   \n",
+       "1   2  Petit Trees (sketch)  Petit Trees (sketch)  https://www.xkcd.com/2   \n",
+       "2   3       Island (sketch)       Island (sketch)  https://www.xkcd.com/3   \n",
+       "3   4    Landscape (sketch)    Landscape (sketch)  https://www.xkcd.com/4   \n",
+       "4   5           Blown apart           Blown apart  https://www.xkcd.com/5   \n",
+       "\n",
+       "                                           image_url  \\\n",
+       "0  https://imgs.xkcd.com/comics/barrel_cropped_(1...   \n",
+       "1  https://imgs.xkcd.com/comics/tree_cropped_(1).jpg   \n",
+       "2      https://imgs.xkcd.com/comics/island_color.jpg   \n",
+       "3  https://imgs.xkcd.com/comics/landscape_cropped...   \n",
+       "4  https://imgs.xkcd.com/comics/blownapart_color.jpg   \n",
+       "\n",
+       "                                       explained_url  \\\n",
+       "0  https://www.explainxkcd.com/wiki/index.php/1:_...   \n",
+       "1  https://www.explainxkcd.com/wiki/index.php/2:_...   \n",
+       "2  https://www.explainxkcd.com/wiki/index.php/3:_...   \n",
+       "3  https://www.explainxkcd.com/wiki/index.php/4:_...   \n",
+       "4  https://www.explainxkcd.com/wiki/index.php/5:_...   \n",
+       "\n",
+       "                                          transcript  \\\n",
+       "0  [A boy sits in a barrel which is floating in a...   \n",
+       "1  [Two trees are growing on opposite sides of a ...   \n",
+       "2  [A green island surrounded by blue water]\\nThi...   \n",
+       "3  [A sketch of a landscape with sun on the horiz...   \n",
+       "4  [A black number 70 sees a red package with the...   \n",
+       "\n",
+       "                                         explanation  \\\n",
+       "0  This was the fifth comic originally posted to ...   \n",
+       "1  This was the fourth comic originally posted to...   \n",
+       "2  This was the third comic originally posted to ...   \n",
+       "3  This was the second comic originally posted to...   \n",
+       "4  This comic is a mathematical and technical jok...   \n",
+       "\n",
+       "                                                text  \\\n",
+       "0  barrel - part 1 | [a boy sits in a barrel whic...   \n",
+       "1  petit trees (sketch) | [two trees are growing ...   \n",
+       "2  island (sketch) | [a green island surrounded b...   \n",
+       "3  landscape (sketch) | [a sketch of a landscape ...   \n",
+       "4  blown apart | [a black number 70 sees a red pa...   \n",
+       "\n",
+       "                                                meta  \n",
+       "0  {'title': 'Barrel - Part 1', 'url': 'https://w...  \n",
+       "1  {'title': 'Petit Trees (sketch)', 'url': 'http...  \n",
+       "2  {'title': 'Island (sketch)', 'url': 'https://w...  \n",
+       "3  {'title': 'Landscape (sketch)', 'url': 'https:...  \n",
+       "4  {'title': 'Blown apart', 'url': 'https://www.x...  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import pathlib\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
+    "data = []\n",
+    "\n",
+    "# The file holds one JSON object per line. The encoding is pinned to UTF-8 so that\n",
+    "# decoding does not depend on the platform default, the file is streamed line by\n",
+    "# line instead of being read into memory first, and blank lines are skipped\n",
+    "# rather than crashing json.loads.\n",
+    "with open(dataset, \"r\", encoding=\"utf-8\") as f:\n",
+    "    for line in f:\n",
+    "        if not line.strip():\n",
+    "            continue\n",
+    "        record = json.loads(line)\n",
+    "        # Lower-cased searchable text: title, transcript and explanation joined by ' | '.\n",
+    "        record[\"text\"] = f\"{record['title']} | {record['transcript']} | {record['explanation']}\".lower()\n",
+    "        # Subset of fields kept together under a single \"meta\" key.\n",
+    "        record[\"meta\"] = {\n",
+    "            \"title\": record[\"title\"],\n",
+    "            \"url\": record[\"url\"],\n",
+    "            \"image_url\": record[\"image_url\"],\n",
+    "            \"id\": record[\"id\"],\n",
+    "        }\n",
+    "        data.append(record)\n",
+    "\n",
+    "df = pd.DataFrame(data)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "metadata": {}
+   },
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "6e3b2d47cde54d50a8a92305bda7fed4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches:   0%|          | 0/167 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Finished running pipeline\n",
+      "Result: Wrote 5343 to document store\n"
+     ]
+    }
+   ],
+   "source": [
+    "from haystack import Document, Pipeline\n",
+    "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
+    "from haystack.components.preprocessors.document_splitter import DocumentSplitter\n",
+    "from haystack.components.writers import DocumentWriter\n",
+    "from haystack.document_stores.in_memory import InMemoryDocumentStore\n",
+    "\n",
+    "model_embeddings = \"BAAI/bge-small-en-v1.5\"\n",
+    "model_ranker = \"BAAI/bge-reranker-base\"\n",
+    "\n",
+    "# Wrap every loaded record in a Document: its text becomes the content, its meta the metadata.\n",
+    "results = [Document(content=record[\"text\"], meta=record[\"meta\"]) for record in data]\n",
+    "document_store = InMemoryDocumentStore()\n",
+    "\n",
+    "# Indexing stages: split into overlapping word windows -> embed -> write to the store.\n",
+    "# NOTE(review): chunks are 512 *words* long; confirm this fits the embedding model's input limit.\n",
+    "document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n",
+    "document_embedder = SentenceTransformersDocumentEmbedder(model=model_embeddings, device=gpu)\n",
+    "document_writer = DocumentWriter(document_store)\n",
+    "\n",
+    "indexing_stages = {\n",
+    "    \"document_splitter\": document_splitter,\n",
+    "    \"document_embedder\": document_embedder,\n",
+    "    \"document_writer\": document_writer,\n",
+    "}\n",
+    "\n",
+    "indexing_pipeline = Pipeline()\n",
+    "for stage_name, stage in indexing_stages.items():\n",
+    "    indexing_pipeline.add_component(stage_name, stage)\n",
+    "\n",
+    "# Chain each stage into the next one, in registration order.\n",
+    "stage_names = list(indexing_stages)\n",
+    "for upstream, downstream in zip(stage_names, stage_names[1:]):\n",
+    "    indexing_pipeline.connect(upstream, downstream)\n",
+    "\n",
+    "res = indexing_pipeline.run({\"document_splitter\": {\"documents\": results}})\n",
+    "written = res[\"document_writer\"][\"documents_written\"]\n",
+    "\n",
+    "print(f\"Finished running indexing pipeline\\nResult: Wrote {written} to document store\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7315fc1b10>\n",
+       "🚅 Components\n",
+       "  - text_embedder: SentenceTransformersTextEmbedder\n",
+       "  - embedding_retriever: InMemoryEmbeddingRetriever\n",
+       "  - bm25_retriever: InMemoryBM25Retriever\n",
+       "  - document_joiner: DocumentJoiner\n",
+       "  - ranker: TransformersSimilarityRanker\n",
+       "🛤️ Connections\n",
+       "  - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])\n",
+       "  - embedding_retriever.documents -> document_joiner.documents (List[Document])\n",
+       "  - bm25_retriever.documents -> document_joiner.documents (List[Document])\n",
+       "  - document_joiner.documents -> ranker.documents (List[Document])"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from haystack.components.embedders import SentenceTransformersTextEmbedder\n",
+    "from haystack.components.joiners import DocumentJoiner\n",
+    "from haystack.components.rankers import TransformersSimilarityRanker\n",
+    "from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n",
+    "\n",
+    "# Query-time components: a query embedder feeding the embedding retriever, a BM25\n",
+    "# retriever, a joiner that merges both result lists, and a ranker applied to the merged list.\n",
+    "text_embedder = SentenceTransformersTextEmbedder(model=model_embeddings, device=gpu, progress_bar=False)\n",
+    "embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n",
+    "bm25_retriever = InMemoryBM25Retriever(document_store)\n",
+    "document_joiner = DocumentJoiner()\n",
+    "ranker = TransformersSimilarityRanker(model=model_ranker, device=gpu)\n",
+    "\n",
+    "hybrid_retrieval = Pipeline()\n",
+    "for component_name, component in [\n",
+    "    (\"text_embedder\", text_embedder),\n",
+    "    (\"embedding_retriever\", embedding_retriever),\n",
+    "    (\"bm25_retriever\", bm25_retriever),\n",
+    "    (\"document_joiner\", document_joiner),\n",
+    "    (\"ranker\", ranker),\n",
+    "]:\n",
+    "    hybrid_retrieval.add_component(component_name, component)\n",
+    "\n",
+    "# Wire the graph; both retrievers feed the joiner, which feeds the ranker.\n",
+    "for sender, receiver in [\n",
+    "    (\"text_embedder\", \"embedding_retriever\"),\n",
+    "    (\"bm25_retriever\", \"document_joiner\"),\n",
+    "    (\"embedding_retriever\", \"document_joiner\"),\n",
+    "    (\"document_joiner\", \"ranker\"),\n",
+    "]:\n",
+    "    hybrid_retrieval.connect(sender, receiver)\n",
+    "\n",
+    "# Last expression of the cell: display the assembled pipeline.\n",
+    "hybrid_retrieval"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save a diagram of the hybrid retrieval pipeline to an image file.\n",
+    "diagram_path = \"hybrid-retrieval.png\"\n",
+    "hybrid_retrieval.draw(diagram_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Results for 'Give an example for dependency'\n",
+      "\n",
+      "| ID       | Title            | Link                      |\n",
+      "|----------|------------------|---------------------------|\n",
+      "| [ 1579 ] | Tech Loops       | https://www.xkcd.com/1579 |\n",
+      "| [ 1906 ] | Making Progress  | https://www.xkcd.com/1906 |\n",
+      "| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n",
+      "| [ 2347 ] | Dependency       | https://www.xkcd.com/2347 |\n",
+      "| [ 1988 ] | Containers       | https://www.xkcd.com/1988 |\n"
+     ]
+    }
+   ],
+   "source": [
+    "from tabulate import tabulate\n",
+    "\n",
+    "query = \"Give an example for dependency\"\n",
+    "\n",
+    "# The same query string feeds the text embedder, the BM25 retriever and the ranker;\n",
+    "# the ranker is additionally asked for its top 5 documents.\n",
+    "pipeline_inputs = {\n",
+    "    \"text_embedder\": {\"text\": query},\n",
+    "    \"bm25_retriever\": {\"query\": query},\n",
+    "    \"ranker\": {\"query\": query, \"top_k\": 5},\n",
+    "}\n",
+    "result = hybrid_retrieval.run(data=pipeline_inputs)\n",
+    "\n",
+    "headers = [\"ID\", \"Title\", \"Link\"]\n",
+    "\n",
+    "# One table row per ranked document; None entries are skipped.\n",
+    "# NOTE(review): the saved output shows comic numbers in the ID column, so `id` here\n",
+    "# presumably carries the comic id rather than a generated document id - confirm.\n",
+    "result_table = [\n",
+    "    [f\"[ {doc.id:4} ]\", doc.meta[\"title\"], doc.meta[\"url\"]]\n",
+    "    for doc in result[\"ranker\"][\"documents\"]\n",
+    "    if doc is not None\n",
+    "]\n",
+    "\n",
+    "print(f\"Results for '{query}'\\n\")\n",
+    "print(tabulate(result_table, headers=headers, tablefmt=\"github\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
				`@@ -0,0 +1 @@`
				`data/**/* filter=lfs diff=lfs merge=lfs -text`