commit b135ce28c545bfde585adcfe5b44d9dfea61077a Author: Firq <firelp42@gmail.com> Date: Fri Nov 29 14:45:19 2024 +0100 Initial commit diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..45719f8 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +data/**/* filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..43a9d49 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +# env +.env +*venv/ + +# Binaries for installation +*.deb +*.whl \ No newline at end of file diff --git a/data/dataset.jsonl b/data/dataset.jsonl new file mode 100644 index 0000000..de990bb --- /dev/null +++ b/data/dataset.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43890db5669aa2af6193ef7c9a32e4126b8c9271f8055b307dd18ac1a54a6b9 +size 12942390 diff --git a/hybrid-retrieval.png b/hybrid-retrieval.png new file mode 100644 index 0000000..8d446ad Binary files /dev/null and b/hybrid-retrieval.png differ diff --git a/nlp.ipynb b/nlp.ipynb new file mode 100644 index 0000000..aaf0d84 --- /dev/null +++ b/nlp.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Is CUDA or ROCm available? Yes\n", + "Available devices:\n", + "- [0] AMD Radeon RX 7900 XT [ 42 processors, 21.39 GB ]\n" + ] + } + ], + "source": [ + "import torch\n", + "print(f\"Is CUDA or ROCm available? { 'Yes' if torch.cuda.is_available() else 'No'}\")\n", + "print(\"Available devices:\")\n", + "for i in range(torch.cuda.device_count()):\n", + " dev = torch.cuda.get_device_properties(i)\n", + " print(f\"- [{i}] {dev.name} [ {dev.multi_processor_count} processors, {dev.total_memory / 1_000_000_000:.2f} GB ]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from haystack.utils import ComponentDevice\n", + "gpu = ComponentDevice.from_str(\"cuda:0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>id</th>\n", + " <th>title</th>\n", + " <th>image_title</th>\n", + " <th>url</th>\n", + " <th>image_url</th>\n", + " <th>explained_url</th>\n", + " <th>transcript</th>\n", + " <th>explanation</th>\n", + " <th>text</th>\n", + " <th>meta</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>Barrel - Part 1</td>\n", + " <td>Barrel - Part 1</td>\n", + " <td>https://www.xkcd.com/1</td>\n", + " <td>https://imgs.xkcd.com/comics/barrel_cropped_(1...</td>\n", + " <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n", + " <td>[A boy sits in a barrel which is floating in a...</td>\n", + " <td>This was the fifth comic originally posted to ...</td>\n", + " <td>barrel - part 1 | [a boy sits in a barrel whic...</td>\n", + " <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2</td>\n", + " <td>Petit Trees (sketch)</td>\n", + " <td>Petit Trees (sketch)</td>\n", + " <td>https://www.xkcd.com/2</td>\n", + " <td>https://imgs.xkcd.com/comics/tree_cropped_(1).jpg</td>\n", + " <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n", + " <td>[Two trees are growing on opposite sides of a ...</td>\n", + " <td>This was the fourth comic originally posted to...</td>\n", + " <td>petit trees (sketch) | [two trees are growing ...</td>\n", + " <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>Island (sketch)</td>\n", + " <td>Island (sketch)</td>\n", + " <td>https://www.xkcd.com/3</td>\n", + " <td>https://imgs.xkcd.com/comics/island_color.jpg</td>\n", + " <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n", + " <td>[A green island surrounded by blue water]\\nThi...</td>\n", + " <td>This was the third comic originally posted to ...</td>\n", + " <td>island (sketch) | [a green island surrounded b...</td>\n", + " <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>4</td>\n", + " <td>Landscape (sketch)</td>\n", + " <td>Landscape (sketch)</td>\n", + " <td>https://www.xkcd.com/4</td>\n", + " <td>https://imgs.xkcd.com/comics/landscape_cropped...</td>\n", + " <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n", + " <td>[A sketch of a landscape with sun on the horiz...</td>\n", + " <td>This was the second comic originally posted to...</td>\n", + " <td>landscape (sketch) | [a sketch of a landscape ...</td>\n", + " <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>5</td>\n", + " <td>Blown apart</td>\n", + " <td>Blown apart</td>\n", + " <td>https://www.xkcd.com/5</td>\n", + " <td>https://imgs.xkcd.com/comics/blownapart_color.jpg</td>\n", + " <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n", + " <td>[A black number 70 sees a red package with the...</td>\n", + " <td>This comic is a mathematical and technical jok...</td>\n", + " <td>blown apart | [a black number 70 sees a red pa...</td>\n", + " <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " id title image_title url \\\n", + "0 1 Barrel - Part 1 Barrel - Part 1 https://www.xkcd.com/1 \n", + "1 2 Petit Trees (sketch) Petit Trees (sketch) https://www.xkcd.com/2 \n", + "2 3 Island (sketch) Island (sketch) https://www.xkcd.com/3 \n", + "3 4 Landscape (sketch) Landscape (sketch) https://www.xkcd.com/4 \n", + "4 5 Blown apart Blown apart https://www.xkcd.com/5 \n", + "\n", + " image_url \\\n", + "0 https://imgs.xkcd.com/comics/barrel_cropped_(1... \n", + "1 https://imgs.xkcd.com/comics/tree_cropped_(1).jpg \n", + "2 https://imgs.xkcd.com/comics/island_color.jpg \n", + "3 https://imgs.xkcd.com/comics/landscape_cropped... \n", + "4 https://imgs.xkcd.com/comics/blownapart_color.jpg \n", + "\n", + " explained_url \\\n", + "0 https://www.explainxkcd.com/wiki/index.php/1:_... \n", + "1 https://www.explainxkcd.com/wiki/index.php/2:_... \n", + "2 https://www.explainxkcd.com/wiki/index.php/3:_... \n", + "3 https://www.explainxkcd.com/wiki/index.php/4:_... \n", + "4 https://www.explainxkcd.com/wiki/index.php/5:_... \n", + "\n", + " transcript \\\n", + "0 [A boy sits in a barrel which is floating in a... \n", + "1 [Two trees are growing on opposite sides of a ... \n", + "2 [A green island surrounded by blue water]\\nThi... \n", + "3 [A sketch of a landscape with sun on the horiz... \n", + "4 [A black number 70 sees a red package with the... \n", + "\n", + " explanation \\\n", + "0 This was the fifth comic originally posted to ... \n", + "1 This was the fourth comic originally posted to... \n", + "2 This was the third comic originally posted to ... \n", + "3 This was the second comic originally posted to... \n", + "4 This comic is a mathematical and technical jok... \n", + "\n", + " text \\\n", + "0 barrel - part 1 | [a boy sits in a barrel whic... \n", + "1 petit trees (sketch) | [two trees are growing ... \n", + "2 island (sketch) | [a green island surrounded b... \n", + "3 landscape (sketch) | [a sketch of a landscape ... \n", + "4 blown apart | [a black number 70 sees a red pa... \n", + "\n", + " meta \n", + "0 {'title': 'Barrel - Part 1', 'url': 'https://w... \n", + "1 {'title': 'Petit Trees (sketch)', 'url': 'http... \n", + "2 {'title': 'Island (sketch)', 'url': 'https://w... \n", + "3 {'title': 'Landscape (sketch)', 'url': 'https:... \n", + "4 {'title': 'Blown apart', 'url': 'https://www.x... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import json\n", + "import pathlib\n", + "\n", + "import pandas as pd\n", + "\n", + "dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n", + "data = []\n", + "\n", + "with open(dataset, \"r\") as f:\n", + " for x in f.readlines():\n", + " j = json.loads(x)\n", + " j.update({ \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\".lower() })\n", + " j.update({ \"meta\": {\n", + " \"title\": j[\"title\"],\n", + " \"url\": j[\"url\"],\n", + " \"image_url\": j[\"image_url\"],\n", + " \"id\": j[\"id\"]\n", + " }})\n", + " data.append(j)\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "metadata": {} + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6e3b2d47cde54d50a8a92305bda7fed4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Batches: 0%| | 0/167 [00:00<?, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Finished running pipeline\n", + "Result: Wrote 5343 to document store\n" + ] + } + ], + "source": [ + "from haystack import Document, Pipeline\n", + "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", + "from haystack.components.preprocessors.document_splitter import DocumentSplitter\n", + "from haystack.components.writers import DocumentWriter\n", + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "model_embeddings = \"BAAI/bge-small-en-v1.5\"\n", + "model_ranker = \"BAAI/bge-reranker-base\"\n", + "\n", + "results = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]\n", + "document_store = InMemoryDocumentStore()\n", + "\n", + "document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n", + "document_embedder = SentenceTransformersDocumentEmbedder(model=model_embeddings, device=gpu)\n", + "document_writer = DocumentWriter(document_store)\n", + "\n", + "indexing_pipeline = Pipeline()\n", + "indexing_pipeline.add_component(\"document_splitter\", document_splitter)\n", + "indexing_pipeline.add_component(\"document_embedder\", document_embedder)\n", + "indexing_pipeline.add_component(\"document_writer\", document_writer)\n", + "\n", + "indexing_pipeline.connect(\"document_splitter\", \"document_embedder\")\n", + "indexing_pipeline.connect(\"document_embedder\", \"document_writer\")\n", + "\n", + "res = indexing_pipeline.run({\"document_splitter\": {\"documents\": results}})\n", + "\n", + "print(f\"Finished running indexing pipeline\\nResult: Wrote {res['document_writer']['documents_written']} to document store\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7315fc1b10>\n", + "🚅 Components\n", + " - text_embedder: SentenceTransformersTextEmbedder\n", + " - embedding_retriever: InMemoryEmbeddingRetriever\n", + " - bm25_retriever: InMemoryBM25Retriever\n", + " - document_joiner: DocumentJoiner\n", + " - ranker: TransformersSimilarityRanker\n", + "🛤️ Connections\n", + " - text_embedder.embedding -> embedding_retriever.query_embedding (List[float])\n", + " - embedding_retriever.documents -> document_joiner.documents (List[Document])\n", + " - bm25_retriever.documents -> document_joiner.documents (List[Document])\n", + " - document_joiner.documents -> ranker.documents (List[Document])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "from haystack.components.joiners import DocumentJoiner\n", + "from haystack.components.rankers import TransformersSimilarityRanker\n", + "from haystack.components.retrievers.in_memory import InMemoryBM25Retriever, InMemoryEmbeddingRetriever\n", + "\n", + "text_embedder = SentenceTransformersTextEmbedder(model=model_embeddings, device=gpu, progress_bar=False)\n", + "embedding_retriever = InMemoryEmbeddingRetriever(document_store)\n", + "bm25_retriever = InMemoryBM25Retriever(document_store)\n", + "document_joiner = DocumentJoiner()\n", + "ranker = TransformersSimilarityRanker(model=model_ranker, device=gpu)\n", + "\n", + "hybrid_retrieval = Pipeline()\n", + "hybrid_retrieval.add_component(\"text_embedder\", text_embedder)\n", + "hybrid_retrieval.add_component(\"embedding_retriever\", embedding_retriever)\n", + "hybrid_retrieval.add_component(\"bm25_retriever\", bm25_retriever)\n", + "hybrid_retrieval.add_component(\"document_joiner\", document_joiner)\n", + "hybrid_retrieval.add_component(\"ranker\", ranker)\n", + "\n", + "hybrid_retrieval.connect(\"text_embedder\", \"embedding_retriever\")\n", + "hybrid_retrieval.connect(\"bm25_retriever\", \"document_joiner\")\n", + "hybrid_retrieval.connect(\"embedding_retriever\", \"document_joiner\")\n", + "hybrid_retrieval.connect(\"document_joiner\", \"ranker\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "hybrid_retrieval.draw(\"hybrid-retrieval.png\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results for 'Give an example for dependency'\n", + "\n", + "| ID | Title | Link |\n", + "|----------|------------------|---------------------------|\n", + "| [ 1579 ] | Tech Loops | https://www.xkcd.com/1579 |\n", + "| [ 1906 ] | Making Progress | https://www.xkcd.com/1906 |\n", + "| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n", + "| [ 2347 ] | Dependency | https://www.xkcd.com/2347 |\n", + "| [ 1988 ] | Containers | https://www.xkcd.com/1988 |\n" + ] + } + ], + "source": [ + "from tabulate import tabulate\n", + "\n", + "query = \"Give an example for dependency\"\n", + "\n", + "result = hybrid_retrieval.run(\n", + " data={\n", + " \"text_embedder\": {\n", + " \"text\": query\n", + " },\n", + " \"bm25_retriever\": {\n", + " \"query\": query\n", + " },\n", + " \"ranker\": {\n", + " \"query\": query,\n", + " \"top_k\": 5\n", + " }\n", + " }\n", + ")\n", + "\n", + "result_table = []\n", + "headers = [\"ID\", \"Title\", \"Link\"]\n", + "\n", + "x: Document | None\n", + "for x in result[\"ranker\"][\"documents\"]:\n", + " if x is None:\n", + " continue\n", + " result_table.append([f\"[ {x.id:4} ]\", x.meta[\"title\"], x.meta[\"url\"]])\n", + "\n", + "print(f\"Results for '{query}'\\n\")\n", + "print(tabulate(result_table, headers=headers, tablefmt=\"github\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}