diff --git a/nlp.ipynb b/nlp.ipynb index aaf0d84..cabe4a7 100644 --- a/nlp.ipynb +++ b/nlp.ipynb @@ -42,6 +42,34 @@ "metadata": { "metadata": {} }, + "outputs": [], + "source": [ + "import json\n", + "import pathlib\n", + "\n", + "dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n", + "data = []\n", + "\n", + "with open(dataset, \"r\") as f:\n", + " for x in f.readlines():\n", + " j: dict = json.loads(x)\n", + " j.update({\n", + " \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\",\n", + " \"meta\": {\n", + " \"title\": j[\"title\"],\n", + " \"url\": j[\"url\"],\n", + " \"image_url\": j[\"image_url\"],\n", + " \"id\": j[\"id\"]\n", + " }\n", + " }\n", + " )\n", + " data.append(j)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, "outputs": [ { "data": { @@ -87,7 +115,7 @@ " <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n", " <td>[A boy sits in a barrel which is floating in a...</td>\n", " <td>This was the fifth comic originally posted to ...</td>\n", - " <td>barrel - part 1 | [a boy sits in a barrel whic...</td>\n", + " <td>Barrel - Part 1 | [A boy sits in a barrel whic...</td>\n", " <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n", " </tr>\n", " <tr>\n", @@ -100,7 +128,7 @@ " <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n", " <td>[Two trees are growing on opposite sides of a ...</td>\n", " <td>This was the fourth comic originally posted to...</td>\n", - " <td>petit trees (sketch) | [two trees are growing ...</td>\n", + " <td>Petit Trees (sketch) | [Two trees are growing ...</td>\n", " <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n", " </tr>\n", " <tr>\n", @@ -113,7 +141,7 @@ " <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n", " <td>[A green island surrounded by blue water]\\nThi...</td>\n", " <td>This was the third comic originally posted to ...</td>\n", - " <td>island (sketch) | [a green island surrounded b...</td>\n", + " <td>Island (sketch) | [A green island surrounded b...</td>\n", " <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n", " </tr>\n", " <tr>\n", @@ -126,7 +154,7 @@ " <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n", " <td>[A sketch of a landscape with sun on the horiz...</td>\n", " <td>This was the second comic originally posted to...</td>\n", - " <td>landscape (sketch) | [a sketch of a landscape ...</td>\n", + " <td>Landscape (sketch) | [A sketch of a landscape ...</td>\n", " <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n", " </tr>\n", " <tr>\n", @@ -139,7 +167,7 @@ " <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n", " <td>[A black number 70 sees a red package with the...</td>\n", " <td>This comic is a mathematical and technical jok...</td>\n", - " <td>blown apart | [a black number 70 sees a red pa...</td>\n", + " <td>Blown apart | [A black number 70 sees a red pa...</td>\n", " <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n", " </tr>\n", " </tbody>\n", @@ -183,11 +211,11 @@ "4 This comic is a mathematical and technical jok... \n", "\n", " text \\\n", - "0 barrel - part 1 | [a boy sits in a barrel whic... \n", - "1 petit trees (sketch) | [two trees are growing ... \n", - "2 island (sketch) | [a green island surrounded b... \n", - "3 landscape (sketch) | [a sketch of a landscape ... \n", - "4 blown apart | [a black number 70 sees a red pa... \n", + "0 Barrel - Part 1 | [A boy sits in a barrel whic... \n", + "1 Petit Trees (sketch) | [Two trees are growing ... \n", + "2 Island (sketch) | [A green island surrounded b... \n", + "3 Landscape (sketch) | [A sketch of a landscape ... \n", + "4 Blown apart | [A black number 70 sees a red pa... \n", "\n", " meta \n", "0 {'title': 'Barrel - Part 1', 'url': 'https://w... \n", @@ -197,40 +225,32 @@ "4 {'title': 'Blown apart', 'url': 'https://www.x... " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import json\n", - "import pathlib\n", - "\n", "import pandas as pd\n", "\n", - "dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n", - "data = []\n", - "\n", - "with open(dataset, \"r\") as f:\n", - " for x in f.readlines():\n", - " j = json.loads(x)\n", - " j.update({ \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\".lower() })\n", - " j.update({ \"meta\": {\n", - " \"title\": j[\"title\"],\n", - " \"url\": j[\"url\"],\n", - " \"image_url\": j[\"image_url\"],\n", - " \"id\": j[\"id\"]\n", - " }})\n", - " data.append(j)\n", - "\n", "df = pd.DataFrame(data)\n", - "\n", "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from haystack import Document\n", + "\n", + "documents = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "metadata": { "metadata": {} }, @@ -238,7 +258,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6e3b2d47cde54d50a8a92305bda7fed4", + "model_id": "705b147162734470908bbf2ab6d45db3", "version_major": 2, "version_minor": 0 }, @@ -253,13 +273,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Finished running pipeline\n", + "Finished running indexing pipeline\n", "Result: Wrote 5343 to document store\n" ] } ], "source": [ - "from haystack import Document, Pipeline\n", + "from haystack import Pipeline\n", "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", "from haystack.components.preprocessors.document_splitter import DocumentSplitter\n", "from haystack.components.writers import DocumentWriter\n", @@ -268,7 +288,6 @@ "model_embeddings = \"BAAI/bge-small-en-v1.5\"\n", "model_ranker = \"BAAI/bge-reranker-base\"\n", "\n", - "results = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]\n", "document_store = InMemoryDocumentStore()\n", "\n", "document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n", @@ -276,6 +295,7 @@ "document_writer = DocumentWriter(document_store)\n", "\n", "indexing_pipeline = Pipeline()\n", + "\n", "indexing_pipeline.add_component(\"document_splitter\", document_splitter)\n", "indexing_pipeline.add_component(\"document_embedder\", document_embedder)\n", "indexing_pipeline.add_component(\"document_writer\", document_writer)\n", @@ -283,20 +303,20 @@ "indexing_pipeline.connect(\"document_splitter\", \"document_embedder\")\n", "indexing_pipeline.connect(\"document_embedder\", \"document_writer\")\n", "\n", - "res = indexing_pipeline.run({\"document_splitter\": {\"documents\": results}})\n", + "res = indexing_pipeline.run({\"document_splitter\": {\"documents\": documents}})\n", "\n", "print(f\"Finished running indexing pipeline\\nResult: Wrote {res['document_writer']['documents_written']} to document store\")" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7315fc1b10>\n", + "<haystack.core.pipeline.pipeline.Pipeline object at 0x7ef6cfa25900>\n", "🚅 Components\n", " - text_embedder: SentenceTransformersTextEmbedder\n", " - embedding_retriever: InMemoryEmbeddingRetriever\n", @@ -310,7 +330,7 @@ " - document_joiner.documents -> ranker.documents (List[Document])" ] }, - "execution_count": 12, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -342,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -351,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -366,7 +386,7 @@ "| [ 1906 ] | Making Progress | https://www.xkcd.com/1906 |\n", "| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n", "| [ 2347 ] | Dependency | https://www.xkcd.com/2347 |\n", - "| [ 1988 ] | Containers | https://www.xkcd.com/1988 |\n" + "| [ 2166 ] | Stack | https://www.xkcd.com/2166 |\n" ] } ],