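Some improvements: split data loading and Document creation into separate cells, keep original text casing (drop .lower()), rename results to documents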

2024-11-29 15:09:20 +01:00 · 2024-11-29 15:09:20 +01:00 · 6c1d6762b5
commit 6c1d6762b5
parent b135ce28c5
1 changed files with 62 additions and 42 deletions
--- a/nlp.ipynb
+++ b/nlp.ipynb
@ -42,6 +42,34 @@
   "metadata": {
    "metadata": {}
   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import pathlib\n",
+    "\n",
+    "dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
+    "data = []\n",
+    "# Read as UTF-8 explicitly so decoding does not depend on the platform locale.\n",
+    "with open(dataset, \"r\", encoding=\"utf-8\") as f:\n",
+    "    for line in f:  # iterate lazily; readlines() would load the whole file first\n",
+    "        if not line.strip():\n",
+    "            continue  # skip blank lines, which json.loads would reject\n",
+    "        j: dict = json.loads(line)\n",
+    "        # text becomes the Document content and meta its metadata in a later cell.\n",
+    "        j[\"text\"] = f\"{j['title']} | {j['transcript']} | {j['explanation']}\"\n",
+    "        j[\"meta\"] = {\n",
+    "            \"title\": j[\"title\"],\n",
+    "            \"url\": j[\"url\"],\n",
+    "            \"image_url\": j[\"image_url\"],\n",
+    "            \"id\": j[\"id\"],\n",
+    "        }\n",
+    "        data.append(j)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
   "outputs": [
    {
     "data": {
@ -87,7 +115,7 @@
       "      <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n",
       "      <td>[A boy sits in a barrel which is floating in a...</td>\n",
       "      <td>This was the fifth comic originally posted to ...</td>\n",
-       "      <td>barrel - part 1 | [a boy sits in a barrel whic...</td>\n",
+       "      <td>Barrel - Part 1 | [A boy sits in a barrel whic...</td>\n",
       "      <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
@ -100,7 +128,7 @@
       "      <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n",
       "      <td>[Two trees are growing on opposite sides of a ...</td>\n",
       "      <td>This was the fourth comic originally posted to...</td>\n",
-       "      <td>petit trees (sketch) | [two trees are growing ...</td>\n",
+       "      <td>Petit Trees (sketch) | [Two trees are growing ...</td>\n",
       "      <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
@ -113,7 +141,7 @@
       "      <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n",
       "      <td>[A green island surrounded by blue water]\\nThi...</td>\n",
       "      <td>This was the third comic originally posted to ...</td>\n",
-       "      <td>island (sketch) | [a green island surrounded b...</td>\n",
+       "      <td>Island (sketch) | [A green island surrounded b...</td>\n",
       "      <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
@ -126,7 +154,7 @@
       "      <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n",
       "      <td>[A sketch of a landscape with sun on the horiz...</td>\n",
       "      <td>This was the second comic originally posted to...</td>\n",
-       "      <td>landscape (sketch) | [a sketch of a landscape ...</td>\n",
+       "      <td>Landscape (sketch) | [A sketch of a landscape ...</td>\n",
       "      <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
@ -139,7 +167,7 @@
       "      <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n",
       "      <td>[A black number 70 sees a red package with the...</td>\n",
       "      <td>This comic is a mathematical and technical jok...</td>\n",
-       "      <td>blown apart | [a black number 70 sees a red pa...</td>\n",
+       "      <td>Blown apart | [A black number 70 sees a red pa...</td>\n",
       "      <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
@ -183,11 +211,11 @@
       "4  This comic is a mathematical and technical jok...   \n",
       "\n",
       "                                                text  \\\n",
-       "0  barrel - part 1 | [a boy sits in a barrel whic...   \n",
-       "1  petit trees (sketch) | [two trees are growing ...   \n",
-       "2  island (sketch) | [a green island surrounded b...   \n",
-       "3  landscape (sketch) | [a sketch of a landscape ...   \n",
-       "4  blown apart | [a black number 70 sees a red pa...   \n",
+       "0  Barrel - Part 1 | [A boy sits in a barrel whic...   \n",
+       "1  Petit Trees (sketch) | [Two trees are growing ...   \n",
+       "2  Island (sketch) | [A green island surrounded b...   \n",
+       "3  Landscape (sketch) | [A sketch of a landscape ...   \n",
+       "4  Blown apart | [A black number 70 sees a red pa...   \n",
       "\n",
       "                                                meta  \n",
       "0  {'title': 'Barrel - Part 1', 'url': 'https://w...  \n",
@ -197,40 +225,32 @@
       "4  {'title': 'Blown apart', 'url': 'https://www.x...  "
      ]
     },
-     "execution_count": 3,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "import json\n",
-    "import pathlib\n",
-    "\n",
    "import pandas as pd\n",
    "\n",
-    "dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
-    "data = []\n",
-    "\n",
-    "with open(dataset, \"r\") as f:\n",
-    "    for x in f.readlines():\n",
-    "        j = json.loads(x)\n",
-    "        j.update({ \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\".lower() })\n",
-    "        j.update({ \"meta\": {\n",
-    "            \"title\": j[\"title\"],\n",
-    "            \"url\": j[\"url\"],\n",
-    "            \"image_url\": j[\"image_url\"],\n",
-    "            \"id\": j[\"id\"]\n",
-    "        }})\n",
-    "        data.append(j)\n",
-    "\n",
    "df = pd.DataFrame(data)\n",
-    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from haystack import Document\n",
+    "\n",
+    "documents = [Document(content=entry[\"text\"], meta=entry[\"meta\"]) for entry in data]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
   "metadata": {
    "metadata": {}
   },
@ -238,7 +258,7 @@
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6e3b2d47cde54d50a8a92305bda7fed4",
+       "model_id": "705b147162734470908bbf2ab6d45db3",
       "version_major": 2,
       "version_minor": 0
      },
@ -253,13 +273,13 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Finished running pipeline\n",
+      "Finished running indexing pipeline\n",
      "Result: Wrote 5343 to document store\n"
     ]
    }
   ],
   "source": [
-    "from haystack import Document, Pipeline\n",
+    "from haystack import Pipeline\n",
    "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
    "from haystack.components.preprocessors.document_splitter import DocumentSplitter\n",
    "from haystack.components.writers import DocumentWriter\n",
@ -268,7 +288,6 @@
    "model_embeddings = \"BAAI/bge-small-en-v1.5\"\n",
    "model_ranker = \"BAAI/bge-reranker-base\"\n",
    "\n",
-    "results = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]\n",
    "document_store = InMemoryDocumentStore()\n",
    "\n",
    "document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n",
@ -276,6 +295,7 @@
    "document_writer = DocumentWriter(document_store)\n",
    "\n",
    "indexing_pipeline = Pipeline()\n",
+    "\n",
    "indexing_pipeline.add_component(\"document_splitter\", document_splitter)\n",
    "indexing_pipeline.add_component(\"document_embedder\", document_embedder)\n",
    "indexing_pipeline.add_component(\"document_writer\", document_writer)\n",
@ -283,20 +303,20 @@
    "indexing_pipeline.connect(\"document_splitter\", \"document_embedder\")\n",
    "indexing_pipeline.connect(\"document_embedder\", \"document_writer\")\n",
    "\n",
-    "res = indexing_pipeline.run({\"document_splitter\": {\"documents\": results}})\n",
+    "res = indexing_pipeline.run({\"document_splitter\": {\"documents\": documents}})\n",
    "\n",
    "print(f\"Finished running indexing pipeline\\nResult: Wrote {res['document_writer']['documents_written']} to document store\")"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7315fc1b10>\n",
+       "<haystack.core.pipeline.pipeline.Pipeline object at 0x7ef6cfa25900>\n",
       "🚅 Components\n",
       "  - text_embedder: SentenceTransformersTextEmbedder\n",
       "  - embedding_retriever: InMemoryEmbeddingRetriever\n",
@ -310,7 +330,7 @@
       "  - document_joiner.documents -> ranker.documents (List[Document])"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -342,7 +362,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -351,7 +371,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -366,7 +386,7 @@
      "| [ 1906 ] | Making Progress  | https://www.xkcd.com/1906 |\n",
      "| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n",
      "| [ 2347 ] | Dependency       | https://www.xkcd.com/2347 |\n",
-      "| [ 1988 ] | Containers       | https://www.xkcd.com/1988 |\n"
+      "| [ 2166 ] | Stack            | https://www.xkcd.com/2166 |\n"
     ]
    }
   ],