Some improvements

This commit is contained in:
Firq 2024-11-29 15:09:20 +01:00
parent b135ce28c5
commit 6c1d6762b5

104
nlp.ipynb
View file

@ -42,6 +42,34 @@
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"import json\n",
"import pathlib\n",
"\n",
"dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
"data = []\n",
"\n",
"# Load the JSONL dataset: one JSON object per line, collected into `data`.\n",
"# The encoding is explicit so decoding does not depend on the platform locale\n",
"# (assumes the file is UTF-8, the usual encoding for JSON - confirm for this dataset).\n",
"with dataset.open(\"r\", encoding=\"utf-8\") as f:\n",
"    # Iterate the file object directly: lines are streamed instead of being\n",
"    # materialised all at once by readlines().\n",
"    for line in f:\n",
"        if not line.strip():\n",
"            # Skip blank lines; json.loads would raise on them.\n",
"            continue\n",
"        record: dict = json.loads(line)\n",
"        # Combined text field: title, transcript and explanation joined by ' | '.\n",
"        record[\"text\"] = f\"{record['title']} | {record['transcript']} | {record['explanation']}\"\n",
"        # Subset of fields grouped under a single 'meta' key.\n",
"        record[\"meta\"] = {\n",
"            \"title\": record[\"title\"],\n",
"            \"url\": record[\"url\"],\n",
"            \"image_url\": record[\"image_url\"],\n",
"            \"id\": record[\"id\"],\n",
"        }\n",
"        data.append(record)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
@ -87,7 +115,7 @@
" <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n",
" <td>[A boy sits in a barrel which is floating in a...</td>\n",
" <td>This was the fifth comic originally posted to ...</td>\n",
" <td>barrel - part 1 | [a boy sits in a barrel whic...</td>\n",
" <td>Barrel - Part 1 | [A boy sits in a barrel whic...</td>\n",
" <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n",
" </tr>\n",
" <tr>\n",
@ -100,7 +128,7 @@
" <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n",
" <td>[Two trees are growing on opposite sides of a ...</td>\n",
" <td>This was the fourth comic originally posted to...</td>\n",
" <td>petit trees (sketch) | [two trees are growing ...</td>\n",
" <td>Petit Trees (sketch) | [Two trees are growing ...</td>\n",
" <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n",
" </tr>\n",
" <tr>\n",
@ -113,7 +141,7 @@
" <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n",
" <td>[A green island surrounded by blue water]\\nThi...</td>\n",
" <td>This was the third comic originally posted to ...</td>\n",
" <td>island (sketch) | [a green island surrounded b...</td>\n",
" <td>Island (sketch) | [A green island surrounded b...</td>\n",
" <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n",
" </tr>\n",
" <tr>\n",
@ -126,7 +154,7 @@
" <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n",
" <td>[A sketch of a landscape with sun on the horiz...</td>\n",
" <td>This was the second comic originally posted to...</td>\n",
" <td>landscape (sketch) | [a sketch of a landscape ...</td>\n",
" <td>Landscape (sketch) | [A sketch of a landscape ...</td>\n",
" <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n",
" </tr>\n",
" <tr>\n",
@ -139,7 +167,7 @@
" <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n",
" <td>[A black number 70 sees a red package with the...</td>\n",
" <td>This comic is a mathematical and technical jok...</td>\n",
" <td>blown apart | [a black number 70 sees a red pa...</td>\n",
" <td>Blown apart | [A black number 70 sees a red pa...</td>\n",
" <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n",
" </tr>\n",
" </tbody>\n",
@ -183,11 +211,11 @@
"4 This comic is a mathematical and technical jok... \n",
"\n",
" text \\\n",
"0 barrel - part 1 | [a boy sits in a barrel whic... \n",
"1 petit trees (sketch) | [two trees are growing ... \n",
"2 island (sketch) | [a green island surrounded b... \n",
"3 landscape (sketch) | [a sketch of a landscape ... \n",
"4 blown apart | [a black number 70 sees a red pa... \n",
"0 Barrel - Part 1 | [A boy sits in a barrel whic... \n",
"1 Petit Trees (sketch) | [Two trees are growing ... \n",
"2 Island (sketch) | [A green island surrounded b... \n",
"3 Landscape (sketch) | [A sketch of a landscape ... \n",
"4 Blown apart | [A black number 70 sees a red pa... \n",
"\n",
" meta \n",
"0 {'title': 'Barrel - Part 1', 'url': 'https://w... \n",
@ -197,40 +225,32 @@
"4 {'title': 'Blown apart', 'url': 'https://www.x... "
]
},
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import json\n",
"import pathlib\n",
"\n",
"import pandas as pd\n",
"\n",
"dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
"data = []\n",
"\n",
"with open(dataset, \"r\") as f:\n",
" for x in f.readlines():\n",
" j = json.loads(x)\n",
" j.update({ \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\".lower() })\n",
" j.update({ \"meta\": {\n",
" \"title\": j[\"title\"],\n",
" \"url\": j[\"url\"],\n",
" \"image_url\": j[\"image_url\"],\n",
" \"id\": j[\"id\"]\n",
" }})\n",
" data.append(j)\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from haystack import Document\n",
"\n",
"# Build one Document per loaded record: 'text' becomes the content, 'meta' the metadata.\n",
"documents = [\n",
"    Document(content=record[\"text\"], meta=record[\"meta\"])\n",
"    for record in data\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"metadata": {}
},
@ -238,7 +258,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6e3b2d47cde54d50a8a92305bda7fed4",
"model_id": "705b147162734470908bbf2ab6d45db3",
"version_major": 2,
"version_minor": 0
},
@ -253,13 +273,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Finished running pipeline\n",
"Finished running indexing pipeline\n",
"Result: Wrote 5343 to document store\n"
]
}
],
"source": [
"from haystack import Document, Pipeline\n",
"from haystack import Pipeline\n",
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
"from haystack.components.preprocessors.document_splitter import DocumentSplitter\n",
"from haystack.components.writers import DocumentWriter\n",
@ -268,7 +288,6 @@
"model_embeddings = \"BAAI/bge-small-en-v1.5\"\n",
"model_ranker = \"BAAI/bge-reranker-base\"\n",
"\n",
"results = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]\n",
"document_store = InMemoryDocumentStore()\n",
"\n",
"document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n",
@ -276,6 +295,7 @@
"document_writer = DocumentWriter(document_store)\n",
"\n",
"indexing_pipeline = Pipeline()\n",
"\n",
"indexing_pipeline.add_component(\"document_splitter\", document_splitter)\n",
"indexing_pipeline.add_component(\"document_embedder\", document_embedder)\n",
"indexing_pipeline.add_component(\"document_writer\", document_writer)\n",
@ -283,20 +303,20 @@
"indexing_pipeline.connect(\"document_splitter\", \"document_embedder\")\n",
"indexing_pipeline.connect(\"document_embedder\", \"document_writer\")\n",
"\n",
"res = indexing_pipeline.run({\"document_splitter\": {\"documents\": results}})\n",
"res = indexing_pipeline.run({\"document_splitter\": {\"documents\": documents}})\n",
"\n",
"print(f\"Finished running indexing pipeline\\nResult: Wrote {res['document_writer']['documents_written']} to document store\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7315fc1b10>\n",
"<haystack.core.pipeline.pipeline.Pipeline object at 0x7ef6cfa25900>\n",
"🚅 Components\n",
" - text_embedder: SentenceTransformersTextEmbedder\n",
" - embedding_retriever: InMemoryEmbeddingRetriever\n",
@ -310,7 +330,7 @@
" - document_joiner.documents -> ranker.documents (List[Document])"
]
},
"execution_count": 12,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -342,7 +362,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@ -351,7 +371,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"outputs": [
{
@ -366,7 +386,7 @@
"| [ 1906 ] | Making Progress | https://www.xkcd.com/1906 |\n",
"| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n",
"| [ 2347 ] | Dependency | https://www.xkcd.com/2347 |\n",
"| [ 1988 ] | Containers | https://www.xkcd.com/1988 |\n"
"| [ 2166 ] | Stack | https://www.xkcd.com/2166 |\n"
]
}
],