Some improvements
This commit is contained in:
parent
b135ce28c5
commit
6c1d6762b5
1 changed files with 62 additions and 42 deletions
104
nlp.ipynb
104
nlp.ipynb
|
@ -42,6 +42,34 @@
|
|||
"metadata": {
|
||||
"metadata": {}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import pathlib\n",
|
||||
"\n",
|
||||
"dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
|
||||
"data = []\n",
|
||||
"\n",
|
||||
"with open(dataset, \"r\") as f:\n",
|
||||
" for x in f.readlines():\n",
|
||||
" j: dict = json.loads(x)\n",
|
||||
" j.update({\n",
|
||||
" \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\",\n",
|
||||
" \"meta\": {\n",
|
||||
" \"title\": j[\"title\"],\n",
|
||||
" \"url\": j[\"url\"],\n",
|
||||
" \"image_url\": j[\"image_url\"],\n",
|
||||
" \"id\": j[\"id\"]\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
" )\n",
|
||||
" data.append(j)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
|
@ -87,7 +115,7 @@
|
|||
" <td>https://www.explainxkcd.com/wiki/index.php/1:_...</td>\n",
|
||||
" <td>[A boy sits in a barrel which is floating in a...</td>\n",
|
||||
" <td>This was the fifth comic originally posted to ...</td>\n",
|
||||
" <td>barrel - part 1 | [a boy sits in a barrel whic...</td>\n",
|
||||
" <td>Barrel - Part 1 | [A boy sits in a barrel whic...</td>\n",
|
||||
" <td>{'title': 'Barrel - Part 1', 'url': 'https://w...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
|
@ -100,7 +128,7 @@
|
|||
" <td>https://www.explainxkcd.com/wiki/index.php/2:_...</td>\n",
|
||||
" <td>[Two trees are growing on opposite sides of a ...</td>\n",
|
||||
" <td>This was the fourth comic originally posted to...</td>\n",
|
||||
" <td>petit trees (sketch) | [two trees are growing ...</td>\n",
|
||||
" <td>Petit Trees (sketch) | [Two trees are growing ...</td>\n",
|
||||
" <td>{'title': 'Petit Trees (sketch)', 'url': 'http...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
|
@ -113,7 +141,7 @@
|
|||
" <td>https://www.explainxkcd.com/wiki/index.php/3:_...</td>\n",
|
||||
" <td>[A green island surrounded by blue water]\\nThi...</td>\n",
|
||||
" <td>This was the third comic originally posted to ...</td>\n",
|
||||
" <td>island (sketch) | [a green island surrounded b...</td>\n",
|
||||
" <td>Island (sketch) | [A green island surrounded b...</td>\n",
|
||||
" <td>{'title': 'Island (sketch)', 'url': 'https://w...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
|
@ -126,7 +154,7 @@
|
|||
" <td>https://www.explainxkcd.com/wiki/index.php/4:_...</td>\n",
|
||||
" <td>[A sketch of a landscape with sun on the horiz...</td>\n",
|
||||
" <td>This was the second comic originally posted to...</td>\n",
|
||||
" <td>landscape (sketch) | [a sketch of a landscape ...</td>\n",
|
||||
" <td>Landscape (sketch) | [A sketch of a landscape ...</td>\n",
|
||||
" <td>{'title': 'Landscape (sketch)', 'url': 'https:...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
|
@ -139,7 +167,7 @@
|
|||
" <td>https://www.explainxkcd.com/wiki/index.php/5:_...</td>\n",
|
||||
" <td>[A black number 70 sees a red package with the...</td>\n",
|
||||
" <td>This comic is a mathematical and technical jok...</td>\n",
|
||||
" <td>blown apart | [a black number 70 sees a red pa...</td>\n",
|
||||
" <td>Blown apart | [A black number 70 sees a red pa...</td>\n",
|
||||
" <td>{'title': 'Blown apart', 'url': 'https://www.x...</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
|
@ -183,11 +211,11 @@
|
|||
"4 This comic is a mathematical and technical jok... \n",
|
||||
"\n",
|
||||
" text \\\n",
|
||||
"0 barrel - part 1 | [a boy sits in a barrel whic... \n",
|
||||
"1 petit trees (sketch) | [two trees are growing ... \n",
|
||||
"2 island (sketch) | [a green island surrounded b... \n",
|
||||
"3 landscape (sketch) | [a sketch of a landscape ... \n",
|
||||
"4 blown apart | [a black number 70 sees a red pa... \n",
|
||||
"0 Barrel - Part 1 | [A boy sits in a barrel whic... \n",
|
||||
"1 Petit Trees (sketch) | [Two trees are growing ... \n",
|
||||
"2 Island (sketch) | [A green island surrounded b... \n",
|
||||
"3 Landscape (sketch) | [A sketch of a landscape ... \n",
|
||||
"4 Blown apart | [A black number 70 sees a red pa... \n",
|
||||
"\n",
|
||||
" meta \n",
|
||||
"0 {'title': 'Barrel - Part 1', 'url': 'https://w... \n",
|
||||
|
@ -197,40 +225,32 @@
|
|||
"4 {'title': 'Blown apart', 'url': 'https://www.x... "
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import json\n",
|
||||
"import pathlib\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"dataset = pathlib.Path(\"data\") / \"dataset.jsonl\"\n",
|
||||
"data = []\n",
|
||||
"\n",
|
||||
"with open(dataset, \"r\") as f:\n",
|
||||
" for x in f.readlines():\n",
|
||||
" j = json.loads(x)\n",
|
||||
" j.update({ \"text\": f\"{j['title']} | {j['transcript']} | {j['explanation']}\".lower() })\n",
|
||||
" j.update({ \"meta\": {\n",
|
||||
" \"title\": j[\"title\"],\n",
|
||||
" \"url\": j[\"url\"],\n",
|
||||
" \"image_url\": j[\"image_url\"],\n",
|
||||
" \"id\": j[\"id\"]\n",
|
||||
" }})\n",
|
||||
" data.append(j)\n",
|
||||
"\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from haystack import Document\n",
|
||||
"\n",
|
||||
"documents = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"metadata": {}
|
||||
},
|
||||
|
@ -238,7 +258,7 @@
|
|||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6e3b2d47cde54d50a8a92305bda7fed4",
|
||||
"model_id": "705b147162734470908bbf2ab6d45db3",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
|
@ -253,13 +273,13 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Finished running pipeline\n",
|
||||
"Finished running indexing pipeline\n",
|
||||
"Result: Wrote 5343 to document store\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from haystack import Document, Pipeline\n",
|
||||
"from haystack import Pipeline\n",
|
||||
"from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n",
|
||||
"from haystack.components.preprocessors.document_splitter import DocumentSplitter\n",
|
||||
"from haystack.components.writers import DocumentWriter\n",
|
||||
|
@ -268,7 +288,6 @@
|
|||
"model_embeddings = \"BAAI/bge-small-en-v1.5\"\n",
|
||||
"model_ranker = \"BAAI/bge-reranker-base\"\n",
|
||||
"\n",
|
||||
"results = [Document(content=d[\"text\"], meta=d[\"meta\"]) for d in data]\n",
|
||||
"document_store = InMemoryDocumentStore()\n",
|
||||
"\n",
|
||||
"document_splitter = DocumentSplitter(split_by=\"word\", split_length=512, split_overlap=32)\n",
|
||||
|
@ -276,6 +295,7 @@
|
|||
"document_writer = DocumentWriter(document_store)\n",
|
||||
"\n",
|
||||
"indexing_pipeline = Pipeline()\n",
|
||||
"\n",
|
||||
"indexing_pipeline.add_component(\"document_splitter\", document_splitter)\n",
|
||||
"indexing_pipeline.add_component(\"document_embedder\", document_embedder)\n",
|
||||
"indexing_pipeline.add_component(\"document_writer\", document_writer)\n",
|
||||
|
@ -283,20 +303,20 @@
|
|||
"indexing_pipeline.connect(\"document_splitter\", \"document_embedder\")\n",
|
||||
"indexing_pipeline.connect(\"document_embedder\", \"document_writer\")\n",
|
||||
"\n",
|
||||
"res = indexing_pipeline.run({\"document_splitter\": {\"documents\": results}})\n",
|
||||
"res = indexing_pipeline.run({\"document_splitter\": {\"documents\": documents}})\n",
|
||||
"\n",
|
||||
"print(f\"Finished running indexing pipeline\\nResult: Wrote {res['document_writer']['documents_written']} to document store\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<haystack.core.pipeline.pipeline.Pipeline object at 0x7f7315fc1b10>\n",
|
||||
"<haystack.core.pipeline.pipeline.Pipeline object at 0x7ef6cfa25900>\n",
|
||||
"🚅 Components\n",
|
||||
" - text_embedder: SentenceTransformersTextEmbedder\n",
|
||||
" - embedding_retriever: InMemoryEmbeddingRetriever\n",
|
||||
|
@ -310,7 +330,7 @@
|
|||
" - document_joiner.documents -> ranker.documents (List[Document])"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -342,7 +362,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -351,7 +371,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -366,7 +386,7 @@
|
|||
"| [ 1906 ] | Making Progress | https://www.xkcd.com/1906 |\n",
|
||||
"| [ 2102 ] | Internet Archive | https://www.xkcd.com/2102 |\n",
|
||||
"| [ 2347 ] | Dependency | https://www.xkcd.com/2347 |\n",
|
||||
"| [ 1988 ] | Containers | https://www.xkcd.com/1988 |\n"
|
||||
"| [ 2166 ] | Stack | https://www.xkcd.com/2166 |\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
Loading…
Add table
Reference in a new issue