Skip to content

Commit fea2a5e

Browse files
adding chunk id
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
1 parent 38f8c23 commit fea2a5e

File tree

1 file changed

+38 -11 lines changed

1 file changed

+38 -11 lines changed

examples/rag-docling/docling-demo.ipynb

Lines changed: 38 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -393,7 +393,9 @@
393393
"cell_type": "code",
394394
"execution_count": 15,
395395
"id": "d250b618-8320-4237-84e6-d46d9d932613",
396-
"metadata": {},
396+
"metadata": {
397+
"scrolled": true
398+
},
397399
"outputs": [
398400
{
399401
"data": {
@@ -507,16 +509,6 @@
507509
"finaldf.head()"
508510
]
509511
},
510-
{
511-
"cell_type": "code",
512-
"execution_count": 16,
513-
"id": "c51f5e3d-e894-4b78-9e81-2a4a3a3ea02e",
514-
"metadata": {},
515-
"outputs": [],
516-
"source": [
517-
"finaldf.to_parquet('feature_repo/data/docling_samples.parquet', index=False)"
518-
]
519-
},
520512
{
521513
"cell_type": "code",
522514
"execution_count": 17,
@@ -548,6 +540,41 @@
548540
"source": [
549541
"pdf_example"
550542
]
543+
},
544+
{
545+
"cell_type": "code",
546+
"execution_count": 19,
547+
"id": "4006f71d-975b-4863-a133-681253d643f5",
548+
"metadata": {},
549+
"outputs": [],
550+
"source": [
551+
"import hashlib\n",
552+
"\n",
553+
"def generate_chunk_id(file_name: str, raw_chunk_markdown: str) -> str:\n",
554+
" \"\"\"Generate a unique chunk ID based on file_name and raw_chunk_markdown.\"\"\"\n",
555+
" unique_string = f\"{file_name}-{raw_chunk_markdown}\"\n",
556+
" return hashlib.sha256(unique_string.encode()).hexdigest()"
557+
]
558+
},
559+
{
560+
"cell_type": "code",
561+
"execution_count": 20,
562+
"id": "8449ad8a-c8d3-4da4-9bf1-45939f6647a9",
563+
"metadata": {},
564+
"outputs": [],
565+
"source": [
566+
"finaldf[\"chunk_id\"] = finaldf.apply(lambda row: generate_chunk_id(row[\"file_name\"], row[\"raw_chunk_markdown\"]), axis=1)"
567+
]
568+
},
569+
{
570+
"cell_type": "code",
571+
"execution_count": 21,
572+
"id": "c51f5e3d-e894-4b78-9e81-2a4a3a3ea02e",
573+
"metadata": {},
574+
"outputs": [],
575+
"source": [
576+
"finaldf.to_parquet('feature_repo/data/docling_samples.parquet', index=False)"
577+
]
551578
}
552579
],
553580
"metadata": {

0 commit comments

Comments (0)