1 file changed: +38 -11 lines changed
Original file line number | Diff line number | Diff line change
393393 "cell_type": "code",
394394 "execution_count": 15,
395395 "id": "d250b618-8320-4237-84e6-d46d9d932613",
396- "metadata": {},
396+ "metadata": {
397+ "scrolled": true
398+ },
397399 "outputs": [
398400 {
399401 "data": {
507509 "finaldf.head()"
508510 ]
509511 },
510- {
511- "cell_type": "code",
512- "execution_count": 16,
513- "id": "c51f5e3d-e894-4b78-9e81-2a4a3a3ea02e",
514- "metadata": {},
515- "outputs": [],
516- "source": [
517- "finaldf.to_parquet('feature_repo/data/docling_samples.parquet', index=False)"
518- ]
519- },
520512 {
521513 "cell_type": "code",
522514 "execution_count": 17,
548540 "source": [
549541 "pdf_example"
550542 ]
543+ },
544+ {
545+ "cell_type": "code",
546+ "execution_count": 19,
547+ "id": "4006f71d-975b-4863-a133-681253d643f5",
548+ "metadata": {},
549+ "outputs": [],
550+ "source": [
551+ "import hashlib\n",
552+ "\n",
553+ "def generate_chunk_id(file_name: str, raw_chunk_markdown: str) -> str:\n",
554+ " \"\"\"Generate a unique chunk ID based on file_name and raw_chunk_markdown.\"\"\"\n",
555+ " unique_string = f\"{file_name}-{raw_chunk_markdown}\"\n",
556+ " return hashlib.sha256(unique_string.encode()).hexdigest()"
557+ ]
558+ },
559+ {
560+ "cell_type": "code",
561+ "execution_count": 20,
562+ "id": "8449ad8a-c8d3-4da4-9bf1-45939f6647a9",
563+ "metadata": {},
564+ "outputs": [],
565+ "source": [
566+ "finaldf[\"chunk_id\"] = finaldf.apply(lambda row: generate_chunk_id(row[\"file_name\"], row[\"raw_chunk_markdown\"]), axis=1)"
567+ ]
568+ },
569+ {
570+ "cell_type": "code",
571+ "execution_count": 21,
572+ "id": "c51f5e3d-e894-4b78-9e81-2a4a3a3ea02e",
573+ "metadata": {},
574+ "outputs": [],
575+ "source": [
576+ "finaldf.to_parquet('feature_repo/data/docling_samples.parquet', index=False)"
577+ ]
551578 }
552579 ],
553580 "metadata": {
You can’t perform that action at this time.
0 commit comments