-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Open
Labels
question: Further information is requested
Description
Question
In what ways can I improve/alter my approach to extracting the nested structure of a pdf?
What about using a VLM? Or is there a way to update its understanding of how to parse a nested list?
`import logging
from pathlib import Path
import time
# Set the root logger to INFO so the progress messages emitted below are shown.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module.
_log = logging.getLogger(__name__)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
    """Convert one PDF with Docling and export it as Markdown and HTML.

    The input file and the output directory are hard-coded under
    ``/content``. The pipeline is configured with OCR (language ``de``),
    table-structure recognition with cell matching, and four accelerator
    threads.

    Raises:
        Exception: any error raised during conversion or export is logged
            together with its traceback and then re-raised unchanged.
    """
    input_doc_path = Path("/content/00-Begleitschreiben.pdf")
    output_dir = Path("/content")

    # Docling Parse with EasyOCR (default configuration)
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Set language for OCR (adjust as needed for your document)
    pipeline_options.ocr_options.lang = ["de"]  # German

    # Configure accelerator options for better performance
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4,
        device=AcceleratorDevice.AUTO,  # Will use GPU if available, otherwise CPU
    )

    # Create converter with default Docling Parse backend
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )

    _log.info("Starting conversion with Docling Parse + EasyOCR...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a long conversion.
    start_time = time.perf_counter()

    try:
        conv_result = doc_converter.convert(input_doc_path)
        elapsed = time.perf_counter() - start_time
        _log.info("Document converted in %.2f seconds.", elapsed)

        # Export results, named after the input file's stem.
        doc_filename = conv_result.input.file.stem

        # Export Markdown format:
        with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
            fp.write(conv_result.document.export_to_markdown())

        # Export HTML format:
        with (output_dir / f"{doc_filename}.html").open("w", encoding="utf-8") as fp:
            fp.write(conv_result.document.export_to_html())

        print("✓ Conversion complete!")
        print("✓ Files exported:")
        print(f" - {doc_filename}.md")
        print(f" - {doc_filename}.html")
    except Exception as e:
        # Logger.exception records the message at ERROR level and appends the
        # active traceback, replacing the manual traceback.print_exc() call.
        _log.exception("Conversion failed: %s", e)
        raise
if name == "main":
main()`
...
Metadata
Metadata
Assignees
Labels
question: Further information is requested