Skip to content

Improve nested list parsing #2562

@darksoothingshadow

Description

@darksoothingshadow

Question

In what ways can I improve/alter my approach to extracting the nested structure of a pdf?

What about using a VLM? Or is there a way to update its understanding of how to parse a nested list?

`import logging
from pathlib import Path
import time

# Emit INFO-level (and above) records to the root handler.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module (`__name__`, not a bare `name`).
_log = logging.getLogger(__name__)

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.document_converter import DocumentConverter, PdfFormatOption

def main():
    """Convert a PDF with Docling and export the result as Markdown and HTML.

    The input path and output directory are hard-coded. OCR (German) and
    table-structure recognition with cell matching are enabled on the PDF
    pipeline. The exported files are named after the input file's stem.

    Raises:
        Exception: any error raised during conversion or export is logged
            with its traceback and then re-raised unchanged.
    """
    input_doc_path = Path("/content/00-Begleitschreiben.pdf")
    output_dir = Path("/content")

    # Docling Parse with EasyOCR (default configuration)
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Set language for OCR (adjust as needed for your document)
    pipeline_options.ocr_options.lang = ["de"]  # German

    # Configure accelerator options for better performance
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4,
        device=AcceleratorDevice.AUTO,  # Will use GPU if available, otherwise CPU
    )

    # Create converter with default Docling Parse backend
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )

    _log.info("Starting conversion with Docling Parse + EasyOCR...")
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    try:
        conv_result = doc_converter.convert(input_doc_path)
        elapsed = time.perf_counter() - start_time

        _log.info("Document converted in %.2f seconds.", elapsed)

        # Export results next to each other, named after the input file.
        doc_filename = conv_result.input.file.stem

        # Export Markdown format:
        (output_dir / f"{doc_filename}.md").write_text(
            conv_result.document.export_to_markdown(), encoding="utf-8"
        )

        # Export HTML format:
        (output_dir / f"{doc_filename}.html").write_text(
            conv_result.document.export_to_html(), encoding="utf-8"
        )

        print("✓ Conversion complete!")
        print("✓ Files exported:")
        print(f"  - {doc_filename}.md")
        print(f"  - {doc_filename}.html")

    except Exception as e:
        # Top-level boundary: record the failure with its traceback, then let
        # the caller see the original exception.
        _log.exception("Conversion failed: %s", e)
        raise

if name == "main":
main()`
...

Metadata

Metadata

Assignees

No one assigned

    Labels

    question: Further information is requested

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions