Improve nested list parsing

### Question

In what ways can I improve or alter my approach to extracting the nested structure of a PDF?

What about using a VLM? Or is there a way to update its understanding of how to parse a nested list?

`import logging
from pathlib import Path
import time

# Configure the root logger so INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by main() below.
_log = logging.getLogger(__name__)

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.accelerator_options import AcceleratorOptions, AcceleratorDevice
from docling.document_converter import DocumentConverter, PdfFormatOption

def main():
    """Convert a fixed PDF with Docling and export it as Markdown and HTML.

    The input is ``/content/00-Begleitschreiben.pdf``. The pipeline runs
    with OCR (language ``"de"``) and table-structure recognition with cell
    matching enabled. On success, ``<stem>.md`` and ``<stem>.html`` are
    written to ``/content`` and a short summary is printed.

    Raises:
        Exception: Any error raised during conversion or export is logged
            together with its traceback and then re-raised unchanged.
    """
    input_doc_path = Path("/content/00-Begleitschreiben.pdf")
    output_dir = Path("/content")

    # Docling Parse with EasyOCR (default configuration)
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Set language for OCR (adjust as needed for your document)
    pipeline_options.ocr_options.lang = ["de"]  # German

    # Configure accelerator options for better performance
    pipeline_options.accelerator_options = AcceleratorOptions(
        num_threads=4,
        device=AcceleratorDevice.AUTO,  # Will use GPU if available, otherwise CPU
    )

    # Create converter with default Docling Parse backend
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

    _log.info("Starting conversion with Docling Parse + EasyOCR...")
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    try:
        conv_result = doc_converter.convert(input_doc_path)
        elapsed = time.perf_counter() - start_time

        _log.info("Document converted in %.2f seconds.", elapsed)

        ## Export results
        output_dir.mkdir(parents=True, exist_ok=True)
        doc_filename = conv_result.input.file.stem
        document = conv_result.document

        # Render each format BEFORE touching the file, so a failing export
        # does not leave an empty/truncated output file behind.
        # Export Markdown format:
        markdown_text = document.export_to_markdown()
        (output_dir / f"{doc_filename}.md").write_text(
            markdown_text, encoding="utf-8"
        )

        # Export HTML format:
        html_text = document.export_to_html()
        (output_dir / f"{doc_filename}.html").write_text(
            html_text, encoding="utf-8"
        )

        print("✓ Conversion complete!")
        print("✓ Files exported:")
        print(f"  - {doc_filename}.md")
        print(f"  - {doc_filename}.html")

    except Exception as e:
        # Logger.exception records the message at ERROR level and appends
        # the active traceback, replacing the separate traceback.print_exc().
        _log.exception("Conversion failed: %s", e)
        raise

if __name__ == "__main__":
    main()`
...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Improve nested list parsing #2562

Question

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Improve nested list parsing #2562

Description

Question

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions