docs: update custom convert and dockerfile (#226)

* docs: remove old code from custom_convert.py

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* docs: update example Dockerfile

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2024-11-04 14:27:40 +01:00 · 2024-11-04 14:27:40 +01:00 · 5f5fea90a9
commit 5f5fea90a9
parent 41acaa9e2e
2 changed files with 42 additions and 30 deletions
--- a/2
+++ b/2
@@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
 COPY examples/minimal.py /root/minimal.py

 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
-RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
+RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'

 # On container environments, always set a thread budget to avoid undesired thread congestion.
 ENV OMP_NUM_THREADS=4
--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@@ -3,9 +3,12 @@ import logging
 import time
 from pathlib import Path

+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
+from docling.models.tesseract_ocr_model import TesseractOcrOptions

 _log = logging.getLogger(__name__)

@@ -23,32 +26,51 @@ def main():

    # PyPdfium without EasyOCR
    # --------------------
-    # pipeline_options = PipelineOptions()
-    # pipeline_options.do_ocr=False
-    # pipeline_options.do_table_structure=True
+    # pipeline_options = PdfPipelineOptions()
+    # pipeline_options.do_ocr = False
+    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = False

    # doc_converter = DocumentConverter(
-    #     pipeline_options=pipeline_options,
-    #     pdf_backend=PyPdfiumDocumentBackend,
+    #     format_options={
+    #         InputFormat.PDF: PdfFormatOption(
+    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
+    #         )
+    #     }
    # )

    # PyPdfium with EasyOCR
    # -----------------
-    # pipeline_options = PipelineOptions()
-    # pipeline_options.do_ocr=True
-    # pipeline_options.do_table_structure=True
+    # pipeline_options = PdfPipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True

    # doc_converter = DocumentConverter(
-    #     pipeline_options=pipeline_options,
-    #     pdf_backend=PyPdfiumDocumentBackend,
+    #     format_options={
+    #         InputFormat.PDF: PdfFormatOption(
+    #             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
+    #         )
+    #     }
    # )

    # Docling Parse without EasyOCR
    # -------------------------
+    # pipeline_options = PdfPipelineOptions()
+    # pipeline_options.do_ocr = False
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     format_options={
+    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+    #     }
+    # )
+
+    # Docling Parse with EasyOCR
+    # ----------------------
    pipeline_options = PdfPipelineOptions()
-    pipeline_options.do_ocr = False
+    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

@@ -58,42 +80,32 @@ def main():
        }
    )

-    # Docling Parse with EasyOCR
-    # ----------------------
-    # pipeline_options = PipelineOptions()
-    # pipeline_options.do_ocr=True
-    # pipeline_options.do_table_structure=True
-    # pipeline_options.table_structure_options.do_cell_matching = True
-
-    # doc_converter = DocumentConverter(
-    #     pipeline_options=pipeline_options,
-    #     pdf_backend=DoclingParseDocumentBackend,
-    # )
-
    # Docling Parse with Tesseract
    # ----------------------
-    # pipeline_options = PipelineOptions()
+    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractOcrOptions()

    # doc_converter = DocumentConverter(
-    #     pipeline_options=pipeline_options,
-    #     pdf_backend=DoclingParseDocumentBackend,
+    #     format_options={
+    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+    #     }
    # )

    # Docling Parse with Tesseract CLI
    # ----------------------
-    # pipeline_options = PipelineOptions()
+    # pipeline_options = PdfPipelineOptions()
    # pipeline_options.do_ocr = True
    # pipeline_options.do_table_structure = True
    # pipeline_options.table_structure_options.do_cell_matching = True
    # pipeline_options.ocr_options = TesseractCliOcrOptions()

    # doc_converter = DocumentConverter(
-    #     pipeline_options=pipeline_options,
-    #     pdf_backend=DoclingParseDocumentBackend,
+    #     format_options={
+    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+    #     }
    # )

    ###########################################################################