docs: update custom convert and dockerfile (#226)

* docs: remove old code from custom_convert.py

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* docs: update example Dockerfile

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-11-04 14:27:40 +01:00 committed by GitHub
parent 41acaa9e2e
commit 5f5fea90a9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 42 additions and 30 deletions

View File

@@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
COPY examples/minimal.py /root/minimal.py COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);' RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
# On container environments, always set a thread budget to avoid undesired thread congestion. # On container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4 ENV OMP_NUM_THREADS=4

View File

@@ -3,9 +3,12 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@@ -23,32 +26,51 @@ def main():
# PyPdfium without EasyOCR # PyPdfium without EasyOCR
# -------------------- # --------------------
# pipeline_options = PipelineOptions() # pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr=False # pipeline_options.do_ocr = False
# pipeline_options.do_table_structure=True # pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = False # pipeline_options.table_structure_options.do_cell_matching = False
# doc_converter = DocumentConverter( # doc_converter = DocumentConverter(
# pipeline_options=pipeline_options, # format_options={
# pdf_backend=PyPdfiumDocumentBackend, # InputFormat.PDF: PdfFormatOption(
# pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
# )
# }
# ) # )
# PyPdfium with EasyOCR # PyPdfium with EasyOCR
# ----------------- # -----------------
# pipeline_options = PipelineOptions() # pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr=True # pipeline_options.do_ocr = True
# pipeline_options.do_table_structure=True # pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter( # doc_converter = DocumentConverter(
# pipeline_options=pipeline_options, # format_options={
# pdf_backend=PyPdfiumDocumentBackend, # InputFormat.PDF: PdfFormatOption(
# pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
# )
# }
# ) # )
# Docling Parse without EasyOCR # Docling Parse without EasyOCR
# ------------------------- # -------------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = False
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# format_options={
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# )
# Docling Parse with EasyOCR
# ----------------------
pipeline_options = PdfPipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
@@ -58,42 +80,32 @@ def main():
} }
) )
# Docling Parse with EasyOCR
# ----------------------
# pipeline_options = PipelineOptions()
# pipeline_options.do_ocr=True
# pipeline_options.do_table_structure=True
# pipeline_options.table_structure_options.do_cell_matching = True
# doc_converter = DocumentConverter(
# pipeline_options=pipeline_options,
# pdf_backend=DoclingParseDocumentBackend,
# )
# Docling Parse with Tesseract # Docling Parse with Tesseract
# ---------------------- # ----------------------
# pipeline_options = PipelineOptions() # pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True # pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True # pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractOcrOptions() # pipeline_options.ocr_options = TesseractOcrOptions()
# doc_converter = DocumentConverter( # doc_converter = DocumentConverter(
# pipeline_options=pipeline_options, # format_options={
# pdf_backend=DoclingParseDocumentBackend, # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# ) # )
# Docling Parse with Tesseract CLI # Docling Parse with Tesseract CLI
# ---------------------- # ----------------------
# pipeline_options = PipelineOptions() # pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True # pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True # pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions() # pipeline_options.ocr_options = TesseractCliOcrOptions()
# doc_converter = DocumentConverter( # doc_converter = DocumentConverter(
# pipeline_options=pipeline_options, # format_options={
# pdf_backend=DoclingParseDocumentBackend, # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# ) # )
########################################################################### ###########################################################################