docs: update custom convert and dockerfile (#226)
* docs: remove old code from custom_convert.py

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* docs: update example Dockerfile

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
41acaa9e2e
commit
5f5fea90a9
@@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/
|
||||
COPY examples/minimal.py /root/minimal.py
|
||||
|
||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||
RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);'
|
||||
|
||||
# On container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
|
@@ -3,9 +3,12 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@@ -23,32 +26,51 @@ def main():
|
||||
|
||||
# PyPdfium without EasyOCR
|
||||
# --------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=False
|
||||
# pipeline_options.do_table_structure=True
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = False
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = False
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(
|
||||
# pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
|
||||
# )
|
||||
# }
|
||||
# )
|
||||
|
||||
# PyPdfium with EasyOCR
|
||||
# -----------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
# pipeline_options.do_table_structure=True
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(
|
||||
# pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
|
||||
# )
|
||||
# }
|
||||
# )
|
||||
|
||||
# Docling Parse without EasyOCR
|
||||
# -------------------------
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = False
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
# }
|
||||
# )
|
||||
|
||||
# Docling Parse with EasyOCR
|
||||
# ----------------------
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_ocr = True
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
@@ -58,42 +80,32 @@ def main():
|
||||
}
|
||||
)
|
||||
|
||||
# Docling Parse with EasyOCR
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
# pipeline_options.do_table_structure=True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
# }
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract CLI
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractCliOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
# }
|
||||
# )
|
||||
|
||||
###########################################################################
|
||||
|
Loading…
Reference in New Issue
Block a user