diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index e04e280..19e8c1e 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -28,7 +28,7 @@ jobs:
         run: |
           for file in docs/examples/*.py; do
             # Skip batch_convert.py
-            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
+            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models)\.py$ ]]; then
                 echo "Skipping $file"
                 continue
             fi
diff --git a/docs/examples/rapidocr_with_custom_models.py b/docs/examples/rapidocr_with_custom_models.py
new file mode 100644
index 0000000..e6dd396
--- /dev/null
+++ b/docs/examples/rapidocr_with_custom_models.py
@@ -0,0 +1,59 @@
+import os
+
+from huggingface_hub import snapshot_download
+
+from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
+from docling.document_converter import (
+    ConversionResult,
+    DocumentConverter,
+    InputFormat,
+    PdfFormatOption,
+)
+
+
+def main() -> None:
+    """Convert a PDF with custom RapidOCR models and print it as Markdown."""
+    # Source document to convert
+    source = "https://arxiv.org/pdf/2408.09869v4"
+
+    # Download RapidOCR models from HuggingFace
+    print("Downloading RapidOCR models")
+    download_path = snapshot_download(repo_id="SWHL/RapidOCR")
+
+    # Set up RapidOcrOptions for English detection
+    det_model_path = os.path.join(
+        download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"
+    )
+    rec_model_path = os.path.join(
+        download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"
+    )
+    cls_model_path = os.path.join(
+        download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
+    )
+    ocr_options = RapidOcrOptions(
+        det_model_path=det_model_path,
+        rec_model_path=rec_model_path,
+        cls_model_path=cls_model_path,
+    )
+
+    pipeline_options = PdfPipelineOptions(
+        ocr_options=ocr_options,
+    )
+
+    # Convert the document
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            ),
+        },
+    )
+
+    conversion_result: ConversionResult = converter.convert(source=source)
+    doc = conversion_result.document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 0fcc2ca..abb93a2 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -77,6 +77,7 @@ nav:
       - "Multimodal export": examples/export_multimodal.py
       - "Force full page OCR": examples/full_page_ocr.py
       - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
+      - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
       - "Accelerator options": examples/run_with_accelerator.py
       - "Simple translation": examples/translate.py
       - examples/backend_xml_rag.ipynb