From 64ac043786efdece0c61827051a5b41dddf6c5d7 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Thu, 19 Jun 2025 04:10:40 -0500 Subject: [PATCH] docs: support running examples from root or subfolder (#1816) support running examples from root or subfolder Signed-off-by: Michele Dolfi --- docs/examples/batch_convert.py | 11 ++++++----- docs/examples/custom_convert.py | 3 ++- docs/examples/develop_formula_understanding.py | 3 ++- docs/examples/develop_picture_enrichment.py | 3 ++- docs/examples/export_figures.py | 3 ++- docs/examples/export_multimodal.py | 3 ++- docs/examples/export_tables.py | 3 ++- docs/examples/full_page_ocr.py | 5 +++-- docs/examples/pictures_description_api.py | 3 ++- docs/examples/run_with_accelerator.py | 5 +++-- docs/examples/tesseract_lang_detection.py | 5 +++-- docs/examples/translate.py | 3 ++- docs/examples/vlm_pipeline_api_model.py | 4 ++-- 13 files changed, 33 insertions(+), 21 deletions(-) diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index 25eb2ba..6a4da7d 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -121,14 +121,15 @@ def export_documents( def main(): logging.basicConfig(level=logging.INFO) + data_folder = Path(__file__).parent / "../../tests/data" input_doc_paths = [ - Path("./tests/data/pdf/2206.01062.pdf"), - Path("./tests/data/pdf/2203.01017v2.pdf"), - Path("./tests/data/pdf/2305.03393v1.pdf"), - Path("./tests/data/pdf/redp5110_sampled.pdf"), + data_folder / "pdf/2206.01062.pdf", + data_folder / "pdf/2203.01017v2.pdf", + data_folder / "pdf/2305.03393v1.pdf", + data_folder / "pdf/redp5110_sampled.pdf", ] - # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) + # buf = BytesIO((data_folder / "pdf/2206.01062.pdf").open("rb").read()) # docs = [DocumentStream(name="my_doc.pdf", stream=buf)] # input = DocumentConversionInput.from_streams(docs) diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 12dfacd..6a90075 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -16,7 +16,8 @@ _log = logging.getLogger(__name__) def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" ########################################################################### diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index beb1575..8b90613 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -71,7 +71,8 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2203.01017v2.pdf" pipeline_options = ExampleFormulaUnderstandingPipelineOptions() pipeline_options.do_formula_understanding = True diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py index 9e3d306..f027898 100644 --- a/docs/examples/develop_picture_enrichment.py +++ b/docs/examples/develop_picture_enrichment.py @@ -76,7 +76,8 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" pipeline_options = ExamplePictureClassifierPipelineOptions() pipeline_options.images_scale = 2.0 diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py index 8ed14a7..ab9d1e9 100644 --- a/docs/examples/export_figures.py +++ b/docs/examples/export_figures.py @@ -16,7 +16,8 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" output_dir = Path("scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py index bef74bf..addbe37 100644 --- a/docs/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -19,7 +19,8 @@ IMAGE_RESOLUTION_SCALE = 2.0 def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" output_dir = Path("scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py index 9a911d8..4d6c2b4 100644 --- a/docs/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -12,7 +12,8 @@ _log = logging.getLogger(__name__) def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" output_dir = Path("scratch") doc_converter = DocumentConverter() diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py index 5525e87..7ff5f82 100644 --- a/docs/examples/full_page_ocr.py +++ b/docs/examples/full_page_ocr.py @@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption def main(): - input_doc = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" pipeline_options = PdfPipelineOptions() pipeline_options.do_ocr = True @@ -32,7 +33,7 @@ def main(): } ) - doc = converter.convert(input_doc).document + doc = converter.convert(input_doc_path).document md = doc.export_to_markdown() print(md) diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py index 938d0a5..44ffc22 100644 --- a/docs/examples/pictures_description_api.py +++ b/docs/examples/pictures_description_api.py @@ -96,7 +96,8 @@ def watsonx_vlm_options(): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" pipeline_options = PdfPipelineOptions( enable_remote_services=True # <-- this is required! diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py index 6b3ddc6..31d467d 100644 --- a/docs/examples/run_with_accelerator.py +++ b/docs/examples/run_with_accelerator.py @@ -10,7 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption def main(): - input_doc = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" # Explicitly set the accelerator # accelerator_options = AcceleratorOptions( @@ -47,7 +48,7 @@ def main(): settings.debug.profile_pipeline_timings = True # Convert the document - conversion_result = converter.convert(input_doc) + conversion_result = converter.convert(input_doc_path) doc = conversion_result.document # List with total time per document diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py index 37859b9..bb11708 100644 --- a/docs/examples/tesseract_lang_detection.py +++ b/docs/examples/tesseract_lang_detection.py @@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption def main(): - input_doc = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions # ocr_options = TesseractOcrOptions(lang=["auto"]) @@ -27,7 +28,7 @@ def main(): } ) - doc = converter.convert(input_doc).document + doc = converter.convert(input_doc_path).document md = doc.export_to_markdown() print(md) diff --git a/docs/examples/translate.py b/docs/examples/translate.py index f2711a2..4698168 100644 --- a/docs/examples/translate.py +++ b/docs/examples/translate.py @@ -30,7 +30,8 @@ def translate(text: str, src: str = "en", dest: str = "de"): def main(): logging.basicConfig(level=logging.INFO) - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2206.01062.pdf" output_dir = Path("scratch") # Important: For operating with page images, we must keep them, otherwise the DocumentConverter diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 20ca259..679f7bd 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -95,8 +95,8 @@ def watsonx_vlm_options(model: str, prompt: str): def main(): logging.basicConfig(level=logging.INFO) - # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") - input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf") + data_folder = Path(__file__).parent / "../../tests/data" + input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf" pipeline_options = VlmPipelineOptions( enable_remote_services=True # <-- this is required!