fix: allow to explicitly initialize the pipeline (#189)

* feat: allow to explicitly initialize the pipeline

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* clean examples

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-10-30 17:54:53 +01:00 committed by GitHub
parent 43349865d0
commit 904d24d600
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -139,6 +139,10 @@ class DocumentConverter:
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
def initialize_pipeline(self, format: InputFormat):
"""Initialize the conversion pipeline for the selected format."""
self._get_pipeline(doc_format=format)
@validate_call(config=ConfigDict(strict=True)) @validate_call(config=ConfigDict(strict=True))
def convert( def convert(
self, self,
@@ -219,13 +223,13 @@ class DocumentConverter:
else: else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.") _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
def _get_pipeline(self, doc: InputDocument) -> Optional[BasePipeline]: def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
assert self.format_to_options is not None assert self.format_to_options is not None
fopt = self.format_to_options.get(doc.format) fopt = self.format_to_options.get(doc_format)
if fopt is None: if fopt is None:
raise RuntimeError(f"Could not get pipeline for document {doc.file}") raise RuntimeError(f"Could not get pipeline for {doc_format}")
else: else:
pipeline_class = fopt.pipeline_cls pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options pipeline_options = fopt.pipeline_options
@@ -256,7 +260,7 @@ class DocumentConverter:
self, in_doc: InputDocument, raises_on_error: bool self, in_doc: InputDocument, raises_on_error: bool
) -> ConversionResult: ) -> ConversionResult:
if in_doc.valid: if in_doc.valid:
pipeline = self._get_pipeline(in_doc) pipeline = self._get_pipeline(in_doc.format)
if pipeline is None: # Can't find a default pipeline. Should this raise? if pipeline is None: # Can't find a default pipeline. Should this raise?
if raises_on_error: if raises_on_error:
raise RuntimeError( raise RuntimeError(