fix(converter): Cache same pipeline class with different options (#1152)
* Update document_converter.py

  Fix caching of the same pipeline class with different options by using a composite key (class, options). This addresses the comment "# TODO this will ignore if different options have been defined for the same pipeline class." at row 292.

  Signed-off-by: mislavmartinic <mislav.martinic@pontistechnology.com>
* formatted script
* removed unnecessary hasattr check
* pre-commit chain run
---------
Signed-off-by: mislavmartinic <mislav.martinic@pontistechnology.com>
This commit is contained in:
parent
6df8827231
commit
825b226fab
@ -1,3 +1,4 @@
|
|||||||
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import sys
|
import sys
|
||||||
@ -181,7 +182,14 @@ class DocumentConverter:
|
|||||||
)
|
)
|
||||||
for format in self.allowed_formats
|
for format in self.allowed_formats
|
||||||
}
|
}
|
||||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
self.initialized_pipelines: Dict[
|
||||||
|
Tuple[Type[BasePipeline], str], BasePipeline
|
||||||
|
] = {}
|
||||||
|
|
||||||
|
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
||||||
|
"""Generate a hash of pipeline options to use as part of the cache key."""
|
||||||
|
options_str = str(pipeline_options.model_dump())
|
||||||
|
return hashlib.md5(options_str.encode("utf-8")).hexdigest()
|
||||||
|
|
||||||
def initialize_pipeline(self, format: InputFormat):
|
def initialize_pipeline(self, format: InputFormat):
|
||||||
"""Initialize the conversion pipeline for the selected format."""
|
"""Initialize the conversion pipeline for the selected format."""
|
||||||
@ -279,31 +287,36 @@ class DocumentConverter:
|
|||||||
yield item
|
yield item
|
||||||
|
|
||||||
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
||||||
|
"""Retrieve or initialize a pipeline, reusing instances based on class and options."""
|
||||||
fopt = self.format_to_options.get(doc_format)
|
fopt = self.format_to_options.get(doc_format)
|
||||||
|
|
||||||
if fopt is None:
|
if fopt is None or fopt.pipeline_options is None:
|
||||||
return None
|
return None
|
||||||
else:
|
|
||||||
pipeline_class = fopt.pipeline_cls
|
pipeline_class = fopt.pipeline_cls
|
||||||
pipeline_options = fopt.pipeline_options
|
pipeline_options = fopt.pipeline_options
|
||||||
|
options_hash = self._get_pipeline_options_hash(pipeline_options)
|
||||||
|
|
||||||
if pipeline_options is None:
|
# Use a composite key to cache pipelines
|
||||||
return None
|
cache_key = (pipeline_class, options_hash)
|
||||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
|
||||||
if (
|
if cache_key not in self.initialized_pipelines:
|
||||||
pipeline_class not in self.initialized_pipelines
|
_log.info(
|
||||||
or self.initialized_pipelines[pipeline_class].pipeline_options
|
f"Initializing pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
||||||
!= pipeline_options
|
)
|
||||||
):
|
self.initialized_pipelines[cache_key] = pipeline_class(
|
||||||
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
|
||||||
pipeline_options=pipeline_options
|
pipeline_options=pipeline_options
|
||||||
)
|
)
|
||||||
return self.initialized_pipelines[pipeline_class]
|
else:
|
||||||
|
_log.debug(
|
||||||
|
f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
|
||||||
|
)
|
||||||
|
|
||||||
|
return self.initialized_pipelines[cache_key]
|
||||||
|
|
||||||
def _process_document(
|
def _process_document(
|
||||||
self, in_doc: InputDocument, raises_on_error: bool
|
self, in_doc: InputDocument, raises_on_error: bool
|
||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
|
|
||||||
valid = (
|
valid = (
|
||||||
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
||||||
)
|
)
|
||||||
@ -345,7 +358,6 @@ class DocumentConverter:
|
|||||||
else:
|
else:
|
||||||
if raises_on_error:
|
if raises_on_error:
|
||||||
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# invalid doc or not of desired format
|
# invalid doc or not of desired format
|
||||||
conv_res = ConversionResult(
|
conv_res = ConversionResult(
|
||||||
|
Loading…
Reference in New Issue
Block a user