feat: Add timeout limit to document parsing job. DS4SD#270 (#552)
Signed-off-by: Abhishek Kumar <abhishekrocketeer@gmail.com> Testing: (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=10 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf WARNING:docling.pipeline.base_pipeline:Document processing time (24.555 seconds) exceeded the specified timeout of 10.000 seconds INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 36.29 sec. WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmpl6p08u5i/2206.01062v1.pdf failed to convert. INFO:docling.cli.main:Processed 1 docs, of which 1 failed INFO:docling.cli.main:All documents were converted in 36.29 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 58.36 sec. INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md INFO:docling.cli.main:Processed 1 docs, of which 0 failed INFO:docling.cli.main:All documents were converted in 58.56 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 59.82 sec. INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md INFO:docling.cli.main:Processed 1 docs, of which 0 failed INFO:docling.cli.main:All documents were converted in 59.88 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling Usage: docling [OPTIONS] source ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ --from [docx|pptx|html|image|pdf|asciido Specify input formats to convert │ │ c|md|xlsx] from. Defaults to all formats. │ │ [default: None] │ │ --to [md|json|html|text|doctags] Specify output formats. Defaults to │ │ Markdown. │ │ [default: None] │ │ --image-export-mode [placeholder|embedded|referenced] Image export mode for the document │ │ (only in case of JSON, Markdown or │ │ HTML). With `placeholder`, only the │ │ position of the image is marked in │ │ the output. In `embedded` mode, the │ │ image is embedded as base64 encoded │ │ string. In `referenced` mode, the │ │ image is exported in PNG format and │ │ referenced from the main exported │ │ document. │ │ [default: embedded] │ │ --ocr --no-ocr If enabled, the bitmap content will │ │ be processed using OCR. │ │ [default: ocr] │ │ --force-ocr --no-force-ocr Replace any existing text with OCR │ │ generated text over the full │ │ content. │ │ [default: no-force-ocr] │ │ --ocr-engine [easyocr|tesseract_cli|tesseract| The OCR engine to use. │ │ ocrmac|rapidocr] [default: easyocr] │ │ --ocr-lang TEXT Provide a comma-separated list of │ │ languages used by the OCR engine. │ │ Note that each OCR engine has │ │ different values for the language │ │ names. │ │ [default: None] │ │ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │ │ [default: dlparse_v2] │ │ --table-mode [fast|accurate] The mode to use in the table │ │ structure model. │ │ [default: fast] │ │ --artifacts-path PATH If provided, the location of the │ │ model artifacts. │ │ [default: None] │ │ --abort-on-error --no-abort-on-error If enabled, the bitmap content will │ │ be processed using OCR. │ │ [default: no-abort-on-error] │ │ --output PATH Output directory where results are │ │ saved. │ │ [default: .] │ │ --verbose -v INTEGER Set the verbosity level. -v for │ │ info logging, -vv for debug │ │ logging. │ │ [default: 0] │ │ --debug-visualize-cells --no-debug-visualize-cells Enable debug output which │ │ visualizes the PDF cells │ │ [default: no-debug-visualize-cells] │ │ --debug-visualize-ocr --no-debug-visualize-ocr Enable debug output which │ │ visualizes the OCR cells │ │ [default: no-debug-visualize-ocr] │ │ --debug-visualize-layout --no-debug-visualize-layout Enable debug output which │ │ visualizes the layour clusters │ │ [default: │ │ no-debug-visualize-layout] │ │ --debug-visualize-tables --no-debug-visualize-tables Enable debug output which │ │ visualizes the table cells │ │ [default: │ │ no-debug-visualize-tables] │ │ --version Show version information. │ │ --document-timeout FLOAT The timeout for processing each │ │ document, in seconds. │ │ [default: None] │ │ --help Show this message and exit. │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
This commit is contained in:
parent
aee9c0b324
commit
3da166eafa
@ -250,6 +250,13 @@ def convert(
|
||||
help="Show version information.",
|
||||
),
|
||||
] = None,
|
||||
document_timeout: Annotated[
|
||||
Optional[float],
|
||||
typer.Option(
|
||||
...,
|
||||
help="The timeout for processing each document, in seconds.",
|
||||
),
|
||||
] = None,
|
||||
):
|
||||
if verbose == 0:
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
@ -333,6 +340,7 @@ def convert(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
document_timeout=document_timeout,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
True # do_cell_matching
|
||||
|
@ -150,8 +150,9 @@ class PipelineOptions(BaseModel):
|
||||
"""Base pipeline options."""
|
||||
|
||||
create_legacy_output: bool = (
|
||||
True # This defautl will be set to False on a future version of docling
|
||||
True # This default will be set to False on a future version of docling
|
||||
)
|
||||
document_timeout: Optional[float] = None
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
|
@ -126,6 +126,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
total_elapsed_time = 0.0
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
for i in range(0, conv_res.input.page_count):
|
||||
@ -136,7 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
for page_batch in chunkify(
|
||||
conv_res.pages, settings.perf.page_batch_size
|
||||
):
|
||||
start_pb_time = time.time()
|
||||
start_batch_time = time.monotonic()
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
@ -149,8 +150,21 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
pass
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.debug(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
end_batch_time = time.monotonic()
|
||||
total_elapsed_time += end_batch_time - start_batch_time
|
||||
if (
|
||||
self.pipeline_options.document_timeout is not None
|
||||
and total_elapsed_time > self.pipeline_options.document_timeout
|
||||
):
|
||||
_log.warning(
|
||||
f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds"
|
||||
)
|
||||
conv_res.status = ConversionStatus.PARTIAL_SUCCESS
|
||||
break
|
||||
|
||||
_log.debug(
|
||||
f"Finished converting page batch time={end_batch_time:.3f}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
|
Loading…
Reference in New Issue
Block a user