fix: enable locks for threadsafe pdfium (#1052)

* enable locks for threadsafe pdfium Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix deadlock in pypdfium2 backend Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-03-02 20:06:44 +01:00
parent e25d557c06
commit 8dc0562542
3 changed files with 98 additions and 71 deletions
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell, Size
+from docling.utils.locks import pypdfium2_lock

 if TYPE_CHECKING:
    from docling.datamodel.document import InputDocument
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
            padbox.r = page_size.width - padbox.r
            padbox.t = page_size.height - padbox.t

-        image = (
-            self._ppage.render(
-                scale=scale * 1.5,
-                rotation=0,  # no additional rotation
-                crop=padbox.as_tuple(),
-            )
-            .to_pil()
-            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-        )  # We resize the image from 1.5x the given scale to make it sharper.
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.

        return image

    def get_size(self) -> Size:
-        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

    def unload(self):
        self._ppage = None
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

-        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
-        self.parser = pdf_parser_v2("fatal")
+        with pypdfium2_lock:
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+            self.parser = pdf_parser_v2("fatal")

-        success = False
-        if isinstance(self.path_or_stream, BytesIO):
-            success = self.parser.load_document_from_bytesio(
-                self.document_hash, self.path_or_stream
-            )
-        elif isinstance(self.path_or_stream, Path):
-            success = self.parser.load_document(
-                self.document_hash, str(self.path_or_stream)
-            )
+            success = False
+            if isinstance(self.path_or_stream, BytesIO):
+                success = self.parser.load_document_from_bytesio(
+                    self.document_hash, self.path_or_stream
+                )
+            elif isinstance(self.path_or_stream, Path):
+                success = self.parser.load_document(
+                    self.document_hash, str(self.path_or_stream)
+                )

-        if not success:
-            raise RuntimeError(
-                f"docling-parse v2 could not load document {self.document_hash}."
-            )
+            if not success:
+                raise RuntimeError(
+                    f"docling-parse v2 could not load document {self.document_hash}."
+                )

    def page_count(self) -> int:
        # return len(self._pdoc)  # To be replaced with docling-parse API
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
        return len_2

    def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
-        return DoclingParseV2PageBackend(
-            self.parser, self.document_hash, page_no, self._pdoc[page_no]
-        )
+        with pypdfium2_lock:
+            return DoclingParseV2PageBackend(
+                self.parser, self.document_hash, page_no, self._pdoc[page_no]
+            )

    def is_valid(self) -> bool:
        return self.page_count() > 0
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
    def unload(self):
        super().unload()
        self.parser.unload_document(self.document_hash)
-        self._pdoc.close()
-        self._pdoc = None
+        with pypdfium2_lock:
+            self._pdoc.close()
+            self._pdoc = None