From b886e4df312447d39f58cf6e3c45b0f863940321 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 16 Jun 2025 10:38:46 +0200 Subject: [PATCH] fix(asciidoc): set default size when missing in image directive (#1769) The AsciiDoc backend should not create an ImageRef with Size equal to None, instead use default size values. Refactor static methods as such and add the staticmethod decorator. Extend the regression test for this fix. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/asciidoc_backend.py | 57 +++++++++++++------ tests/data/asciidoc/test_03.asciidoc | 29 ++++++++++ .../docling_v2/test_03.asciidoc.md | 23 ++++++++ tests/test_backend_asciidoc.py | 24 +++++++- 4 files changed, 114 insertions(+), 19 deletions(-) create mode 100644 tests/data/asciidoc/test_03.asciidoc create mode 100644 tests/data/groundtruth/docling_v2/test_03.asciidoc.md diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 5328e8d..859646e 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -2,7 +2,7 @@ import logging import re from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Final, Set, Union from docling_core.types.doc import ( DocItemLabel, @@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +DEFAULT_IMAGE_WIDTH: Final = 128 +DEFAULT_IMAGE_HEIGHT: Final = 128 + class AsciiDocBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]): @@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend): item = self._parse_picture(line) - size = None + size: Size if "width" in item and "height" in item: size = Size(width=int(item["width"]), height=int(item["height"])) + else: + size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT) uri = None if ( @@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return doc - def _get_current_level(self, parents): + @staticmethod + def _get_current_level(parents): for k, v in parents.items(): if v is None and k > 0: return k - 1 return 0 - def _get_current_parent(self, parents): + @staticmethod + def _get_current_parent(parents): for k, v in parents.items(): if v is None and k > 0: return parents[k - 1] @@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return None # ========= Title - def _is_title(self, line): + @staticmethod + def _is_title(line): return re.match(r"^= ", line) - def _parse_title(self, line): + @staticmethod + def _parse_title(line): return {"type": "title", "text": line[2:].strip(), "level": 0} # ========= Section headers - def _is_section_header(self, line): + @staticmethod + def _is_section_header(line): return re.match(r"^==+\s+", line) - def _parse_section_header(self, line): + @staticmethod + def _parse_section_header(line): match = re.match(r"^(=+)\s+(.*)", line) marker = match.group(1) # The list marker (e.g., "*", "-", "1.") @@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend): } # ========= Lists - def _is_list_item(self, line): + @staticmethod + def _is_list_item(line): return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line) - def _parse_list_item(self, line): + @staticmethod + def _parse_list_item(line): """Extract the item marker (number or bullet symbol) and the text of the item.""" match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line) @@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend): } # ========= Tables - def _is_table_line(self, line): + @staticmethod + def _is_table_line(line): return re.match(r"^\|.*\|", line) - def _parse_table_line(self, line): + @staticmethod + def _parse_table_line(line): # Split table cells and trim extra spaces return [cell.strip() for cell in line.split("|") if cell.strip()] - def _populate_table_as_grid(self, table_data): + @staticmethod + def _populate_table_as_grid(table_data): num_rows = len(table_data) # Adjust the table data into a grid format @@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return data # ========= Pictures - def _is_picture(self, line): + @staticmethod + def _is_picture(line): return re.match(r"^image::", line) - def _parse_picture(self, line): + @staticmethod + def _parse_picture(line): """ Parse an image macro, extracting its path and attributes. Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center] @@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return {"type": "picture", "uri": line} # ========= Captions - def _is_caption(self, line): + @staticmethod + def _is_caption(line): return re.match(r"^\.(.+)", line) - def _parse_caption(self, line): + @staticmethod + def _parse_caption(line): mtch = re.match(r"^\.(.+)", line) if mtch: text = mtch.group(1) @@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return {"type": "caption", "text": ""} # ========= Plain text - def _parse_text(self, line): + @staticmethod + def _parse_text(line): return {"type": "text", "text": line.strip()} diff --git a/tests/data/asciidoc/test_03.asciidoc b/tests/data/asciidoc/test_03.asciidoc new file mode 100644 index 0000000..7ca6157 --- /dev/null +++ b/tests/data/asciidoc/test_03.asciidoc @@ -0,0 +1,29 @@ +:_mod-docs-content-type: PROCEDURE +:experimental: + +[id="renaming-a-bookmark_{context}"] += Renaming a bookmark + +You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them. + +Renaming the bookmark does not rename the folder. + +.Procedure + +. Right-click the bookmark in the side bar. + +. Select *Rename…*. ++ +image::rename-bookmark-menu.png[Rename bookmark menu] + +. In the *Name* field, enter the new name for the bookmark. ++ +image::rename-bookmark-text.png[Bookmark name field] + +. Click btn:[Rename]. + +.Verification + +* Check that the side bar lists the bookmark under the new name. ++ +image::renamed-bookmark.png[Renamed bookmark] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test_03.asciidoc.md b/tests/data/groundtruth/docling_v2/test_03.asciidoc.md new file mode 100644 index 0000000..394405c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/test_03.asciidoc.md @@ -0,0 +1,23 @@ +:\_mod-docs-content-type: PROCEDURE :experimental: + +# Renaming a bookmark + +[id="renaming-a-bookmark\_{context}"] + +You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them. + +Renaming the bookmark does not rename the folder. + +- Check that the side bar lists the bookmark under the new name. + +Procedure . Right-click the bookmark in the side bar. . Select *Rename…*. + + + + + In the *Name* field, enter the new name for the bookmark. + + + + + Click btn:[Rename]. .Verification + + \ No newline at end of file diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index fc047ba..4decdff 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -2,7 +2,11 @@ import glob import os from pathlib import Path -from docling.backend.asciidoc_backend import AsciiDocBackend +from docling.backend.asciidoc_backend import ( + DEFAULT_IMAGE_HEIGHT, + DEFAULT_IMAGE_WIDTH, + AsciiDocBackend, +) from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -18,6 +22,24 @@ def _get_backend(fname): return doc_backend +def test_parse_picture(): + line = ( + "image::images/example1.png[Example Image, width=200, height=150, align=center]" + ) + res = AsciiDocBackend._parse_picture(line) + assert res + assert res.get("width", 0) == "200" + assert res.get("height", 0) == "150" + assert res.get("uri", "") == "images/example1.png" + + line = "image::renamed-bookmark.png[Renamed bookmark]" + res = AsciiDocBackend._parse_picture(line) + assert res + assert "width" not in res + assert "height" not in res + assert res.get("uri", "") == "renamed-bookmark.png" + + def test_asciidocs_examples(): fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))