diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 00ef05b..f232069 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -256,10 +256,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[self.level], name="list", label=GroupLabel.LIST ) elif element.name == "ol": + start_attr = element.get("start") + start: int = ( + int(start_attr) + if isinstance(start_attr, str) and start_attr.isnumeric() + else 1 + ) # create a list group self.parents[self.level + 1] = doc.add_group( parent=self.parents[self.level], - name="ordered list", + name="ordered list" + (f" start {start}" if start != 1 else ""), label=GroupLabel.ORDERED_LIST, ) self.level += 1 @@ -270,15 +276,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level -= 1 def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None: - """Handles listitem tags (li).""" + """Handles list item tags (li).""" nested_list = element.find(["ul", "ol"]) parent = self.parents[self.level] if parent is None: - _log.warning(f"list-item has no parent in DoclingDocument: {element}") + _log.debug(f"list-item has no parent in DoclingDocument: {element}") return parent_label: str = parent.label index_in_list = len(parent.children) + 1 + if ( + parent_label == GroupLabel.ORDERED_LIST + and isinstance(parent, GroupItem) + and parent.name + ): + start_in_list: str = parent.name.split(" ")[-1] + start: int = int(start_in_list) if start_in_list.isnumeric() else 1 + index_in_list += start - 1 if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -324,13 +338,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): parent=parent, ) else: - _log.warning(f"list-item has no text: {element}") + _log.debug(f"list-item has no text: {element}") @staticmethod def parse_table_data(element: Tag) -> Optional[TableData]: nested_tables = element.find("table") if nested_tables is not None: - _log.warning("Skipping 
nested table.") + _log.debug("Skipping nested table.") return None # Count the number of rows (number of <tr> elements) diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 6c1db06..a04ae21 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -1,4 +1,4 @@ -import os +from io import BytesIO from pathlib import Path from docling.backend.html_backend import HTMLDocumentBackend @@ -41,6 +41,62 @@ def test_heading_levels(): assert found_lvl_2 and found_lvl_3 +def test_ordered_lists(): + test_set: list[tuple[bytes, str]] = [] + + test_set.append( + ( + b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>", + "1. 1st item\n2. 2nd item", + ) + ) + test_set.append( + ( + b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>', + "1. 1st item\n2. 2nd item", + ) + ) + test_set.append( + ( + b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>', + "2. 1st item\n3. 2nd item", + ) + ) + test_set.append( + ( + b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>', + "0. 1st item\n1. 2nd item", + ) + ) + test_set.append( + ( + b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>', + "1. 1st item\n2. 2nd item", + ) + ) + test_set.append( + ( + b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>', + "1. 1st item\n2. 2nd item", + ) + ) + + for pair in test_set: + in_doc = InputDocument( + path_or_stream=BytesIO(pair[0]), + format=InputFormat.HTML, + backend=HTMLDocumentBackend, + filename="test", + ) + backend = HTMLDocumentBackend( + in_doc=in_doc, + path_or_stream=BytesIO(pair[0]), + ) + doc: DoclingDocument = backend.convert() + assert doc + assert doc.export_to_markdown() == pair[1] + + def get_html_paths(): # Define the directory you want to search