feat: add textbox content extraction in msword_backend (#1538)
* feat: add textbox content extraction in msword_backend Signed-off-by: Andrew <tsai247365@gmail.com> * feat: add textbox content extraction in msword_backend Signed-off-by: Andrew <tsai247365@gmail.com> * feat: add textbox content extraction in msword_backend Signed-off-by: Andrew <tsai247365@gmail.com> --------- Signed-off-by: Andrew <tsai247365@gmail.com>
This commit is contained in:
parent
7c4c356e76
commit
12a0e64892
@ -2,7 +2,7 @@ import logging
|
|||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional, Union
|
from typing import Any, List, Optional, Union
|
||||||
|
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItemLabel,
|
DocItemLabel,
|
||||||
@ -24,7 +24,6 @@ from docx.text.hyperlink import Hyperlink
|
|||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
from docx.text.run import Run
|
from docx.text.run import Run
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml.etree import XPath
|
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
from pydantic import AnyUrl
|
from pydantic import AnyUrl
|
||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
@ -59,6 +58,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.parents: dict[int, Optional[NodeItem]] = {}
|
self.parents: dict[int, Optional[NodeItem]] = {}
|
||||||
self.numbered_headers: dict[int, int] = {}
|
self.numbered_headers: dict[int, int] = {}
|
||||||
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
self.equation_bookends: str = "<eq>{EQ}</eq>"
|
||||||
|
# Track processed textbox elements to avoid duplication
|
||||||
|
self.processed_textbox_elements: List[int] = []
|
||||||
|
# Track content hash of processed paragraphs to avoid duplicate content
|
||||||
|
self.processed_paragraph_content: List[str] = []
|
||||||
|
|
||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
@ -175,10 +179,74 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||||
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
||||||
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
||||||
|
"wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
||||||
|
"mc": "http://schemas.openxmlformats.org/markup-compatibility/2006",
|
||||||
|
"v": "urn:schemas-microsoft-com:vml",
|
||||||
|
"wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
|
||||||
|
"w10": "urn:schemas-microsoft-com:office:word",
|
||||||
|
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
|
||||||
}
|
}
|
||||||
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
|
xpath_expr = etree.XPath(".//a:blip", namespaces=namespaces)
|
||||||
drawing_blip = xpath_expr(element)
|
drawing_blip = xpath_expr(element)
|
||||||
|
|
||||||
|
# Check for textbox content - check multiple textbox formats
|
||||||
|
# Only process if the element hasn't been processed before
|
||||||
|
element_id = id(element)
|
||||||
|
if element_id not in self.processed_textbox_elements:
|
||||||
|
# Modern Word textboxes
|
||||||
|
txbx_xpath = etree.XPath(
|
||||||
|
".//w:txbxContent|.//v:textbox//w:p", namespaces=namespaces
|
||||||
|
)
|
||||||
|
textbox_elements = txbx_xpath(element)
|
||||||
|
|
||||||
|
# No modern textboxes found, check for alternate/legacy textbox formats
|
||||||
|
if not textbox_elements and tag_name in ["drawing", "pict"]:
|
||||||
|
# Additional checks for textboxes in DrawingML and VML formats
|
||||||
|
alt_txbx_xpath = etree.XPath(
|
||||||
|
".//wps:txbx//w:p|.//w10:wrap//w:p|.//a:p//a:t",
|
||||||
|
namespaces=namespaces,
|
||||||
|
)
|
||||||
|
textbox_elements = alt_txbx_xpath(element)
|
||||||
|
|
||||||
|
# Check for shape text that's not in a standard textbox
|
||||||
|
if not textbox_elements:
|
||||||
|
shape_text_xpath = etree.XPath(
|
||||||
|
".//a:bodyPr/ancestor::*//a:t|.//a:txBody//a:t",
|
||||||
|
namespaces=namespaces,
|
||||||
|
)
|
||||||
|
shape_text_elements = shape_text_xpath(element)
|
||||||
|
if shape_text_elements:
|
||||||
|
# Create custom text elements from shape text
|
||||||
|
text_content = " ".join(
|
||||||
|
[t.text for t in shape_text_elements if t.text]
|
||||||
|
)
|
||||||
|
if text_content.strip():
|
||||||
|
_log.debug(f"Found shape text: {text_content[:50]}...")
|
||||||
|
# Create a paragraph-like element to process with standard handler
|
||||||
|
level = self._get_level()
|
||||||
|
shape_group = doc.add_group(
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
parent=self.parents[level - 1],
|
||||||
|
name="shape-text",
|
||||||
|
)
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.PARAGRAPH,
|
||||||
|
parent=shape_group,
|
||||||
|
text=text_content,
|
||||||
|
)
|
||||||
|
|
||||||
|
if textbox_elements:
|
||||||
|
# Mark the parent element as processed
|
||||||
|
self.processed_textbox_elements.append(element_id)
|
||||||
|
# Also mark all found textbox elements as processed
|
||||||
|
for tb_element in textbox_elements:
|
||||||
|
self.processed_textbox_elements.append(id(tb_element))
|
||||||
|
|
||||||
|
_log.debug(
|
||||||
|
f"Found textbox content with {len(textbox_elements)} elements"
|
||||||
|
)
|
||||||
|
self._handle_textbox_content(textbox_elements, docx_obj, doc)
|
||||||
|
|
||||||
# Check for Tables
|
# Check for Tables
|
||||||
if element.tag.endswith("tbl"):
|
if element.tag.endswith("tbl"):
|
||||||
try:
|
try:
|
||||||
@ -291,15 +359,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
def _get_format_from_run(cls, run: Run) -> Optional[Formatting]:
|
||||||
has_any_formatting = run.bold or run.italic or run.underline
|
# The .bold and .italic properties are booleans, but .underline can be an enum
|
||||||
return (
|
# like WD_UNDERLINE.THICK (value 6), so we need to convert it to a boolean
|
||||||
Formatting(
|
has_bold = run.bold or False
|
||||||
bold=run.bold or False,
|
has_italic = run.italic or False
|
||||||
italic=run.italic or False,
|
# Convert any non-None underline value to True
|
||||||
underline=run.underline or False,
|
has_underline = bool(run.underline is not None and run.underline)
|
||||||
)
|
|
||||||
if has_any_formatting
|
return Formatting(
|
||||||
else None
|
bold=has_bold,
|
||||||
|
italic=has_italic,
|
||||||
|
underline=has_underline,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _get_paragraph_elements(self, paragraph: Paragraph):
|
def _get_paragraph_elements(self, paragraph: Paragraph):
|
||||||
@ -355,6 +425,182 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return paragraph_elements
|
return paragraph_elements
|
||||||
|
|
||||||
|
def _get_paragraph_position(self, paragraph_element):
|
||||||
|
"""Extract vertical position information from paragraph element."""
|
||||||
|
# First try to directly get the index from w:p element that has an order-related attribute
|
||||||
|
if (
|
||||||
|
hasattr(paragraph_element, "getparent")
|
||||||
|
and paragraph_element.getparent() is not None
|
||||||
|
):
|
||||||
|
parent = paragraph_element.getparent()
|
||||||
|
# Get all paragraph siblings
|
||||||
|
paragraphs = [
|
||||||
|
p for p in parent.getchildren() if etree.QName(p).localname == "p"
|
||||||
|
]
|
||||||
|
# Find index of current paragraph within its siblings
|
||||||
|
try:
|
||||||
|
paragraph_index = paragraphs.index(paragraph_element)
|
||||||
|
return paragraph_index # Use index as position for consistent ordering
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Look for position hints in element attributes and ancestor elements
|
||||||
|
for elem in (*[paragraph_element], *paragraph_element.iterancestors()):
|
||||||
|
# Check for direct position attributes
|
||||||
|
for attr_name in ["y", "top", "positionY", "y-position", "position"]:
|
||||||
|
value = elem.get(attr_name)
|
||||||
|
if value:
|
||||||
|
try:
|
||||||
|
# Remove any non-numeric characters (like 'pt', 'px', etc.)
|
||||||
|
clean_value = re.sub(r"[^0-9.]", "", value)
|
||||||
|
if clean_value:
|
||||||
|
return float(clean_value)
|
||||||
|
except (ValueError, TypeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check for position in transform attribute
|
||||||
|
transform = elem.get("transform")
|
||||||
|
if transform:
|
||||||
|
# Extract translation component from transform matrix
|
||||||
|
match = re.search(r"translate\([^,]+,\s*([0-9.]+)", transform)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
return float(match.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Check for anchors or relative position indicators in Word format
|
||||||
|
# 'dist' attributes can indicate relative positioning
|
||||||
|
for attr_name in ["distT", "distB", "anchor", "relativeFrom"]:
|
||||||
|
if elem.get(attr_name) is not None:
|
||||||
|
return elem.sourceline # Use the XML source line number as fallback
|
||||||
|
|
||||||
|
# For VML shapes, look for specific attributes
|
||||||
|
for ns_uri in paragraph_element.nsmap.values():
|
||||||
|
if "vml" in ns_uri:
|
||||||
|
# Try to extract position from style attribute
|
||||||
|
style = paragraph_element.get("style")
|
||||||
|
if style:
|
||||||
|
match = re.search(r"top:([0-9.]+)pt", style)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
return float(match.group(1))
|
||||||
|
except ValueError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# If no better position indicator found, use XML source line number as proxy for order
|
||||||
|
return (
|
||||||
|
paragraph_element.sourceline
|
||||||
|
if hasattr(paragraph_element, "sourceline")
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
|
||||||
|
def _collect_textbox_paragraphs(self, textbox_elements):
    """Group the paragraphs found in textbox elements by their container.

    Each entry of ``textbox_elements`` is handled according to its local tag
    name:

    * ``p`` - the paragraph is filed under the ``id()`` of its nearest
      ancestor whose tag contains ``"textbox"``, ``"shape"`` or ``"txbx"``
      (or under ``None`` when there is no such ancestor).
    * anything else (including ``txbxContent``) - every ``w:p`` descendant
      is filed under the ``id()`` of the element itself.

    Object identity is tracked with ``id()`` so the same element object is
    never collected twice.

    Args:
        textbox_elements: Iterable of lxml-style elements.

    Returns:
        A dict mapping a container id (``int`` or ``None``) to a list of
        ``(paragraph_element, position)`` tuples, where ``position`` comes
        from ``self._get_paragraph_position``. Keys appear in the order in
        which their containers were first encountered.
    """
    # A set gives O(1) membership checks; a list here made the whole
    # collection quadratic in the number of paragraphs.
    seen_ids = set()
    container_paragraphs = {}

    for element in textbox_elements:
        element_id = id(element)
        if element_id in seen_ids:
            # This exact element object was already handled.
            continue
        seen_ids.add(element_id)

        if etree.QName(element).localname == "p":
            # Paragraph found directly (VML textboxes): locate the enclosing
            # textbox/shape element, if any.
            container_id = next(
                (
                    id(ancestor)
                    for ancestor in element.iterancestors()
                    if any(
                        marker in ancestor.tag
                        for marker in ("textbox", "shape", "txbx")
                    )
                ),
                None,
            )
            container_paragraphs.setdefault(container_id, []).append(
                (element, self._get_paragraph_position(element))
            )
            continue

        # txbxContent (Word DrawingML textboxes) and any other element are
        # treated the same way: the element is the container and all of its
        # w:p descendants are collected. The bucket is created even when no
        # paragraph is found, so the container keeps its place in the order.
        bucket = container_paragraphs.setdefault(element_id, [])
        for paragraph in element.findall(".//w:p", namespaces=element.nsmap):
            paragraph_id = id(paragraph)
            if paragraph_id in seen_ids:
                continue
            seen_ids.add(paragraph_id)
            bucket.append(
                (paragraph, self._get_paragraph_position(paragraph))
            )

    return container_paragraphs
|
||||||
|
|
||||||
|
def _handle_textbox_content(
    self,
    textbox_elements: list,
    docx_obj: DocxDocument,
    doc: DoclingDocument,
) -> None:
    """Add the text found in textbox elements to the document.

    A ``"textbox"`` section group is created under the parent of the current
    level and temporarily installed as ``self.parents[level]`` while the
    paragraphs are processed. Paragraphs are handled container by container;
    within a container they are ordered by the position reported by
    ``_collect_textbox_paragraphs``, with unknown (``None``) positions last.

    Args:
        textbox_elements: Elements that contain (or are) textbox paragraphs.
        docx_obj: The python-docx document the elements belong to.
        doc: The document being built; the group and text items are added
            to it.
    """
    level = self._get_level()
    # Group that holds all text extracted from the textbox.
    textbox_group = doc.add_group(
        label=GroupLabel.SECTION, parent=self.parents[level - 1], name="textbox"
    )

    # Make the group the current parent so the textbox content is nested
    # under it; the previous parent is restored below.
    original_parent = self.parents[level]
    self.parents[level] = textbox_group
    try:
        container_paragraphs = self._collect_textbox_paragraphs(textbox_elements)

        # Sort inside each container, keeping the containers themselves in
        # the order they were collected.
        all_paragraphs = []
        for paragraphs in container_paragraphs.values():
            all_paragraphs.extend(
                sorted(
                    paragraphs,
                    key=lambda entry: (
                        entry[1] is None,
                        entry[1] if entry[1] is not None else float("inf"),
                    ),
                )
            )

        for paragraph, _ in all_paragraphs:
            self._handle_text_elements(
                paragraph, docx_obj, doc, is_from_textbox=True
            )
    finally:
        # Restore the parent even if a paragraph handler raised; otherwise
        # every later item at this level would be attached to the textbox.
        self.parents[level] = original_parent
|
||||||
|
|
||||||
def _handle_equations_in_text(self, element, text):
|
def _handle_equations_in_text(self, element, text):
|
||||||
only_texts = []
|
only_texts = []
|
||||||
only_equations = []
|
only_equations = []
|
||||||
@ -423,10 +669,21 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
element: BaseOxmlElement,
|
element: BaseOxmlElement,
|
||||||
docx_obj: DocxDocument,
|
docx_obj: DocxDocument,
|
||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
|
is_from_textbox: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
paragraph = Paragraph(element, docx_obj)
|
paragraph = Paragraph(element, docx_obj)
|
||||||
|
|
||||||
|
# Skip if from a textbox and this exact paragraph content was already processed
|
||||||
|
# (the duplicate check compares a key built from the text length and its first 50 characters, not the full text)
|
||||||
raw_text = paragraph.text
|
raw_text = paragraph.text
|
||||||
|
if is_from_textbox and raw_text:
|
||||||
|
# Create a simple hash of content to detect duplicates
|
||||||
|
content_hash = f"{len(raw_text)}:{raw_text[:50]}"
|
||||||
|
if content_hash in self.processed_paragraph_content:
|
||||||
|
_log.debug(f"Skipping duplicate paragraph content: {content_hash}")
|
||||||
|
return
|
||||||
|
self.processed_paragraph_content.append(content_hash)
|
||||||
|
|
||||||
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
text, equations = self._handle_equations_in_text(element=element, text=raw_text)
|
||||||
|
|
||||||
if text is None:
|
if text is None:
|
||||||
|
BIN
tests/data/docx/textbox.docx
Normal file
BIN
tests/data/docx/textbox.docx
Normal file
Binary file not shown.
@ -16,6 +16,27 @@ from .verify_utils import verify_document, verify_export
|
|||||||
GENERATE = GEN_TEST_DATA
|
GENERATE = GEN_TEST_DATA
|
||||||
|
|
||||||
|
|
||||||
|
def test_textbox_extraction():
    """Check that text placed inside a DOCX textbox shows up after conversion."""
    in_path = Path("tests/data/docx/textbox.docx")
    in_doc = InputDocument(
        path_or_stream=in_path,
        format=InputFormat.DOCX,
        backend=MsWordDocumentBackend,
    )
    backend = MsWordDocumentBackend(
        in_doc=in_doc,
        path_or_stream=in_path,
    )
    doc = backend.convert()

    # The sample textbox starts with this heading. Items without a ``text``
    # attribute are skipped instead of aborting the scan with AttributeError
    # (presumably tables/pictures can be yielded here - confirm against
    # docling-core).
    expected_prefix = "Suggested Reportable Symptoms:"
    textbox_found = any(
        (getattr(item, "text", None) or "").startswith(expected_prefix)
        for item, _ in doc.iterate_items()
    )
    assert textbox_found, f"no item starts with {expected_prefix!r}"
|
||||||
|
|
||||||
|
|
||||||
def test_heading_levels():
|
def test_heading_levels():
|
||||||
in_path = Path("tests/data/docx/word_sample.docx")
|
in_path = Path("tests/data/docx/word_sample.docx")
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
|
Loading…
Reference in New Issue
Block a user