fix(markdown): add support for HTML content (#855)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2025-02-03 12:21:05 +01:00 committed by GitHub
parent 6a76b49a47
commit 94751a78f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 142 additions and 5 deletions

View File

@ -24,11 +24,16 @@ from docling_core.types.doc import (
from marko import Markdown
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
@ -67,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0
try:
if isinstance(self.path_or_stream, BytesIO):
@ -295,16 +301,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer.append("")
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self.process_inline_text(parent_element, doc)
self.close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.children) > 0
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
html_block = element.body.strip()
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_element, text=text_to_add)
else:
if not isinstance(element, str):
self.close_table(doc)
@ -360,6 +368,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
# export to HTML
html_backend_cls = HTMLDocumentBackend
html_str = doc.export_to_html()
def _restore_original_html(txt, regex):
    """Remove every match of ``regex`` from ``txt`` and return the result.

    Raises RuntimeError unless the number of removed matches equals the
    number of HTML blocks counted in ``self._html_blocks``.
    """
    stripped, n_removed = re.subn(regex, "", txt)
    # Exactly one match per recorded HTML block is expected.
    if n_removed == self._html_blocks:
        return stripped
    raise RuntimeError(
        "An internal error has occurred during Markdown conversion."
    )
# restore original HTML by removing previously added markers
for regex in [
rf"<pre>\s*<code>\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
]:
html_str = _restore_original_html(txt=html_str, regex=regex)
self._html_blocks = 0
# delegate to HTML backend
stream = BytesIO(bytes(html_str, encoding="utf-8"))
in_doc = InputDocument(
path_or_stream=stream,
format=InputFormat.HTML,
backend=html_backend_cls,
filename=self.file.name,
)
html_backend_obj = html_backend_cls(
in_doc=in_doc, path_or_stream=stream
)
doc = html_backend_obj.convert()
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."

View File

@ -0,0 +1,25 @@
# Title
Some text
## Famous ducks
Here is a table:
| Character | Name in German | Name in French | Name in Italian |
|----------------|------------------|------------------|-------------------|
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
| Huey | Tick | Riri | Qui |
| Dewey | Trick | Fifi | Quo |
| Louie | Track | Loulou | Qua |
And here is more HTML:
Some paragraph.
Now a div — almost there...
- foo
- bar
The end!

View File

@ -53,6 +53,20 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h2>Test with tables</h2>

54
tests/data/md/mixed.md Normal file
View File

@ -0,0 +1,54 @@
# Title
Some text
## Famous ducks
Here is a table:
<table>
<tr>
<th>Character</th>
<th>Name in German</th>
<th>Name in French</th>
<th>Name in Italian</th>
</tr>
<tr>
<td>Scrooge McDuck</td>
<td>Dagobert Duck</td>
<td>Balthazar Picsou</td>
<td>Paperone</td>
</tr>
<tr>
<td>Huey</td>
<td>Tick</td>
<td>Riri</td>
<td>Qui</td>
</tr>
<tr>
<td>Dewey</td>
<td>Trick</td>
<td>Fifi</td>
<td>Quo</td>
</tr>
<tr>
<td>Louie</td>
<td>Track</td>
<td>Loulou</td>
<td>Qua</td>
</tr>
</table>
And here is more HTML:
<p>Some paragraph.</p>
<div>
<p>Now a div — almost there...</p>
<ul>
<li>foo</li>
<li>bar</li>
</ul>
</div>
The end!