fix(markdown): add support for HTML content (#855)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
6a76b49a47
commit
94751a78f4
@ -24,11 +24,16 @@ from docling_core.types.doc import (
|
|||||||
from marko import Markdown
|
from marko import Markdown
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
|
||||||
|
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
||||||
|
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
||||||
|
|
||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||||
@ -67,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.in_table = False
|
self.in_table = False
|
||||||
self.md_table_buffer: list[str] = []
|
self.md_table_buffer: list[str] = []
|
||||||
self.inline_texts: list[str] = []
|
self.inline_texts: list[str] = []
|
||||||
|
self._html_blocks: int = 0
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
@ -295,16 +301,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.md_table_buffer.append("")
|
self.md_table_buffer.append("")
|
||||||
|
|
||||||
elif isinstance(element, marko.block.HTMLBlock):
|
elif isinstance(element, marko.block.HTMLBlock):
|
||||||
|
self._html_blocks += 1
|
||||||
self.process_inline_text(parent_element, doc)
|
self.process_inline_text(parent_element, doc)
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
_log.debug("HTML Block: {}".format(element))
|
_log.debug("HTML Block: {}".format(element))
|
||||||
if (
|
if (
|
||||||
len(element.children) > 0
|
len(element.body) > 0
|
||||||
): # If Marko doesn't return any content for HTML block, skip it
|
): # If Marko doesn't return any content for HTML block, skip it
|
||||||
snippet_text = str(element.children).strip()
|
html_block = element.body.strip()
|
||||||
doc.add_text(
|
|
||||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
# wrap in markers to enable post-processing in convert()
|
||||||
)
|
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
||||||
|
doc.add_code(parent=parent_element, text=text_to_add)
|
||||||
else:
|
else:
|
||||||
if not isinstance(element, str):
|
if not isinstance(element, str):
|
||||||
self.close_table(doc)
|
self.close_table(doc)
|
||||||
@ -360,6 +368,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Start iterating from the root of the AST
|
# Start iterating from the root of the AST
|
||||||
self.iterate_elements(parsed_ast, 0, doc, None)
|
self.iterate_elements(parsed_ast, 0, doc, None)
|
||||||
self.process_inline_text(None, doc) # handle last hanging inline text
|
self.process_inline_text(None, doc) # handle last hanging inline text
|
||||||
|
|
||||||
|
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||||
|
if self._html_blocks > 0:
|
||||||
|
|
||||||
|
# export to HTML
|
||||||
|
html_backend_cls = HTMLDocumentBackend
|
||||||
|
html_str = doc.export_to_html()
|
||||||
|
|
||||||
|
def _restore_original_html(txt, regex):
|
||||||
|
_txt, count = re.subn(regex, "", txt)
|
||||||
|
if count != self._html_blocks:
|
||||||
|
raise RuntimeError(
|
||||||
|
"An internal error has occurred during Markdown conversion."
|
||||||
|
)
|
||||||
|
return _txt
|
||||||
|
|
||||||
|
# restore original HTML by removing previously added markers
|
||||||
|
for regex in [
|
||||||
|
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
||||||
|
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
||||||
|
]:
|
||||||
|
html_str = _restore_original_html(txt=html_str, regex=regex)
|
||||||
|
self._html_blocks = 0
|
||||||
|
|
||||||
|
# delegate to HTML backend
|
||||||
|
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
||||||
|
in_doc = InputDocument(
|
||||||
|
path_or_stream=stream,
|
||||||
|
format=InputFormat.HTML,
|
||||||
|
backend=html_backend_cls,
|
||||||
|
filename=self.file.name,
|
||||||
|
)
|
||||||
|
html_backend_obj = html_backend_cls(
|
||||||
|
in_doc=in_doc, path_or_stream=stream
|
||||||
|
)
|
||||||
|
doc = html_backend_obj.convert()
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
||||||
|
25
tests/data/groundtruth/docling_v2/mixed.md.md
Normal file
25
tests/data/groundtruth/docling_v2/mixed.md.md
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
# Title
|
||||||
|
|
||||||
|
Some text
|
||||||
|
|
||||||
|
## Famous ducks
|
||||||
|
|
||||||
|
Here is a table:
|
||||||
|
|
||||||
|
| Character | Name in German | Name in French | Name in Italian |
|
||||||
|
|----------------|------------------|------------------|-------------------|
|
||||||
|
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
|
||||||
|
| Huey | Tick | Riri | Qui |
|
||||||
|
| Dewey | Trick | Fifi | Quo |
|
||||||
|
| Louie | Track | Loulou | Qua |
|
||||||
|
|
||||||
|
And here is more HTML:
|
||||||
|
|
||||||
|
Some paragraph.
|
||||||
|
|
||||||
|
Now a div — almost there...
|
||||||
|
|
||||||
|
- foo
|
||||||
|
- bar
|
||||||
|
|
||||||
|
The end!
|
@ -53,6 +53,20 @@
|
|||||||
table tr:nth-child(even) td{
|
table tr:nth-child(even) td{
|
||||||
background-color: LightGray;
|
background-color: LightGray;
|
||||||
}
|
}
|
||||||
|
math annotation {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
.formula-not-decoded {
|
||||||
|
background: repeating-linear-gradient(
|
||||||
|
45deg, /* Angle of the stripes */
|
||||||
|
LightGray, /* First color */
|
||||||
|
LightGray 10px, /* Length of the first color */
|
||||||
|
White 10px, /* Second color */
|
||||||
|
White 20px /* Length of the second color */
|
||||||
|
);
|
||||||
|
margin: 0;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
</head>
|
</head>
|
||||||
<h2>Test with tables</h2>
|
<h2>Test with tables</h2>
|
||||||
|
54
tests/data/md/mixed.md
Normal file
54
tests/data/md/mixed.md
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
# Title
|
||||||
|
|
||||||
|
Some text
|
||||||
|
|
||||||
|
## Famous ducks
|
||||||
|
|
||||||
|
Here is a table:
|
||||||
|
|
||||||
|
<table>
|
||||||
|
<tr>
|
||||||
|
<th>Character</th>
|
||||||
|
<th>Name in German</th>
|
||||||
|
<th>Name in French</th>
|
||||||
|
<th>Name in Italian</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Scrooge McDuck</td>
|
||||||
|
<td>Dagobert Duck</td>
|
||||||
|
<td>Balthazar Picsou</td>
|
||||||
|
<td>Paperone</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Huey</td>
|
||||||
|
<td>Tick</td>
|
||||||
|
<td>Riri</td>
|
||||||
|
<td>Qui</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Dewey</td>
|
||||||
|
<td>Trick</td>
|
||||||
|
<td>Fifi</td>
|
||||||
|
<td>Quo</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Louie</td>
|
||||||
|
<td>Track</td>
|
||||||
|
<td>Loulou</td>
|
||||||
|
<td>Qua</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
And here is more HTML:
|
||||||
|
|
||||||
|
<p>Some paragraph.</p>
|
||||||
|
|
||||||
|
<div>
|
||||||
|
<p>Now a div — almost there...</p>
|
||||||
|
<ul>
|
||||||
|
<li>foo</li>
|
||||||
|
<li>bar</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
The end!
|
Loading…
Reference in New Issue
Block a user