fix(markdown): add support for HTML content (#855)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
6a76b49a47
commit
94751a78f4
@ -24,11 +24,16 @@ from docling_core.types.doc import (
|
||||
from marko import Markdown
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
|
||||
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
|
||||
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
||||
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||
@ -67,6 +72,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.in_table = False
|
||||
self.md_table_buffer: list[str] = []
|
||||
self.inline_texts: list[str] = []
|
||||
self._html_blocks: int = 0
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
@ -295,16 +301,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.md_table_buffer.append("")
|
||||
|
||||
elif isinstance(element, marko.block.HTMLBlock):
|
||||
self._html_blocks += 1
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self.close_table(doc)
|
||||
_log.debug("HTML Block: {}".format(element))
|
||||
if (
|
||||
len(element.children) > 0
|
||||
len(element.body) > 0
|
||||
): # If Marko doesn't return any content for HTML block, skip it
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
html_block = element.body.strip()
|
||||
|
||||
# wrap in markers to enable post-processing in convert()
|
||||
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
||||
doc.add_code(parent=parent_element, text=text_to_add)
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self.close_table(doc)
|
||||
@ -360,6 +368,42 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# Start iterating from the root of the AST
|
||||
self.iterate_elements(parsed_ast, 0, doc, None)
|
||||
self.process_inline_text(None, doc) # handle last hanging inline text
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
if self._html_blocks > 0:
|
||||
|
||||
# export to HTML
|
||||
html_backend_cls = HTMLDocumentBackend
|
||||
html_str = doc.export_to_html()
|
||||
|
||||
def _restore_original_html(txt, regex):
|
||||
_txt, count = re.subn(regex, "", txt)
|
||||
if count != self._html_blocks:
|
||||
raise RuntimeError(
|
||||
"An internal error has occurred during Markdown conversion."
|
||||
)
|
||||
return _txt
|
||||
|
||||
# restore original HTML by removing previously added markers
|
||||
for regex in [
|
||||
rf"<pre>\s*<code>\s*{_START_MARKER}",
|
||||
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
|
||||
]:
|
||||
html_str = _restore_original_html(txt=html_str, regex=regex)
|
||||
self._html_blocks = 0
|
||||
|
||||
# delegate to HTML backend
|
||||
stream = BytesIO(bytes(html_str, encoding="utf-8"))
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=stream,
|
||||
format=InputFormat.HTML,
|
||||
backend=html_backend_cls,
|
||||
filename=self.file.name,
|
||||
)
|
||||
html_backend_obj = html_backend_cls(
|
||||
in_doc=in_doc, path_or_stream=stream
|
||||
)
|
||||
doc = html_backend_obj.convert()
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
||||
|
25
tests/data/groundtruth/docling_v2/mixed.md.md
Normal file
25
tests/data/groundtruth/docling_v2/mixed.md.md
Normal file
@ -0,0 +1,25 @@
|
||||
# Title
|
||||
|
||||
Some text
|
||||
|
||||
## Famous ducks
|
||||
|
||||
Here is a table:
|
||||
|
||||
| Character | Name in German | Name in French | Name in Italian |
|
||||
|----------------|------------------|------------------|-------------------|
|
||||
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
|
||||
| Huey | Tick | Riri | Qui |
|
||||
| Dewey | Trick | Fifi | Quo |
|
||||
| Louie | Track | Loulou | Qua |
|
||||
|
||||
And here is more HTML:
|
||||
|
||||
Some paragraph.
|
||||
|
||||
Now a div — almost there...
|
||||
|
||||
- foo
|
||||
- bar
|
||||
|
||||
The end!
|
@ -53,6 +53,20 @@
|
||||
table tr:nth-child(even) td{
|
||||
background-color: LightGray;
|
||||
}
|
||||
math annotation {
|
||||
display: none;
|
||||
}
|
||||
.formula-not-decoded {
|
||||
background: repeating-linear-gradient(
|
||||
45deg, /* Angle of the stripes */
|
||||
LightGray, /* First color */
|
||||
LightGray 10px, /* Length of the first color */
|
||||
White 10px, /* Second color */
|
||||
White 20px /* Length of the second color */
|
||||
);
|
||||
margin: 0;
|
||||
text-align: center;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<h2>Test with tables</h2>
|
||||
|
54
tests/data/md/mixed.md
Normal file
54
tests/data/md/mixed.md
Normal file
@ -0,0 +1,54 @@
|
||||
# Title
|
||||
|
||||
Some text
|
||||
|
||||
## Famous ducks
|
||||
|
||||
Here is a table:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Character</th>
|
||||
<th>Name in German</th>
|
||||
<th>Name in French</th>
|
||||
<th>Name in Italian</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Scrooge McDuck</td>
|
||||
<td>Dagobert Duck</td>
|
||||
<td>Balthazar Picsou</td>
|
||||
<td>Paperone</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Huey</td>
|
||||
<td>Tick</td>
|
||||
<td>Riri</td>
|
||||
<td>Qui</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Dewey</td>
|
||||
<td>Trick</td>
|
||||
<td>Fifi</td>
|
||||
<td>Quo</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Louie</td>
|
||||
<td>Track</td>
|
||||
<td>Loulou</td>
|
||||
<td>Qua</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
And here is more HTML:
|
||||
|
||||
<p>Some paragraph.</p>
|
||||
|
||||
<div>
|
||||
<p>Now a div — almost there...</p>
|
||||
<ul>
|
||||
<li>foo</li>
|
||||
<li>bar</li>
|
||||
</ul>
|
||||
</div>
|
||||
|
||||
The end!
|
Loading…
Reference in New Issue
Block a user