fix(html): handle address, details, and summary tags (#1436)
* fix(html): handle 'address' tag

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* fix(html): handle 'details' tag

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
c2470ed216
commit
ed20124544
@ -26,6 +26,8 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
# tags that generate NodeItem elements
|
# tags that generate NodeItem elements
|
||||||
TAGS_FOR_NODE_ITEMS: Final = [
|
TAGS_FOR_NODE_ITEMS: Final = [
|
||||||
|
"address",
|
||||||
|
"details",
|
||||||
"h1",
|
"h1",
|
||||||
"h2",
|
"h2",
|
||||||
"h3",
|
"h3",
|
||||||
@ -38,6 +40,7 @@ TAGS_FOR_NODE_ITEMS: Final = [
|
|||||||
"ul",
|
"ul",
|
||||||
"ol",
|
"ol",
|
||||||
"li",
|
"li",
|
||||||
|
"summary",
|
||||||
"table",
|
"table",
|
||||||
"figure",
|
"figure",
|
||||||
"img",
|
"img",
|
||||||
@ -163,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
def analyze_tag(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||||
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
if tag.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
||||||
self.handle_header(tag, doc)
|
self.handle_header(tag, doc)
|
||||||
elif tag.name in ["p"]:
|
elif tag.name in ["p", "address", "summary"]:
|
||||||
self.handle_paragraph(tag, doc)
|
self.handle_paragraph(tag, doc)
|
||||||
elif tag.name in ["pre", "code"]:
|
elif tag.name in ["pre", "code"]:
|
||||||
self.handle_code(tag, doc)
|
self.handle_code(tag, doc)
|
||||||
@ -177,6 +180,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.handle_figure(tag, doc)
|
self.handle_figure(tag, doc)
|
||||||
elif tag.name == "img":
|
elif tag.name == "img":
|
||||||
self.handle_image(tag, doc)
|
self.handle_image(tag, doc)
|
||||||
|
elif tag.name == "details":
|
||||||
|
self.handle_details(tag, doc)
|
||||||
else:
|
else:
|
||||||
self.walk(tag, doc)
|
self.walk(tag, doc)
|
||||||
|
|
||||||
@ -201,6 +206,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
return ["".join(result) + " "]
|
return ["".join(result) + " "]
|
||||||
|
|
||||||
|
def handle_details(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
|
"""Handle details tag (details) and its content."""
|
||||||
|
|
||||||
|
self.parents[self.level + 1] = doc.add_group(
|
||||||
|
name="details",
|
||||||
|
label=GroupLabel.SECTION,
|
||||||
|
parent=self.parents[self.level],
|
||||||
|
content_layer=self.content_layer,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.level += 1
|
||||||
|
self.walk(element, doc)
|
||||||
|
self.parents[self.level + 1] = None
|
||||||
|
self.level -= 1
|
||||||
|
|
||||||
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_header(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
"""Handles header tags (h1, h2, etc.)."""
|
"""Handles header tags (h1, h2, etc.)."""
|
||||||
hlevel = int(element.name.replace("h", ""))
|
hlevel = int(element.name.replace("h", ""))
|
||||||
@ -258,7 +278,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
|
||||||
"""Handles paragraph tags (p)."""
|
"""Handles paragraph tags (p) or equivalent ones."""
|
||||||
if element.text is None:
|
if element.text is None:
|
||||||
return
|
return
|
||||||
text = element.text.strip()
|
text = element.text.strip()
|
||||||
|
@ -4,4 +4,7 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-3 at level 1: text: This is a regular paragraph.
|
item-3 at level 1: text: This is a regular paragraph.
|
||||||
item-4 at level 1: text: This is a third div
|
item-4 at level 1: text: This is a third div
|
||||||
with a new line.
|
with a new line.
|
||||||
item-5 at level 1: text: This is a fourth div with a bold paragraph.
|
item-5 at level 1: section: group details
|
||||||
|
item-6 at level 2: text: Heading for the details element
|
||||||
|
item-7 at level 2: text: Description of the details element.
|
||||||
|
item-8 at level 1: text: This is a fourth div with a bold paragraph.
|
@ -4,7 +4,7 @@
|
|||||||
"name": "example_06",
|
"name": "example_06",
|
||||||
"origin": {
|
"origin": {
|
||||||
"mimetype": "text/html",
|
"mimetype": "text/html",
|
||||||
"binary_hash": 14574683870626799530,
|
"binary_hash": 10224930410364781672,
|
||||||
"filename": "example_06.html"
|
"filename": "example_06.html"
|
||||||
},
|
},
|
||||||
"furniture": {
|
"furniture": {
|
||||||
@ -30,14 +30,35 @@
|
|||||||
"$ref": "#/texts/3"
|
"$ref": "#/texts/3"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/4"
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/6"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"name": "_root_",
|
"name": "_root_",
|
||||||
"label": "unspecified"
|
"label": "unspecified"
|
||||||
},
|
},
|
||||||
"groups": [],
|
"groups": [
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/0",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/4"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/5"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "details",
|
||||||
|
"label": "section"
|
||||||
|
}
|
||||||
|
],
|
||||||
"texts": [
|
"texts": [
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/0",
|
"self_ref": "#/texts/0",
|
||||||
@ -89,6 +110,30 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/4",
|
"self_ref": "#/texts/4",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Heading for the details element",
|
||||||
|
"text": "Heading for the details element"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/0"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"label": "text",
|
||||||
|
"prov": [],
|
||||||
|
"orig": "Description of the details element.",
|
||||||
|
"text": "Description of the details element."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/texts/6",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
|
@ -7,4 +7,8 @@ This is a regular paragraph.
|
|||||||
This is a third div
|
This is a third div
|
||||||
with a new line.
|
with a new line.
|
||||||
|
|
||||||
|
Heading for the details element
|
||||||
|
|
||||||
|
Description of the details element.
|
||||||
|
|
||||||
This is a fourth div with a bold paragraph.
|
This is a fourth div with a bold paragraph.
|
@ -7,6 +7,10 @@
|
|||||||
<div>This is another div with text.</div>
|
<div>This is another div with text.</div>
|
||||||
<p>This is a regular paragraph.</p>
|
<p>This is a regular paragraph.</p>
|
||||||
<div>This is a third div<br/>with a new line.</div>
|
<div>This is a third div<br/>with a new line.</div>
|
||||||
|
<details>
|
||||||
|
<summary>Heading for the details element</summary>
|
||||||
|
<p>Description of the details element.</p>
|
||||||
|
</details>
|
||||||
<div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
|
<div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
Loading…
Reference in New Issue
Block a user