Docling/docling/backend/xml/jats_backend.py
nkh0472 a097ccd8d5
chore: typo fix (#1465)
* typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

---------

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
2025-04-28 08:52:09 +02:00

708 lines
24 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import traceback
from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
NodeItem,
TextItem,
)
from lxml import etree
from typing_extensions import TypedDict, override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
# Module-level logger.
_log = logging.getLogger(__name__)
# Substrings looked up in DTD system URLs to recognize a JATS document.
JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
# Fallback headings used when a section provides no title or label of its own.
DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
DEFAULT_HEADER_REFERENCES: Final = "References"
# Text used for an <etal> element that carries no text.
DEFAULT_TEXT_ETAL: Final = "et al."
class Abstract(TypedDict):
    """Heading and body text of one abstract."""

    # heading of the abstract; empty when the XML provides no title or label
    label: str
    # texts of the abstract paragraphs, joined by single spaces
    content: str
class Author(TypedDict):
    """Name of one author together with the names of the linked affiliations."""

    # given names followed by the surname
    name: str
    # affiliation texts resolved through the author's affiliation references
    affiliation_names: list[str]
class Citation(TypedDict):
    """Fields extracted from an <element-citation> before it is flattened to text."""

    # comma-separated author names, optionally ending with the et-al text
    author_names: str
    title: str
    source: str
    year: str
    volume: str
    # electronic location id, or first page (and last page when present)
    page: str
    # comma-separated "TYPE: identifier" entries
    pub_id: str
    publisher_name: str
    publisher_loc: str
class Table(TypedDict):
    """Label, caption and serialized markup of one table."""

    label: str
    caption: str
    # the <table> element serialized as a string
    content: str
class XMLComponents(TypedDict):
    """Metadata components parsed from the document front matter."""

    title: str
    authors: list[Author]
    abstract: list[Abstract]
class JatsDocumentBackend(DeclarativeDocumentBackend):
"""Backend to parse articles in XML format tagged according to JATS definition.
The Journal Article Tag Suite (JATS) is a definition standard for the
representation of journal articles in XML format. Several publishers and journal
archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
medRxiv, or Springer Nature.
Refer to https://jats.nlm.nih.gov for more details on JATS.
The code from this document backend has been developed by modifying parts of the
PubMed Parser library (version 0.5.0, released on 12.08.2024):
Achakulvisut et al., (2020).
Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
Dataset XML Dataset.
Journal of Open Source Software, 5(46), 1979,
https://doi.org/10.21105/joss.01979
"""
@override
def __init__(
    self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) -> None:
    """Parse the XML input and check whether it declares a JATS DTD.

    The document is flagged as valid when the system URL of its DOCTYPE, or
    the system URL of any entity of its internal DTD subset, contains one of
    the keywords in `JATS_DTD_URL`.

    Args:
        in_doc: The input document this backend is attached to.
        path_or_stream: Location or in-memory content of the XML file.

    Raises:
        RuntimeError: If the input cannot be parsed or inspected.
    """
    super().__init__(in_doc, path_or_stream)
    self.path_or_stream = path_or_stream

    # Initialize the root of the document hierarchy
    self.root: Optional[NodeItem] = None
    self.valid = False

    try:
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.seek(0)
        self.tree: etree._ElementTree = etree.parse(self.path_or_stream)

        doc_info: etree.DocInfo = self.tree.docinfo
        if doc_info.system_url and any(
            kwd in doc_info.system_url for kwd in JATS_DTD_URL
        ):
            self.valid = True
            return

        # A document without DOCTYPE has no internal DTD subset at all: it is
        # simply not a JATS file, which must not be reported as an error.
        internal_dtd = doc_info.internalDTD
        if internal_dtd is None:
            return
        for ent in internal_dtd.iterentities():
            if ent.system_url and any(
                kwd in ent.system_url for kwd in JATS_DTD_URL
            ):
                self.valid = True
                return
    except Exception as exc:
        raise RuntimeError(
            f"Could not initialize JATS backend for file with hash {self.document_hash}."
        ) from exc
@override
def is_valid(self) -> bool:
    """Return the validity flag computed when the backend was initialized."""
    flag = self.valid
    return flag
@classmethod
@override
def supports_pagination(cls) -> bool:
    """Report that this backend does not support pagination."""
    paginated = False
    return paginated
@override
def unload(self):
    """Close the input when it is an in-memory stream and drop the reference."""
    source = self.path_or_stream
    if isinstance(source, BytesIO):
        source.close()
    self.path_or_stream = None
@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
    """Return the set of input formats handled by this backend (JATS XML only)."""
    formats: set[InputFormat] = {InputFormat.XML_JATS}
    return formats
@override
def convert(self) -> DoclingDocument:
    """Build a DoclingDocument from the parsed JATS tree.

    Metadata (title, authors, abstracts) is added first, followed by the
    content of the <body> and <back> elements. Errors raised while filling
    the document are logged and the partially built document is returned.

    Returns:
        The converted document, possibly incomplete if parsing failed.
    """
    # Create the empty document outside the guarded section: if this step
    # fails, the real error propagates instead of being masked by an
    # UnboundLocalError on the final `return doc`.
    origin = DocumentOrigin(
        filename=self.file.name or "file",
        mimetype="application/xml",
        binary_hash=self.document_hash,
    )
    doc = DoclingDocument(name=self.file.stem or "file", origin=origin)

    try:
        # Get metadata XML components
        xml_components: XMLComponents = self._parse_metadata()

        # Add metadata to the document
        self._add_metadata(doc, xml_components)

        # walk over the XML body
        body = self.tree.xpath("//body")
        if self.root and len(body) > 0:
            self._walk_linear(doc, self.root, body[0])

        # walk over the XML back matter
        back = self.tree.xpath("//back")
        if self.root and len(back) > 0:
            self._walk_linear(doc, self.root, back[0])
    except Exception:
        # best effort: keep whatever has been converted so far
        _log.error(traceback.format_exc())

    return doc
@staticmethod
def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:
skip_tags = ["term", "disp-formula", "inline-formula"]
text: str = (
node.text.replace("\n", " ")
if (node.tag not in skip_tags and node.text)
else ""
)
for child in list(node):
if child.tag not in skip_tags:
# TODO: apply styling according to child.tag when supported by docling-core
text += JatsDocumentBackend._get_text(child, sep)
if sep:
text = text.rstrip(sep) + sep
text += child.tail.replace("\n", " ") if child.tail else ""
return text
def _find_metadata(self) -> Optional[etree._Element]:
    """Return the first <article-meta> or, failing that, <book-part-meta> node.

    Returns:
        The metadata element, or None when the tree contains neither.
    """
    for tag_name in ("article-meta", "book-part-meta"):
        matches = self.tree.xpath(f".//{tag_name}")
        if matches:
            return matches[0]
    return None
def _parse_abstract(self) -> list[Abstract]:
    """Extract heading and text of every <abstract> element in the tree.

    Only the direct <p> children of an abstract contribute to its content.

    Returns:
        One entry per abstract, in document order.
    """
    # TODO: address cases with multiple sections
    abs_list: list[Abstract] = []

    for abs_node in self.tree.xpath(".//abstract"):
        content = " ".join(
            JatsDocumentBackend._get_text(abs_par).strip()
            for abs_par in abs_node.xpath("p")
        )

        # Go through _get_text rather than `.text`: the heading may be wrapped
        # in inline markup, in which case `.text` is None.
        label_node = abs_node.xpath("title|label")
        label = (
            JatsDocumentBackend._get_text(label_node[0]).strip()
            if label_node
            else ""
        )

        abstract: Abstract = {"label": label, "content": content}
        abs_list.append(abstract)

    return abs_list
def _parse_authors(self) -> list[Author]:
    """Extract the authors and their affiliations from the metadata node.

    Contributors without any name part are left out instead of aborting the
    whole metadata parsing.

    Returns:
        One entry per named author, in document order; an empty list when
        the document has no metadata node.
    """
    authors: list[Author] = []
    meta: Optional[etree._Element] = self._find_metadata()
    if meta is None:
        return authors

    def first_text(parent: etree._Element, path: str) -> str:
        # Text of the first node matching `path`, or "" if missing or empty.
        found = parent.xpath(path)
        return found[0].text if found and found[0].text else ""

    # Get mapping between affiliation ids and names
    affiliation_ids_names: dict[str, str] = {}
    for affiliation_node in meta.xpath(".//aff[@id]"):
        aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()])
        aff = aff.replace("\n", " ")
        label = affiliation_node.xpath("label")
        if label and label[0].text:
            # TODO: once superscript is supported, add label with formatting
            aff = aff.removeprefix(f"{label[0].text}, ")
        affiliation_ids_names[affiliation_node.get("id")] = aff

    # Get author names and affiliation names
    for author_node in meta.xpath(
        './/contrib-group/contrib[@contrib-type="author"]'
    ):
        # Name: either part may be absent (single-name or group authors)
        name_parts = (
            first_text(author_node, "name/given-names"),
            first_text(author_node, "name/surname"),
        )
        name = " ".join(part for part in name_parts if part)
        if not name:
            continue

        # Affiliation names; references without a known target are ignored
        affiliation_refs = [
            xref.get("rid") for xref in author_node.xpath('xref[@ref-type="aff"]')
        ]
        author: Author = {
            "name": name,
            "affiliation_names": [
                affiliation_ids_names[ref]
                for ref in affiliation_refs
                if ref in affiliation_ids_names
            ],
        }
        authors.append(author)

    return authors
def _parse_title(self) -> str:
    """Build the document title from the <title-group> metadata elements.

    Within a title group, the texts of the title-like children are joined
    with a space; the texts of several title groups are joined with " - ".

    Returns:
        The title text; empty when no title group is found.
    """
    meta_names: list[str] = [
        "article-meta",
        "collection-meta",
        "book-meta",
        "book-part-meta",
    ]
    title_names: list[str] = ["article-title", "subtitle", "title", "label"]
    title_groups = self.tree.xpath(
        "|".join([f".//{item}/title-group" for item in meta_names])
    )

    titles: list[str] = []
    for title_node in title_groups:
        # _get_text keeps the text inside and after inline markup (e.g.
        # <italic>), which `.text` alone would drop or expose as None.
        parts = [
            JatsDocumentBackend._get_text(elem).strip()
            for elem in title_node
            if elem.tag in title_names
        ]
        titles.append(" ".join(parts).strip())

    return " - ".join(titles)
def _parse_metadata(self) -> XMLComponents:
    """Parse title, authors and abstracts of the JATS document."""
    title = self._parse_title()
    authors = self._parse_authors()
    abstract = self._parse_abstract()
    return {"title": title, "authors": authors, "abstract": abstract}
def _add_abstract(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add every non-empty abstract to the document as heading plus text.

    Abstracts without a label get the default abstract heading.
    """
    for entry in xml_components["abstract"]:
        body: str = entry["content"]
        if not body:
            continue
        heading = doc.add_heading(
            parent=self.root, text=entry["label"] or DEFAULT_HEADER_ABSTRACT
        )
        doc.add_text(parent=heading, text=body, label=DocItemLabel.TEXT)
def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add one paragraph with the author names and one with their affiliations.

    Affiliations are listed once each, in order of first appearance. Empty
    paragraphs are not added.
    """
    # TODO: once docling supports text formatting, add affiliation reference to
    # author names through superscripts
    entries = xml_components["authors"]
    names_line = ", ".join(entry["name"] for entry in entries)

    seen_affiliations: dict = {}
    for entry in entries:
        for affiliation in entry["affiliation_names"]:
            seen_affiliations.setdefault(affiliation, None)
    affiliations_line = "; ".join(seen_affiliations)

    for line in (names_line, affiliations_line):
        if line:
            doc.add_text(
                parent=self.root,
                text=line,
                label=DocItemLabel.PARAGRAPH,
            )
def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:
    """Add a citation as list item under a list group, else as plain text."""
    under_list = isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST
    if not under_list:
        doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)
        return
    doc.add_list_item(text=text, enumerated=False, parent=parent)
def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
    """Flatten an <element-citation> node into a single reference string.

    Authors, title, source, publisher, volume, pages, year and publication
    identifiers are read from the child elements and concatenated. Missing
    or empty elements are skipped.

    Args:
        node: The <element-citation> element.

    Returns:
        The flattened citation text.
    """
    citation: Citation = {
        "author_names": "",
        "title": "",
        "source": "",
        "year": "",
        "volume": "",
        "page": "",
        "pub_id": "",
        "publisher_name": "",
        "publisher_loc": "",
    }

    def first_text(parent: etree._Element, path: str) -> str:
        # Clean text of the first node matching `path`, "" if there is none.
        # _get_text is used because the value may sit inside inline markup,
        # in which case `.text` is None.
        found = parent.xpath(path)
        return JatsDocumentBackend._get_text(found[0]).strip() if found else ""

    _log.debug("Citation parsing started")

    # Author names
    names = []
    for name_node in node.xpath(".//name"):
        name_parts = (
            first_text(name_node, "surname"),
            first_text(name_node, "given-names"),
        )
        name_str = " ".join(part for part in name_parts if part)
        if name_str:
            names.append(name_str)
    etal_node = node.xpath(".//etal")
    if len(etal_node) > 0:
        etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL
        names.append(etal_text)
    citation["author_names"] = ", ".join(names)

    titles: list[str] = [
        "article-title",
        "chapter-title",
        "data-title",
        "issue-title",
        "part-title",
        "trans-title",
    ]
    title_node: Optional[etree._Element] = None
    for name in titles:
        name_node = node.xpath(name)
        if len(name_node) > 0:
            title_node = name_node[0]
            break
    citation["title"] = (
        JatsDocumentBackend._get_text(title_node)
        if title_node is not None
        else (node.text or "").replace("\n", " ").strip()
    )

    # Journal, year, publisher name, publisher location, volume, elocation
    fields: list[str] = [
        "source",
        "year",
        "publisher-name",
        "publisher-loc",
        "volume",
    ]
    for item in fields:
        citation[item.replace("-", "_")] = first_text(node, item)  # type: ignore[literal-required]

    # Publication identifier
    pub_id: list[str] = []
    for id_node in node.xpath("pub-id"):
        id_type = id_node.get("assigning-authority") or id_node.get("pub-id-type")
        id_text = id_node.text
        if id_type and id_text:
            pub_id.append(
                id_type.replace("\n", " ").strip().upper()
                + ": "
                + id_text.replace("\n", " ").strip()
            )
    if pub_id:
        citation["pub_id"] = ", ".join(pub_id)

    # Pages
    if len(node.xpath("elocation-id")) > 0:
        citation["page"] = first_text(node, "elocation-id")
    elif len(node.xpath("fpage")) > 0:
        citation["page"] = first_text(node, "fpage")
        last_page = first_text(node, "lpage")
        if last_page:
            # separate first and last page with an en dash
            citation["page"] += "\u2013" + last_page

    # Flatten the citation to string
    text = ""
    if citation["author_names"]:
        text += citation["author_names"].rstrip(".") + ". "
    if citation["title"]:
        text += citation["title"] + ". "
    if citation["source"]:
        text += citation["source"] + ". "
    if citation["publisher_name"]:
        if citation["publisher_loc"]:
            text += f"{citation['publisher_loc']}: "
        text += citation["publisher_name"] + ". "
    if citation["volume"]:
        text = text.rstrip(". ")
        text += f" {citation['volume']}. "
    if citation["page"]:
        text = text.rstrip(". ")
        if citation["volume"]:
            text += ":"
        text += citation["page"] + ". "
    if citation["year"]:
        text = text.rstrip(". ")
        text += f" ({citation['year']})."
    if citation["pub_id"]:
        text = text.rstrip(".") + ". "
        text += citation["pub_id"]

    _log.debug("Citation flattened")

    return text
def _add_equation(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
math_text = node.text
math_parts = math_text.split("$$")
if len(math_parts) == 3:
math_formula = math_parts[1]
doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)
return
def _add_figure_captions(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
    """Add a picture item for a figure node, with label and caption as caption.

    Caption children that contain supplementary material are left out. When
    the figure has neither label nor caption, the picture is added without
    caption.

    Args:
        doc: The document to add the picture to.
        parent: The item the picture is attached to.
        node: The figure element.
    """
    label_node = node.xpath("label")
    label: str = (
        JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else ""
    )

    # Default to an empty string: a None caption would be rendered as the
    # literal text "None" by the f-string below.
    caption: str = ""
    caption_node = node.xpath("caption")
    if len(caption_node) > 0:
        caption = " ".join(
            JatsDocumentBackend._get_text(caption_par).strip()
            for caption_par in caption_node[0]
            if not caption_par.xpath(".//supplementary-material")
        ).strip()

    # TODO: format label vs caption once styling is supported
    fig_text: str = f"{label}{' ' if label and caption else ''}{caption}"
    fig_caption: Optional[TextItem] = (
        doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)
        if fig_text
        else None
    )

    doc.add_picture(parent=parent, caption=fig_caption)
# TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
# def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None:
# new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent)
# for child in node.iterchildren(tag="fn"):
# text = JatsDocumentBackend._get_text(child)
# doc.add_list_item(text=text, parent=new_parent)
def _add_metadata(
    self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
    """Add title, authors and abstracts to the document, in that order."""
    for add_component in (self._add_title, self._add_authors, self._add_abstract):
        add_component(doc, xml_components)
def _add_table(
    self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table
) -> None:
    """Parse the table markup and add the table with its caption to the document.

    Nothing is added when the markup contains no <table> tag.
    """
    markup = BeautifulSoup(table_xml_component["content"], "html.parser")
    table_tag = markup.find("table")
    if not isinstance(table_tag, Tag):
        return

    data = HTMLDocumentBackend.parse_table_data(table_tag)

    # TODO: format label vs caption once styling is supported
    label = table_xml_component["label"]
    caption = table_xml_component["caption"]
    separator = " " if label and caption else ""
    table_text: str = f"{label}{separator}{caption}"

    table_caption: Optional[TextItem] = None
    if table_text:
        table_caption = doc.add_text(label=DocItemLabel.CAPTION, text=table_text)

    if data is None:
        return
    doc.add_table(data=data, parent=parent, caption=table_caption)
def _add_tables(
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> None:
    """Collect label, caption and markup of a table wrapper and add the table.

    The table markup is taken from a direct <table> child or, failing that,
    from <alternatives>/<table>. Caption children that contain supplementary
    material are left out. Tables that cannot be converted are skipped with
    a warning.

    Args:
        doc: The document to add the table to.
        parent: The item the table is attached to.
        node: The table wrapper element.
    """
    table: Table = {"label": "", "caption": "", "content": ""}

    # Content
    content_nodes = node.xpath("table") or node.xpath("alternatives/table")
    if content_nodes:
        table["content"] = etree.tostring(content_nodes[0]).decode("utf-8")

    # Caption
    caption_node = node.xpath("caption")
    if caption_node:
        table["caption"] = " ".join(
            JatsDocumentBackend._get_text(caption_par).strip()
            for caption_par in caption_node[0]
            if not caption_par.xpath(".//supplementary-material")
        ).strip()

    # Label: read through _get_text so that a label wrapped in inline markup
    # (whose `.text` is None) does not end up as the text "None".
    label_node = node.xpath("label")
    if label_node:
        table["label"] = JatsDocumentBackend._get_text(label_node[0]).strip()

    try:
        self._add_table(doc, parent, table)
    except Exception:
        _log.warning(f"Skipping unsupported table in {self.file!s}")
def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:
    """Add the title as a top-level item and keep it as the hierarchy root."""
    title_item = doc.add_text(
        parent=None,
        text=xml_components["title"],
        label=DocItemLabel.TITLE,
    )
    self.root = title_item
def _walk_linear(  # noqa: C901
    self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
    """Walk an XML subtree in document order and add its content to the document.

    Sections, lists, figures, tables, references and formulas are turned
    into document items; the text of a <p> node is added as a text item.

    Args:
        doc: The document being built.
        parent: The item new content is attached to.
        node: The XML element to walk.

    Returns:
        The text collected from the node that has not been added to the
        document yet, to be picked up by the caller; empty for a <p> node
        whose text was added.
    """
    skip_tags = ["term"]
    flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
    new_parent: NodeItem = parent
    node_text: str = (
        node.text.replace("\n", " ")
        if (node.tag not in skip_tags and node.text)
        else ""
    )

    for child in list(node):
        # Comments and processing instructions expose a factory function
        # instead of a string as their tag: their content is not part of the
        # document, only the tail text following them is.
        if not isinstance(child.tag, str):
            node_text += child.tail.replace("\n", " ") if child.tail else ""
            continue

        stop_walk: bool = False

        # flush text into TextItem for some tags in paragraph nodes
        if node.tag == "p" and node_text.strip() and child.tag in flush_tags:
            doc.add_text(
                label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent
            )
            node_text = ""

        # add elements and decide whether to stop walking
        if child.tag in ("sec", "ack"):
            header = child.xpath("title|label")
            text: Optional[str] = None
            if len(header) > 0:
                text = JatsDocumentBackend._get_text(header[0])
            elif child.tag == "ack":
                text = DEFAULT_HEADER_ACKNOWLEDGMENTS
            if text:
                new_parent = doc.add_heading(text=text, parent=parent)
        elif child.tag == "list":
            new_parent = doc.add_group(
                label=GroupLabel.LIST, name="list", parent=parent
            )
        elif child.tag == "list-item":
            # TODO: address any type of content (another list, formula,...)
            # TODO: address list type and item label
            text = JatsDocumentBackend._get_text(child).strip()
            new_parent = doc.add_list_item(text=text, parent=parent)
            stop_walk = True
        elif child.tag == "fig":
            self._add_figure_captions(doc, parent, child)
            stop_walk = True
        elif child.tag == "table-wrap":
            self._add_tables(doc, parent, child)
            stop_walk = True
        elif child.tag == "supplementary-material":
            stop_walk = True
        elif child.tag == "fn-group":
            # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported
            stop_walk = True
        elif child.tag == "ref-list" and node.tag != "ref-list":
            header = child.xpath("title|label")
            text = (
                JatsDocumentBackend._get_text(header[0])
                if len(header) > 0
                else DEFAULT_HEADER_REFERENCES
            )
            new_parent = doc.add_heading(text=text, parent=parent)
            new_parent = doc.add_group(
                parent=new_parent, label=GroupLabel.LIST, name="list"
            )
        elif child.tag == "element-citation":
            text = self._parse_element_citation(child)
            self._add_citation(doc, parent, text)
            stop_walk = True
        elif child.tag == "mixed-citation":
            text = JatsDocumentBackend._get_text(child).strip()
            self._add_citation(doc, parent, text)
            stop_walk = True
        elif child.tag == "tex-math":
            self._add_equation(doc, parent, child)
            stop_walk = True
        elif child.tag == "inline-formula":
            # TODO: address inline formulas when supported by docling-core
            stop_walk = True

        # step into child
        if not stop_walk:
            new_text = self._walk_linear(doc, new_parent, child)
            # text coming from a flushed block nested in a paragraph is not
            # appended; a node without parent cannot be such a block
            container = node.getparent()
            in_flushed_block = (
                container is not None
                and container.tag == "p"
                and node.tag in flush_tags
            )
            if not in_flushed_block:
                node_text += new_text

        # pick up the tail text
        node_text += child.tail.replace("\n", " ") if child.tail else ""

    # create paragraph
    if node.tag == "p" and node_text.strip():
        doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
        return ""
    else:
        # backpropagate the text
        return node_text