feat: create a backend to parse USPTO patents into DoclingDocument (#606)

* feat: add PATENT_USPTO as input format

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* feat: add USPTO backend parser

Add a backend implementation to parse patent applications and
grants from the United States Patent and Trademark Office (USPTO).

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* refactor: change the name of the USPTO input format

Change the name of the patent USPTO input format to show the typical format (XML).

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* refactor: address several input formats with same mime type

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* refactor: group XML backend parsers in a subfolder

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* chore: add safe initialization of PatentUsptoDocumentBackend

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2024-12-17 16:35:23 +01:00
committed by GitHub
parent 3e599c7bbe
commit 4e087504cc
32 changed files with 54419 additions and 18 deletions

View File

@@ -3,7 +3,17 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
from typing import (
TYPE_CHECKING,
Dict,
Iterable,
List,
Literal,
Optional,
Set,
Type,
Union,
)
import filetype
from docling_core.types.doc import (
@@ -235,7 +245,7 @@ class _DocumentConversionInput(BaseModel):
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj,
format=format,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
@@ -243,7 +253,7 @@ class _DocumentConversionInput(BaseModel):
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
format=format,
format=format, # type: ignore[arg-type]
filename=obj.name,
limits=self.limits,
backend=backend,
@@ -251,15 +261,15 @@ class _DocumentConversionInput(BaseModel):
else:
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
def _guess_format(self, obj: Union[Path, DocumentStream]):
def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputFormat]:
content = b"" # empty binary blob
format = None
formats: list[InputFormat] = []
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
ext = obj.suffix[1:]
mime = self._mime_from_extension(ext)
mime = _DocumentConversionInput._mime_from_extension(ext)
if mime is None: # must guess from
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
@@ -274,15 +284,53 @@ class _DocumentConversionInput(BaseModel):
if ("." in obj.name and not obj.name.startswith("."))
else ""
)
mime = self._mime_from_extension(ext)
mime = _DocumentConversionInput._mime_from_extension(ext)
mime = mime or self._detect_html_xhtml(content)
mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
# TODO: remove application/xml case after adding another XML parser
if len(formats) == 1 and mime not in ("text/plain", "application/xml"):
return formats[0]
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
content, mime, formats
)
else:
return None
format = MimeTypeToFormat.get(mime)
return format
@staticmethod
def _guess_from_content(
content: bytes, mime: str, formats: list[InputFormat]
) -> Optional[InputFormat]:
"""Guess the input format of a document by checking part of its content."""
input_format: Optional[InputFormat] = None
content_str = content.decode("utf-8")
def _mime_from_extension(self, ext):
if mime == "application/xml":
match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
if match_doctype:
xml_doctype = match_doctype.group()
if InputFormat.XML_USPTO in formats and any(
item in xml_doctype
for item in (
"us-patent-application-v4",
"us-patent-grant-v4",
"us-grant-025",
"patent-application-publication",
)
):
input_format = InputFormat.XML_USPTO
elif mime == "text/plain":
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO
return input_format
@staticmethod
def _mime_from_extension(ext):
mime = None
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
@@ -293,7 +341,19 @@ class _DocumentConversionInput(BaseModel):
return mime
def _detect_html_xhtml(self, content):
@staticmethod
def _detect_html_xhtml(
content: bytes,
) -> Optional[Literal["application/xhtml+xml", "application/xml", "text/html"]]:
"""Guess the mime type of an XHTML, HTML, or XML file from its content.
Args:
content: A short piece of a document from its beginning.
Returns:
The mime type of an XHTML, HTML, or XML file, or None if the content does
not match any of these formats.
"""
content_str = content.decode("ascii", errors="ignore").lower()
# Remove XML comments
content_str = re.sub(r"<!--(.*?)-->", "", content_str, flags=re.DOTALL)
@@ -302,6 +362,8 @@ class _DocumentConversionInput(BaseModel):
if re.match(r"<\?xml", content_str):
if "xhtml" in content_str[:1000]:
return "application/xhtml+xml"
else:
return "application/xml"
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
return "text/html"