fix: align output formats (#49)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-08-26 13:30:26 +02:00 committed by GitHub
parent 053eae4bdf
commit 8cc147bc56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 5 additions and 8 deletions

View File

@@ -88,7 +88,7 @@ class DocumentConverter:
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self.process_document, input_batch)
def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConvertedDocument:
"""Convert a single document.
Args:
@@ -133,11 +133,10 @@ class DocumentConverter:
converted_doc: ConvertedDocument = next(converted_docs_iter)
if converted_doc.status not in {
ConversionStatus.SUCCESS,
ConversionStatus.SUCCESS_WITH_ERRORS,
ConversionStatus.PARTIAL_SUCCESS,
}:
raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
doc = converted_doc.to_ds_document()
return doc
return converted_doc
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
start_doc_time = time.time()

View File

@@ -1,8 +1,6 @@
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(
doc.export_to_markdown()
) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
print(doc.render_as_markdown()) # output: "## Docling Technical Report [...]"