fix: set page number using 1-based indexing (#22)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-07-31 14:28:44 +02:00 committed by GitHub
parent e102827753
commit d2d9543415
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 11 additions and 11 deletions

View File

@@ -56,7 +56,7 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
 From a local repo clone, you can run it with:

View File

@@ -125,7 +125,7 @@ class ConvertedDocument(BaseModel):
 desc = DsDocumentDescription(logs=[])
 page_hashes = [
-PageReference(hash=p.page_hash, page=p.page_no, model="default")
+PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
 for p in self.pages
 ]
@@ -159,7 +159,7 @@ class ConvertedDocument(BaseModel):
 prov=[
 Prov(
 bbox=target_bbox,
-page=element.page_no,
+page=element.page_no + 1,
 span=[0, len(element.text)],
 )
 ],
@@ -242,7 +242,7 @@ class ConvertedDocument(BaseModel):
 prov=[
 Prov(
 bbox=target_bbox,
-page=element.page_no,
+page=element.page_no + 1,
 span=[0, 0],
 )
 ],
@@ -264,7 +264,7 @@ class ConvertedDocument(BaseModel):
 prov=[
 Prov(
 bbox=target_bbox,
-page=element.page_no,
+page=element.page_no + 1,
 span=[0, 0],
 )
 ],
@@ -274,7 +274,7 @@ class ConvertedDocument(BaseModel):
 )
 page_dimensions = [
-PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
 for p in self.pages
 ]

8
poetry.lock generated
View File

@@ -715,13 +715,13 @@ files = [
 [[package]]
 name = "docling-core"
-version = "1.1.0"
+version = "1.1.2"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-{file = "docling_core-1.1.0-py3-none-any.whl", hash = "sha256:80096ec6bbce9e616700ccd6bdd5a50e5d1a9a832d7968da3874d54b29962536"},
-{file = "docling_core-1.1.0.tar.gz", hash = "sha256:69bc83d3b192d9e56bb91d77d8434d9fc109f8cb25ab5a285d2f3bccc10899cb"},
+{file = "docling_core-1.1.2-py3-none-any.whl", hash = "sha256:bdff5643e3e37a24204449eee99505db0f1cf620b8e1ce4cf4b71850bf49496b"},
+{file = "docling_core-1.1.2.tar.gz", hash = "sha256:969cde6795631a5f5f8cbb5e7ca0e4032864c1abc8fff762415a09a9b1f7146c"},
 ]
 [package.dependencies]
@@ -4882,4 +4882,4 @@ ocr = ["easyocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"
+content-hash = "a6685d5cf1b283d805e10193a437662a1807f99dad40b56ab1e58e1b708fc184"

View File

@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.1.0"
+docling-core = "^1.1.2"
 docling-ibm-models = "^1.1.0"
 deepsearch-glm = ">=0.19.0,<1"
 filetype = "^1.2.0"