fix: set page number using 1-based indexing (#22)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-07-31 14:28:44 +02:00 committed by GitHub
parent e102827753
commit d2d9543415
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 11 additions and 11 deletions

View File

@ -56,7 +56,7 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
### Convert a batch of documents ### Convert a batch of documents
For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py). For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
From a local repo clone, you can run it with: From a local repo clone, you can run it with:

View File

@ -125,7 +125,7 @@ class ConvertedDocument(BaseModel):
desc = DsDocumentDescription(logs=[]) desc = DsDocumentDescription(logs=[])
page_hashes = [ page_hashes = [
PageReference(hash=p.page_hash, page=p.page_no, model="default") PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
for p in self.pages for p in self.pages
] ]
@ -159,7 +159,7 @@ class ConvertedDocument(BaseModel):
prov=[ prov=[
Prov( Prov(
bbox=target_bbox, bbox=target_bbox,
page=element.page_no, page=element.page_no + 1,
span=[0, len(element.text)], span=[0, len(element.text)],
) )
], ],
@ -242,7 +242,7 @@ class ConvertedDocument(BaseModel):
prov=[ prov=[
Prov( Prov(
bbox=target_bbox, bbox=target_bbox,
page=element.page_no, page=element.page_no + 1,
span=[0, 0], span=[0, 0],
) )
], ],
@ -264,7 +264,7 @@ class ConvertedDocument(BaseModel):
prov=[ prov=[
Prov( Prov(
bbox=target_bbox, bbox=target_bbox,
page=element.page_no, page=element.page_no + 1,
span=[0, 0], span=[0, 0],
) )
], ],
@ -274,7 +274,7 @@ class ConvertedDocument(BaseModel):
) )
page_dimensions = [ page_dimensions = [
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width) PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in self.pages for p in self.pages
] ]

8
poetry.lock generated
View File

@ -715,13 +715,13 @@ files = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "1.1.0" version = "1.1.2"
description = "A python library to define and validate data types in Docling." description = "A python library to define and validate data types in Docling."
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_core-1.1.0-py3-none-any.whl", hash = "sha256:80096ec6bbce9e616700ccd6bdd5a50e5d1a9a832d7968da3874d54b29962536"}, {file = "docling_core-1.1.2-py3-none-any.whl", hash = "sha256:bdff5643e3e37a24204449eee99505db0f1cf620b8e1ce4cf4b71850bf49496b"},
{file = "docling_core-1.1.0.tar.gz", hash = "sha256:69bc83d3b192d9e56bb91d77d8434d9fc109f8cb25ab5a285d2f3bccc10899cb"}, {file = "docling_core-1.1.2.tar.gz", hash = "sha256:969cde6795631a5f5f8cbb5e7ca0e4032864c1abc8fff762415a09a9b1f7146c"},
] ]
[package.dependencies] [package.dependencies]
@ -4882,4 +4882,4 @@ ocr = ["easyocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00" content-hash = "a6685d5cf1b283d805e10193a437662a1807f99dad40b56ab1e58e1b708fc184"

View File

@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = "^1.1.0" docling-core = "^1.1.2"
docling-ibm-models = "^1.1.0" docling-ibm-models = "^1.1.0"
deepsearch-glm = ">=0.19.0,<1" deepsearch-glm = ">=0.19.0,<1"
filetype = "^1.2.0" filetype = "^1.2.0"