fix: set page number using 1-based indexing (#22)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2024-07-31 14:28:44 +02:00
parent e102827753
commit d2d9543415
4 changed files with 11 additions and 11 deletions
@@ -56,7 +56,7 @@ print(doc.export_to_markdown())  # output: "## DocLayNet: A Large Human-Annotate
 ### Convert a batch of documents
-For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
 From a local repo clone, you can run it with:
@@ -125,7 +125,7 @@ class ConvertedDocument(BaseModel):
        desc = DsDocumentDescription(logs=[])
        page_hashes = [
-            PageReference(hash=p.page_hash, page=p.page_no, model="default")
+            PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
            for p in self.pages
        ]
@@ -159,7 +159,7 @@ class ConvertedDocument(BaseModel):
                        prov=[
                            Prov(
                                bbox=target_bbox,
-                                page=element.page_no,
+                                page=element.page_no + 1,
                                span=[0, len(element.text)],
                            )
                        ],
@@ -242,7 +242,7 @@ class ConvertedDocument(BaseModel):
                        prov=[
                            Prov(
                                bbox=target_bbox,
-                                page=element.page_no,
+                                page=element.page_no + 1,
                                span=[0, 0],
                            )
                        ],
@@ -264,7 +264,7 @@ class ConvertedDocument(BaseModel):
                        prov=[
                            Prov(
                                bbox=target_bbox,
-                                page=element.page_no,
+                                page=element.page_no + 1,
                                span=[0, 0],
                            )
                        ],
@@ -274,7 +274,7 @@ class ConvertedDocument(BaseModel):
                )
        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
+            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
            for p in self.pages
        ]
@@ -715,13 +715,13 @@ files = [
 [[package]]
 name = "docling-core"
-version = "1.1.0"
+version = "1.1.2"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-1.1.0-py3-none-any.whl", hash = "sha256:80096ec6bbce9e616700ccd6bdd5a50e5d1a9a832d7968da3874d54b29962536"},
+    {file = "docling_core-1.1.2-py3-none-any.whl", hash = "sha256:bdff5643e3e37a24204449eee99505db0f1cf620b8e1ce4cf4b71850bf49496b"},
-    {file = "docling_core-1.1.0.tar.gz", hash = "sha256:69bc83d3b192d9e56bb91d77d8434d9fc109f8cb25ab5a285d2f3bccc10899cb"},
+    {file = "docling_core-1.1.2.tar.gz", hash = "sha256:969cde6795631a5f5f8cbb5e7ca0e4032864c1abc8fff762415a09a9b1f7146c"},
 ]
 [package.dependencies]
@@ -4882,4 +4882,4 @@ ocr = ["easyocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"
+content-hash = "a6685d5cf1b283d805e10193a437662a1807f99dad40b56ab1e58e1b708fc184"
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.1.0"
+docling-core = "^1.1.2"
 docling-ibm-models = "^1.1.0"
 deepsearch-glm = ">=0.19.0,<1"
 filetype = "^1.2.0"