From a097ccd8d5c6480ecba6bed16a36bf9a155f4fe3 Mon Sep 17 00:00:00 2001
From: nkh0472 <67589323+nkh0472@users.noreply.github.com>
Date: Mon, 28 Apr 2025 14:52:09 +0800
Subject: [PATCH] chore: typo fix (#1465)

* typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

---------

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
---
 docling/backend/md_backend.py            | 2 +-
 docling/backend/msword_backend.py        | 2 +-
 docling/backend/xml/jats_backend.py      | 2 +-
 docling/backend/xml/uspto_backend.py     | 8 ++++----
 docling/cli/main.py                      | 2 +-
 docling/models/table_structure_model.py  | 2 +-
 docling/models/tesseract_ocr_model.py    | 2 +-
 docs/examples/backend_xml_rag.ipynb      | 2 +-
 docs/examples/hybrid_chunking.ipynb      | 2 +-
 docs/examples/pictures_description.ipynb | 2 +-
 docs/examples/rag_milvus.ipynb           | 2 +-
 docs/examples/rag_weaviate.ipynb         | 4 ++--
 docs/faq/index.md                        | 2 +-
 docs/v2.md                               | 4 ++--
 14 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 0c6b306..fbe17ee 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             return _txt

-        # restore original HTML by removing previouly added markers
+        # restore original HTML by removing previously added markers
         for regex in [
\s*\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*
\s*
",
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index a108361..f136222 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
- # Identify wether list is a numbered list or not
+ # Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_id, p_level = self._get_label_and_level(paragraph)
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 23560d3..f286504 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
- # Initialize the root of the document hiearchy
+ # Initialize the root of the document hierarchy
self.root: Optional[NodeItem] = None
self.valid = False
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index b0f8031..268b80a 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -1,6 +1,6 @@
"""Backend to parse patents from the United States Patent Office (USPTO).
-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
patent applications since 2001.
The original files can be found in https://bulkdata.uspto.gov.
"""
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
)
elif name == self.Element.PARAGRAPH.value and text:
- # remmove blank spaces added in paragraphs
+ # remove blank spaces added in paragraphs
text = re.sub("\\s+", " ", text)
if self.Element.ABSTRACT.value in self.property:
self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
class HtmlEntity:
"""Provide utility functions to get the HTML entities of styled characters.
- This class has been developped from:
+ This class has been developed from:
https://unicode-table.com/en/html-entities/
https://www.w3.org/TR/WD-math-970515/table03.html
"""
@@ -1896,7 +1896,7 @@ class HtmlEntity:
"""Get an HTML entity of a greek letter in ISO 8879.
Args:
- The text to transform, as an ISO 8879 entitiy.
+ The text to transform, as an ISO 8879 entity.
Returns:
The HTML entity representing a greek letter. If the input text is not
diff --git a/docling/cli/main.py b/docling/cli/main.py
index c442655..98a4c8d 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -521,7 +521,7 @@ def convert( # noqa: C901
if image_export_mode != ImageRefMode.PLACEHOLDER:
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = (
- True # FIXME: to be deprecated in verson 3
+ True # FIXME: to be deprecated in version 3
)
pipeline_options.images_scale = 2
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 1a3f900..a7679ea 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
tcells = table_cluster.cells
tokens = []
for c in tcells:
- # Only allow non empty stings (spaces) into the cells of a table
+ # Only allow non empty strings (spaces) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.rect = BoundingRectangle.from_bounding_box(
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index fbe907c..989ce0e 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
- # Check if the detected languge is present in the system
+ # Check if the detected language is present in the system
if lang not in self._tesserocr_languages:
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
msg += " However this language is not installed in your system and will be ignored."
diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb
index 60872c3..60c5839 100644
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -569,7 +569,7 @@
"The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
"In this notebook, we will leverage:\n",
"- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
- "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+ "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
"- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
"\n",
"Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index c8a8f42..68795a0 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -206,7 +206,7 @@
"source": [
"Points to notice looking at the output chunks below:\n",
"- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
- "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+ "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
"- Where possible, we merge undersized peer chunks (see chunk 0)\n",
"- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
]
diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb
index a40a73a..17845d7 100644
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
@@ -279,7 +279,7 @@
"## Use other vision models\n",
"\n",
"The examples above can also be reproduced using other vision model.\n",
- "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+ "The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
]
},
{
diff --git a/docs/examples/rag_milvus.ipynb b/docs/examples/rag_milvus.ipynb
index 6366810..9165297 100644
--- a/docs/examples/rag_milvus.ipynb
+++ b/docs/examples/rag_milvus.ipynb
@@ -32,7 +32,7 @@
"\n",
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
- "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU.\n"
+ "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
]
},
{
diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb
index 627e892..52ad11c 100644
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@@ -43,7 +43,7 @@
"\n",
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
- "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+ "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
]
},
{
@@ -716,7 +716,7 @@
"id": "7tGz49nfUegG"
},
"source": [
- "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+ "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
]
}
],
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 781c568..649a22f 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on