From a097ccd8d5c6480ecba6bed16a36bf9a155f4fe3 Mon Sep 17 00:00:00 2001
From: nkh0472 <67589323+nkh0472@users.noreply.github.com>
Date: Mon, 28 Apr 2025 14:52:09 +0800
Subject: [PATCH] chore: typo fix (#1465)

* typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

* chore: typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>

---------

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
---
 docling/backend/md_backend.py            | 2 +-
 docling/backend/msword_backend.py        | 2 +-
 docling/backend/xml/jats_backend.py      | 2 +-
 docling/backend/xml/uspto_backend.py     | 8 ++++----
 docling/cli/main.py                      | 2 +-
 docling/models/table_structure_model.py  | 2 +-
 docling/models/tesseract_ocr_model.py    | 2 +-
 docs/examples/backend_xml_rag.ipynb      | 2 +-
 docs/examples/hybrid_chunking.ipynb      | 2 +-
 docs/examples/pictures_description.ipynb | 2 +-
 docs/examples/rag_milvus.ipynb           | 2 +-
 docs/examples/rag_weaviate.ipynb         | 4 ++--
 docs/faq/index.md                        | 2 +-
 docs/v2.md                               | 4 ++--
 14 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 0c6b306..fbe17ee 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             return _txt

-        # restore original HTML by removing previouly added markers
+        # restore original HTML by removing previously added markers
         for regex in [
\s*\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*
\s*
",
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index a108361..f136222 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
- # Identify wether list is a numbered list or not
+ # Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_id, p_level = self._get_label_and_level(paragraph)
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 23560d3..f286504 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
super().__init__(in_doc, path_or_stream)
self.path_or_stream = path_or_stream
- # Initialize the root of the document hiearchy
+ # Initialize the root of the document hierarchy
self.root: Optional[NodeItem] = None
self.valid = False
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index b0f8031..268b80a 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -1,6 +1,6 @@
"""Backend to parse patents from the United States Patent Office (USPTO).
-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
patent applications since 2001.
The original files can be found in https://bulkdata.uspto.gov.
"""
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
)
elif name == self.Element.PARAGRAPH.value and text:
- # remmove blank spaces added in paragraphs
+ # remove blank spaces added in paragraphs
text = re.sub("\\s+", " ", text)
if self.Element.ABSTRACT.value in self.property:
self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
class HtmlEntity:
"""Provide utility functions to get the HTML entities of styled characters.
- This class has been developped from:
+ This class has been developed from:
https://unicode-table.com/en/html-entities/
https://www.w3.org/TR/WD-math-970515/table03.html
"""
@@ -1896,7 +1896,7 @@ class HtmlEntity:
"""Get an HTML entity of a greek letter in ISO 8879.
Args:
- The text to transform, as an ISO 8879 entitiy.
+ The text to transform, as an ISO 8879 entity.
Returns:
The HTML entity representing a greek letter. If the input text is not
diff --git a/docling/cli/main.py b/docling/cli/main.py
index c442655..98a4c8d 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -521,7 +521,7 @@ def convert( # noqa: C901
if image_export_mode != ImageRefMode.PLACEHOLDER:
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = (
- True # FIXME: to be deprecated in verson 3
+ True # FIXME: to be deprecated in version 3
)
pipeline_options.images_scale = 2
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 1a3f900..a7679ea 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
tcells = table_cluster.cells
tokens = []
for c in tcells:
- # Only allow non empty stings (spaces) into the cells of a table
+ # Only allow non empty strings (spaces) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.rect = BoundingRectangle.from_bounding_box(
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index fbe907c..989ce0e 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
- # Check if the detected languge is present in the system
+ # Check if the detected language is present in the system
if lang not in self._tesserocr_languages:
msg = f"Tesseract detected the script '{script}' and language '{lang}'."
msg += " However this language is not installed in your system and will be ignored."
diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb
index 60872c3..60c5839 100644
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -569,7 +569,7 @@
"The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
"In this notebook, we will leverage:\n",
"- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
- "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+ "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
"- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
"\n",
"Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index c8a8f42..68795a0 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -206,7 +206,7 @@
"source": [
"Points to notice looking at the output chunks below:\n",
"- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
- "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+ "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
"- Where possible, we merge undersized peer chunks (see chunk 0)\n",
"- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
]
diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb
index a40a73a..17845d7 100644
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
@@ -279,7 +279,7 @@
"## Use other vision models\n",
"\n",
"The examples above can also be reproduced using other vision model.\n",
- "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+ "The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
]
},
{
diff --git a/docs/examples/rag_milvus.ipynb b/docs/examples/rag_milvus.ipynb
index 6366810..9165297 100644
--- a/docs/examples/rag_milvus.ipynb
+++ b/docs/examples/rag_milvus.ipynb
@@ -32,7 +32,7 @@
"\n",
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
- "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU.\n"
+ "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
]
},
{
diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb
index 627e892..52ad11c 100644
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@@ -43,7 +43,7 @@
"\n",
"Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
"1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
- "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+ "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
]
},
{
@@ -716,7 +716,7 @@
"id": "7tGz49nfUegG"
},
"source": [
- "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+ "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
]
}
],
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 781c568..649a22f 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on