diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 0c6b306..fbe17ee 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             return _txt
 
-        # restore original HTML by removing previouly added markers
+        # restore original HTML by removing previously added markers
         for regex in [
             rf"<pre>\s*<code>\s*{_START_MARKER}",
             rf"{_STOP_MARKER}\s*</code>\s*</pre>",
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index a108361..f136222 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
 
         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
index 23560d3..f286504 100755
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         super().__init__(in_doc, path_or_stream)
         self.path_or_stream = path_or_stream
 
-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
         self.root: Optional[NodeItem] = None
         self.valid = False
 
diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
index b0f8031..268b80a 100644
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).
 
-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001. The original files can be found in
 https://bulkdata.uspto.gov.
 """
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
                 )
 
         elif name == self.Element.PARAGRAPH.value and text:
-            # remmove blank spaces added in paragraphs
+            # remove blank spaces added in paragraphs
            text = re.sub("\\s+", " ", text)
             if self.Element.ABSTRACT.value in self.property:
                 self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
     """Provide utility functions to get the HTML entities of styled characters.
 
-    This class has been developped from:
+    This class has been developed from:
     https://unicode-table.com/en/html-entities/
     https://www.w3.org/TR/WD-math-970515/table03.html
     """
@@ -1896,7 +1896,7 @@ class HtmlEntity:
         """Get an HTML entity of a greek letter in ISO 8879.
 
         Args:
-            The text to transform, as an ISO 8879 entitiy.
+            The text to transform, as an ISO 8879 entity.
 
         Returns:
             The HTML entity representing a greek letter. If the input text is not
diff --git a/docling/cli/main.py b/docling/cli/main.py
index c442655..98a4c8d 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -521,7 +521,7 @@ def convert(  # noqa: C901
         if image_export_mode != ImageRefMode.PLACEHOLDER:
             pipeline_options.generate_page_images = True
             pipeline_options.generate_picture_images = (
-                True  # FIXME: to be deprecated in verson 3
+                True  # FIXME: to be deprecated in version 3
             )
             pipeline_options.images_scale = 2
 
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 1a3f900..a7679ea 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
             tcells = table_cluster.cells
             tokens = []
             for c in tcells:
-                # Only allow non empty stings (spaces) into the cells of a table
+                # Only allow non empty strings (spaces) into the cells of a table
                 if len(c.text.strip()) > 0:
                     new_cell = copy.deepcopy(c)
                     new_cell.rect = BoundingRectangle.from_bounding_box(
diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py
index fbe907c..989ce0e 100644
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
             script = map_tesseract_script(script)
             lang = f"{self.script_prefix}{script}"
 
-            # Check if the detected languge is present in the system
+            # Check if the detected language is present in the system
             if lang not in self._tesserocr_languages:
                 msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                 msg += " However this language is not installed in your system and will be ignored."
diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb
index 60872c3..60c5839 100644
--- a/docs/examples/backend_xml_rag.ipynb
+++ b/docs/examples/backend_xml_rag.ipynb
@@ -569,7 +569,7 @@
    "The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
    "In this notebook, we will leverage:\n",
    "- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
-    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
    "- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
    "\n",
    "Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb
index c8a8f42..68795a0 100644
--- a/docs/examples/hybrid_chunking.ipynb
+++ b/docs/examples/hybrid_chunking.ipynb
@@ -206,7 +206,7 @@
   "source": [
    "Points to notice looking at the output chunks below:\n",
    "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
-    "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+    "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
    "- Where possible, we merge undersized peer chunks (see chunk 0)\n",
    "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
   ]
diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb
index a40a73a..17845d7 100644
--- a/docs/examples/pictures_description.ipynb
+++ b/docs/examples/pictures_description.ipynb
@@ -279,7 +279,7 @@
    "## Use other vision models\n",
    "\n",
    "The examples above can also be reproduced using other vision model.\n",
-    "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+    "The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
   ]
  },
  {
diff --git a/docs/examples/rag_milvus.ipynb b/docs/examples/rag_milvus.ipynb
index 6366810..9165297 100644
--- a/docs/examples/rag_milvus.ipynb
+++ b/docs/examples/rag_milvus.ipynb
@@ -32,7 +32,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU.\n"
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
   ]
  },
  {
diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb
index 627e892..52ad11c 100644
--- a/docs/examples/rag_weaviate.ipynb
+++ b/docs/examples/rag_weaviate.ipynb
@@ -43,7 +43,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
   ]
  },
  {
@@ -716,7 +716,7 @@
    "id": "7tGz49nfUegG"
   },
   "source": [
-    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
   ]
  }
 ],
diff --git a/docs/faq/index.md b/docs/faq/index.md
index 781c568..649a22f 100644
--- a/docs/faq/index.md
+++ b/docs/faq/index.md
@@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on