Initial commit

This commit is contained in:
Christoph Auer
2024-07-15 09:42:42 +02:00
commit e2d996753b
38 changed files with 8767 additions and 0 deletions

View File

View File

@@ -0,0 +1,18 @@
from abc import abstractmethod
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions
class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe = []
self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
yield from page_batch

View File

@@ -0,0 +1,40 @@
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
super().__init__(artifacts_path, pipeline_options)
self.model_pipe = [
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": False,
}
),
]