Docling/docling/models/page_preprocessing_model.py

import re
from collections.abc import Iterable
from pathlib import Path
from typing import Optional

import numpy as np
from PIL import ImageDraw
from pydantic import BaseModel

from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.profiling import TimeRecorder


class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
    create_parsed_page: bool


class PagePreprocessingModel(BasePageModel):
    def __init__(self, options: PagePreprocessingOptions):
        self.options = options

        # Pre-compiled regex patterns for efficiency
        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
        self.SLASH_NUMBER_GARBAGE_RE = re.compile(
            r"(?:/\w+\s*){2,}"
        )  # Two or more "/token " sequences

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "page_parse"):
                    page = self._populate_page_images(page)
                    page = self._parse_page_cells(conv_res, page)
                yield page

    # Generate the page image and store it in the page object
    def _populate_page_images(self, page: Page) -> Page:
        # default scale
        page.get_image(
            scale=1.0
        )  # puts the page image on the image cache at default scale

        images_scale = self.options.images_scale
        # user requested scales
        if images_scale is not None:
            page._default_image_scale = images_scale
            page.get_image(
                scale=images_scale
            )  # this will trigger storing the image in the internal cache

        return page

    # Extract and populate the page cells and store it in the page object
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

        page.cells = list(page._backend.get_text_cells())

        if self.options.create_parsed_page:
            page.parsed_page = page._backend.get_segmented_page()

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []
        for c in page.cells:
            score = self.rate_text_quality(c.text)
            text_scores.append(score)

        conv_res.confidence.pages[page.page_no].parse_score = float(
            np.nanquantile(
                text_scores, q=0.10
            )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
        )

        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
            draw = ImageDraw.Draw(image)
            for c in cells:
                x0, y0, x1, y1 = (
                    c.to_bounding_box().l,
                    c.to_bounding_box().t,
                    c.to_bounding_box().r,
                    c.to_bounding_box().b,
                )

                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
            if show:
                image.show()
            else:
                out_path: Path = (
                    Path(settings.debug.debug_output_path)
                    / f"debug_{conv_res.input.file.stem}"
                )
                out_path.mkdir(parents=True, exist_ok=True)

                out_file = out_path / f"cells_page_{page.page_no:05}.png"
                image.save(str(out_file), format="png")

        if settings.debug.visualize_cells:
            draw_text_boxes(page.get_image(scale=1.0), page.cells)

        return page

    def rate_text_quality(self, text: str) -> float:
        # Hard errors: if any of these patterns are found, return 0.0 immediately.
        blacklist_chars = ["<EFBFBD>"]
        if (
            any(text.find(c) >= 0 for c in blacklist_chars)
            or self.GLYPH_RE.search(text)
            or self.SLASH_G_RE.search(text)
            or self.SLASH_NUMBER_GARBAGE_RE.match(
                text
            )  # Check if text is mostly slash-number pattern
        ):
            return 0.0

        penalty = 0.0

        # Apply a penalty only if the fragmented words pattern occurs at least three times.
        frag_matches = self.FRAG_RE.findall(text)
        if len(frag_matches) >= 3:
            penalty += 0.1 * len(frag_matches)

        # Additional heuristic: if the average token length is below 2, add a penalty.
        # tokens = text.split()
        # if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
        #    penalty += 0.2

        return max(1.0 - penalty, 0.0)