diff --git a/requirements.txt b/requirements.txt index c37fbf5..5ca3a33 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -albumentations==1.4.0 numpy==1.24.4 omegaconf==2.3.0 opencv-python==4.11.0.86 diff --git a/utils/processor.py b/utils/processor.py index ba89d08..3f96b68 100644 --- a/utils/processor.py +++ b/utils/processor.py @@ -1,4 +1,4 @@ -""" +""" Copyright (c) 2025 Bytedance Ltd. and/or its affiliates SPDX-License-Identifier: MIT """ @@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT import numpy as np import torch from PIL import ImageOps +from torchvision import transforms +from torchvision.transforms.functional import resize -from utils.utils import * +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) class DolphinProcessor: @@ -34,6 +37,10 @@ class DolphinProcessor: self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True) self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True) + self.transform = transforms.Compose( + [transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)] + ) + def process_prompt_for_inference(self, prompt): prompt = prompt.replace("\n", "") if not prompt.startswith(""): @@ -60,5 +67,5 @@ class DolphinProcessor: ) image = ImageOps.expand(image, padding) if return_img_size: - return test_transform(image).unsqueeze(0), (origin_w, origin_h) - return test_transform(image).unsqueeze(0) + return self.transform(image).unsqueeze(0), (origin_w, origin_h) + return self.transform(image).unsqueeze(0) diff --git a/utils/utils.py b/utils/utils.py index 423d0be..999a877 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -1,37 +1,33 @@ -""" +""" Copyright (c) 2025 Bytedance Ltd. and/or its affiliates SPDX-License-Identifier: MIT """ import copy +import io import json import os -import io import re from dataclasses import dataclass from typing import List, Tuple -import albumentations as alb import cv2 import numpy as np -from albumentations.pytorch import ToTensorV2 import pymupdf from PIL import Image -from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD -from torchvision.transforms.functional import resize from utils.markdown_utils import MarkdownConverter def save_figure_to_local(pil_crop, save_dir, image_name, reading_order): """Save cropped figure to local file system - + Args: pil_crop: PIL Image object of the cropped figure save_dir: Base directory to save results image_name: Name of the source image/document reading_order: Reading order of the figure in the document - + Returns: str: Filename of the saved figure """ @@ -39,17 +35,17 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order): # Create figures directory if it doesn't exist figures_dir = os.path.join(save_dir, "markdown", "figures") # os.makedirs(figures_dir, exist_ok=True) - + # Generate figure filename figure_filename = f"{image_name}_figure_{reading_order:03d}.png" figure_path = os.path.join(figures_dir, figure_filename) - + # Save the figure pil_crop.save(figure_path, format="PNG", quality=95) - + # print(f"Saved figure: {figure_filename}") return figure_filename - + except Exception as e: print(f"Error saving figure: {str(e)}") # Return a fallback filename @@ -58,38 +54,38 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order): def convert_pdf_to_images(pdf_path, target_size=896): """Convert PDF pages to images - + Args: pdf_path: Path to PDF file target_size: Target size for the longest dimension - + Returns: List of PIL Images """ images = [] try: doc = pymupdf.open(pdf_path) - + for page_num in range(len(doc)): page = doc[page_num] - + # Calculate scale to make longest dimension equal to target_size rect = page.rect scale = target_size / max(rect.width, rect.height) - + # Render page as image mat = pymupdf.Matrix(scale, scale) pix = page.get_pixmap(matrix=mat) - + # Convert to PIL Image img_data = pix.tobytes("png") pil_image = Image.open(io.BytesIO(img_data)) images.append(pil_image) - + doc.close() print(f"Successfully converted {len(images)} pages from PDF") return images - + except Exception as e: print(f"Error converting PDF to images: {str(e)}") return [] @@ -97,42 +93,38 @@ def convert_pdf_to_images(pdf_path, target_size=896): def is_pdf_file(file_path): """Check if file is a PDF""" - return file_path.lower().endswith('.pdf') + return file_path.lower().endswith(".pdf") def save_combined_pdf_results(all_page_results, pdf_path, save_dir): """Save combined results for multi-page PDF with both JSON and Markdown - + Args: all_page_results: List of results for all pages pdf_path: Path to original PDF file save_dir: Directory to save results - + Returns: Path to saved combined JSON file """ # Create output filename based on PDF name base_name = os.path.splitext(os.path.basename(pdf_path))[0] - + # Prepare combined results - combined_results = { - "source_file": pdf_path, - "total_pages": len(all_page_results), - "pages": all_page_results - } - + combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results} + # Save combined JSON results json_filename = f"{base_name}.json" json_path = os.path.join(save_dir, "recognition_json", json_filename) os.makedirs(os.path.dirname(json_path), exist_ok=True) - - with open(json_path, 'w', encoding='utf-8') as f: + + with open(json_path, "w", encoding="utf-8") as f: json.dump(combined_results, f, indent=2, ensure_ascii=False) - + # Generate and save combined markdown try: markdown_converter = MarkdownConverter() - + # Combine all page results into a single list for markdown conversion all_elements = [] for page_data in all_page_results: @@ -140,52 +132,33 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir): if page_elements: # Add page separator if not the first page if all_elements: - all_elements.append({ - "label": "page_separator", - "text": f"\n\n---\n\n", - "reading_order": len(all_elements) - }) + all_elements.append( + {"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)} + ) all_elements.extend(page_elements) - + # Generate markdown content markdown_content = markdown_converter.convert(all_elements) - + # Save markdown file markdown_filename = f"{base_name}.md" markdown_path = os.path.join(save_dir, "markdown", markdown_filename) os.makedirs(os.path.dirname(markdown_path), exist_ok=True) - - with open(markdown_path, 'w', encoding='utf-8') as f: + + with open(markdown_path, "w", encoding="utf-8") as f: f.write(markdown_content) - + # print(f"Combined markdown saved to: {markdown_path}") - + except ImportError: print("MarkdownConverter not available, skipping markdown generation") except Exception as e: print(f"Error generating markdown: {e}") - + # print(f"Combined JSON results saved to: {json_path}") return json_path -def alb_wrapper(transform): - def f(im): - return transform(image=np.asarray(im))["image"] - - return f - - -test_transform = alb_wrapper( - alb.Compose( - [ - alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), - ToTensorV2(), - ] - ) -) - - def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True): # print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}") if x2 <= x1 or y2 <= y1: @@ -195,12 +168,12 @@ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True): if not abs_coord: if x2 > 1 or y2 > 1: return False, f"[{x1}, {y1}, {x2}, {y2}]" - elif image_size is not None: # has image size + elif image_size is not None: # has image size if x2 > image_size[0] or y2 > image_size[1]: return False, f"[{x1}, {y1}, {x2}, {y2}]" return True, None - + def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2): """ Image: cv2.image object, or Path @@ -276,6 +249,7 @@ def parse_layout_string(bbox_str): @dataclass class ImageDimensions: """Class to store image dimensions""" + original_w: int original_h: int padded_w: int @@ -284,11 +258,11 @@ class ImageDimensions: def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]: """Map coordinates from padded image back to original image - + Args: x1, y1, x2, y2: Coordinates in padded image dims: Image dimensions object - + Returns: tuple: (x1, y1, x2, y2) coordinates in original image """ @@ -296,19 +270,19 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[ # Calculate padding offsets top = (dims.padded_h - dims.original_h) // 2 left = (dims.padded_w - dims.original_w) // 2 - + # Map back to original coordinates orig_x1 = max(0, x1 - left) orig_y1 = max(0, y1 - top) orig_x2 = min(dims.original_w, x2 - left) orig_y2 = min(dims.original_h, y2 - top) - + # Ensure we have a valid box (width and height > 0) if orig_x2 <= orig_x1: orig_x2 = min(orig_x1 + 1, dims.original_w) if orig_y2 <= orig_y1: orig_y2 = min(orig_y1 + 1, dims.original_h) - + return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2) except Exception as e: print(f"map_to_original_coordinates error: {str(e)}") @@ -318,12 +292,17 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions): """ - From absolute coordinates to relevant coordinates - e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4] + From absolute coordinates to relevant coordinates + e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4] """ try: x1, y1, x2, y2 = abs_coords - return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3) + return ( + round(x1 / dims.original_w, 3), + round(y1 / dims.original_h, 3), + round(x2 / dims.original_w, 3), + round(y2 / dims.original_h, 3), + ) except Exception as e: print(f"map_to_relevant_coordinates error: {str(e)}") return 0.0, 0.0, 1.0, 1.0 # Return full image coordinates @@ -331,13 +310,13 @@ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions): def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None): """Process and adjust coordinates - + Args: coords: Normalized coordinates [x1, y1, x2, y2] padded_image: Padded image dims: Image dimensions object previous_box: Previous box coordinates for overlap adjustment - + Returns: tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box) """ @@ -345,35 +324,35 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo # Convert normalized coordinates to absolute coordinates x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h) x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h) - + # Ensure coordinates are within image bounds before adjustment x1 = max(0, min(x1, dims.padded_w - 1)) y1 = max(0, min(y1, dims.padded_h - 1)) x2 = max(0, min(x2, dims.padded_w)) y2 = max(0, min(y2, dims.padded_h)) - + # Ensure width and height are at least 1 pixel if x2 <= x1: x2 = min(x1 + 1, dims.padded_w) if y2 <= y1: y2 = min(y1 + 1, dims.padded_h) - + # Extend box boundaries new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]]) x1, y1, x2, y2 = new_boxes[0] - + # Ensure coordinates are still within image bounds after adjustment x1 = max(0, min(x1, dims.padded_w - 1)) y1 = max(0, min(y1, dims.padded_h - 1)) x2 = max(0, min(x2, dims.padded_w)) y2 = max(0, min(y2, dims.padded_h)) - + # Ensure width and height are at least 1 pixel after adjustment if x2 <= x1: x2 = min(x1 + 1, dims.padded_w) if y2 <= y1: y2 = min(y1 + 1, dims.padded_h) - + # Check for overlap with previous box and adjust if previous_box is not None: prev_x1, prev_y1, prev_x2, prev_y2 = previous_box @@ -384,15 +363,13 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo # Make sure y2 is still greater than y1 if y2 <= y1: y2 = min(y1 + 1, dims.padded_h) - + # Update previous box new_previous_box = [x1, y1, x2, y2] # Map to original coordinates - orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates( - x1, y1, x2, y2, dims - ) - + orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims) + return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box except Exception as e: print(f"process_coordinates error: {str(e)}") @@ -403,10 +380,10 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]: """Load and prepare image with padding while maintaining aspect ratio - + Args: image: PIL image - + Returns: tuple: (padded_image, image_dimensions) """ @@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]: right = max_size - original_w - left # Apply padding - padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, - cv2.BORDER_CONSTANT, value=(0, 0, 0)) + padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0)) padded_h, padded_w = padded_image.shape[:2] - - dimensions = ImageDimensions( - original_w=original_w, - original_h=original_h, - padded_w=padded_w, - padded_h=padded_h - ) - + + dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h) + return padded_image, dimensions except Exception as e: print(f"prepare_image error: {str(e)}") # Create a minimal valid image and dimensions h, w = image.height, image.width - dimensions = ImageDimensions( - original_w=w, - original_h=h, - padded_w=w, - padded_h=h - ) + dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h) # Return a black image of the same size return np.zeros((h, w, 3), dtype=np.uint8), dimensions @@ -484,7 +450,7 @@ def crop_margin(img: Image.Image) -> Image.Image: if width == 0 or height == 0: print("Warning: Image has zero width or height") return img - + data = np.array(img.convert("L")) data = data.astype(np.uint8) max_val = data.max() @@ -498,13 +464,13 @@ def crop_margin(img: Image.Image) -> Image.Image: if coords is None: return img a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box - + # Ensure crop coordinates are within image bounds a = max(0, a) b = max(0, b) w = min(w, width - a) h = min(h, height - b) - + # Only crop if we have a valid region if w > 0 and h > 0: return img.crop((a, b, a + w, b + h))