Merge pull request #90 from hanyd2010/feat/remove_albumentations

remove 'albumentations'
2025-06-26 20:01:35 +08:00 · 2025-06-26 20:01:35 +08:00 · 0e4ead6717
commit 0e4ead6717
parent 98b8ccc38d 4edac82fc3
3 changed files with 84 additions and 112 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
-albumentations==1.4.0
 numpy==1.24.4
 omegaconf==2.3.0
 opencv-python==4.11.0.86
--- a/utils/processor.py
+++ b/utils/processor.py
@ -1,4 +1,4 @@
-""" 
+"""
 Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 SPDX-License-Identifier: MIT
 """
@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT
 import numpy as np
 import torch
 from PIL import ImageOps
+from torchvision import transforms
+from torchvision.transforms.functional import resize

-from utils.utils import *
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)


 class DolphinProcessor:
@ -34,6 +37,10 @@ class DolphinProcessor:
        self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
        self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)

+        self.transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
+        )
+
    def process_prompt_for_inference(self, prompt):
        prompt = prompt.replace("<image>\n", "")
        if not prompt.startswith("<s>"):
@ -60,5 +67,5 @@ class DolphinProcessor:
        )
        image = ImageOps.expand(image, padding)
        if return_img_size:
-            return test_transform(image).unsqueeze(0), (origin_w, origin_h)
-        return test_transform(image).unsqueeze(0)
+            return self.transform(image).unsqueeze(0), (origin_w, origin_h)
+        return self.transform(image).unsqueeze(0)
--- a/utils/utils.py
+++ b/utils/utils.py
@ -1,37 +1,33 @@
-""" 
+"""
 Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 SPDX-License-Identifier: MIT
 """

 import copy
+import io
 import json
 import os
-import io
 import re
 from dataclasses import dataclass
 from typing import List, Tuple

-import albumentations as alb
 import cv2
 import numpy as np
-from albumentations.pytorch import ToTensorV2
 import pymupdf
 from PIL import Image
-from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from torchvision.transforms.functional import resize

 from utils.markdown_utils import MarkdownConverter


 def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
    """Save cropped figure to local file system
-    
+
    Args:
        pil_crop: PIL Image object of the cropped figure
        save_dir: Base directory to save results
        image_name: Name of the source image/document
        reading_order: Reading order of the figure in the document
-        
+
    Returns:
        str: Filename of the saved figure
    """
@ -39,17 +35,17 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
        # Create figures directory if it doesn't exist
        figures_dir = os.path.join(save_dir, "markdown", "figures")
        # os.makedirs(figures_dir, exist_ok=True)
-        
+
        # Generate figure filename
        figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
        figure_path = os.path.join(figures_dir, figure_filename)
-        
+
        # Save the figure
        pil_crop.save(figure_path, format="PNG", quality=95)
-        
+
        # print(f"Saved figure: {figure_filename}")
        return figure_filename
-        
+
    except Exception as e:
        print(f"Error saving figure: {str(e)}")
        # Return a fallback filename
@ -58,38 +54,38 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):

 def convert_pdf_to_images(pdf_path, target_size=896):
    """Convert PDF pages to images
-    
+
    Args:
        pdf_path: Path to PDF file
        target_size: Target size for the longest dimension
-        
+
    Returns:
        List of PIL Images
    """
    images = []
    try:
        doc = pymupdf.open(pdf_path)
-        
+
        for page_num in range(len(doc)):
            page = doc[page_num]
-            
+
            # Calculate scale to make longest dimension equal to target_size
            rect = page.rect
            scale = target_size / max(rect.width, rect.height)
-            
+
            # Render page as image
            mat = pymupdf.Matrix(scale, scale)
            pix = page.get_pixmap(matrix=mat)
-            
+
            # Convert to PIL Image
            img_data = pix.tobytes("png")
            pil_image = Image.open(io.BytesIO(img_data))
            images.append(pil_image)
-        
+
        doc.close()
        print(f"Successfully converted {len(images)} pages from PDF")
        return images
-        
+
    except Exception as e:
        print(f"Error converting PDF to images: {str(e)}")
        return []
@ -97,42 +93,38 @@ def convert_pdf_to_images(pdf_path, target_size=896):

 def is_pdf_file(file_path):
    """Check if file is a PDF"""
-    return file_path.lower().endswith('.pdf')
+    return file_path.lower().endswith(".pdf")


 def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
    """Save combined results for multi-page PDF with both JSON and Markdown
-    
+
    Args:
        all_page_results: List of results for all pages
        pdf_path: Path to original PDF file
        save_dir: Directory to save results
-        
+
    Returns:
        Path to saved combined JSON file
    """
    # Create output filename based on PDF name
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
-    
+
    # Prepare combined results
-    combined_results = {
-        "source_file": pdf_path,
-        "total_pages": len(all_page_results),
-        "pages": all_page_results
-    }
-    
+    combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results}
+
    # Save combined JSON results
    json_filename = f"{base_name}.json"
    json_path = os.path.join(save_dir, "recognition_json", json_filename)
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
-    
-    with open(json_path, 'w', encoding='utf-8') as f:
+
+    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(combined_results, f, indent=2, ensure_ascii=False)
-    
+
    # Generate and save combined markdown
    try:
        markdown_converter = MarkdownConverter()
-        
+
        # Combine all page results into a single list for markdown conversion
        all_elements = []
        for page_data in all_page_results:
@ -140,52 +132,33 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
            if page_elements:
                # Add page separator if not the first page
                if all_elements:
-                    all_elements.append({
-                        "label": "page_separator",
-                        "text": f"\n\n---\n\n",
-                        "reading_order": len(all_elements)
-                    })
+                    all_elements.append(
+                        {"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)}
+                    )
                all_elements.extend(page_elements)
-        
+
        # Generate markdown content
        markdown_content = markdown_converter.convert(all_elements)
-        
+
        # Save markdown file
        markdown_filename = f"{base_name}.md"
        markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
        os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
-        
-        with open(markdown_path, 'w', encoding='utf-8') as f:
+
+        with open(markdown_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)
-            
+
        # print(f"Combined markdown saved to: {markdown_path}")
-        
+
    except ImportError:
        print("MarkdownConverter not available, skipping markdown generation")
    except Exception as e:
        print(f"Error generating markdown: {e}")
-    
+
    # print(f"Combined JSON results saved to: {json_path}")
    return json_path


-def alb_wrapper(transform):
-    def f(im):
-        return transform(image=np.asarray(im))["image"]
-
-    return f
-
-
-test_transform = alb_wrapper(
-    alb.Compose(
-        [
-            alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
-            ToTensorV2(),
-        ]
-    )
-)
-
-
 def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
    # print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
    if x2 <= x1 or y2 <= y1:
@ -195,12 +168,12 @@ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
    if not abs_coord:
        if x2 > 1 or y2 > 1:
            return False, f"[{x1}, {y1}, {x2}, {y2}]"
-    elif image_size is not None: # has image size
+    elif image_size is not None:  # has image size
        if x2 > image_size[0] or y2 > image_size[1]:
            return False, f"[{x1}, {y1}, {x2}, {y2}]"
    return True, None

-    
+
 def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
    """
    Image: cv2.image object, or Path
@ -276,6 +249,7 @@ def parse_layout_string(bbox_str):
@dataclass
 class ImageDimensions:
    """Class to store image dimensions"""
+
    original_w: int
    original_h: int
    padded_w: int
@ -284,11 +258,11 @@ class ImageDimensions:

 def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
    """Map coordinates from padded image back to original image
-    
+
    Args:
        x1, y1, x2, y2: Coordinates in padded image
        dims: Image dimensions object
-        
+
    Returns:
        tuple: (x1, y1, x2, y2) coordinates in original image
    """
@ -296,19 +270,19 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
        # Calculate padding offsets
        top = (dims.padded_h - dims.original_h) // 2
        left = (dims.padded_w - dims.original_w) // 2
-        
+
        # Map back to original coordinates
        orig_x1 = max(0, x1 - left)
        orig_y1 = max(0, y1 - top)
        orig_x2 = min(dims.original_w, x2 - left)
        orig_y2 = min(dims.original_h, y2 - top)
-        
+
        # Ensure we have a valid box (width and height > 0)
        if orig_x2 <= orig_x1:
            orig_x2 = min(orig_x1 + 1, dims.original_w)
        if orig_y2 <= orig_y1:
            orig_y2 = min(orig_y1 + 1, dims.original_h)
-            
+
        return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
    except Exception as e:
        print(f"map_to_original_coordinates error: {str(e)}")
@ -318,12 +292,17 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[

 def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
    """
-        From absolute coordinates to relevant coordinates
-        e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
+    From absolute coordinates to relevant coordinates
+    e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
    """
    try:
        x1, y1, x2, y2 = abs_coords
-        return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
+        return (
+            round(x1 / dims.original_w, 3),
+            round(y1 / dims.original_h, 3),
+            round(x2 / dims.original_w, 3),
+            round(y2 / dims.original_h, 3),
+        )
    except Exception as e:
        print(f"map_to_relevant_coordinates error: {str(e)}")
        return 0.0, 0.0, 1.0, 1.0  # Return full image coordinates
@ -331,13 +310,13 @@ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):

 def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
    """Process and adjust coordinates
-    
+
    Args:
        coords: Normalized coordinates [x1, y1, x2, y2]
        padded_image: Padded image
        dims: Image dimensions object
        previous_box: Previous box coordinates for overlap adjustment
-    
+
    Returns:
        tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
    """
@ -345,35 +324,35 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
        # Convert normalized coordinates to absolute coordinates
        x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
        x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
-        
+
        # Ensure coordinates are within image bounds before adjustment
        x1 = max(0, min(x1, dims.padded_w - 1))
        y1 = max(0, min(y1, dims.padded_h - 1))
        x2 = max(0, min(x2, dims.padded_w))
        y2 = max(0, min(y2, dims.padded_h))
-        
+
        # Ensure width and height are at least 1 pixel
        if x2 <= x1:
            x2 = min(x1 + 1, dims.padded_w)
        if y2 <= y1:
            y2 = min(y1 + 1, dims.padded_h)
-        
+
        # Extend box boundaries
        new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
        x1, y1, x2, y2 = new_boxes[0]
-        
+
        # Ensure coordinates are still within image bounds after adjustment
        x1 = max(0, min(x1, dims.padded_w - 1))
        y1 = max(0, min(y1, dims.padded_h - 1))
        x2 = max(0, min(x2, dims.padded_w))
        y2 = max(0, min(y2, dims.padded_h))
-        
+
        # Ensure width and height are at least 1 pixel after adjustment
        if x2 <= x1:
            x2 = min(x1 + 1, dims.padded_w)
        if y2 <= y1:
            y2 = min(y1 + 1, dims.padded_h)
-        
+
        # Check for overlap with previous box and adjust
        if previous_box is not None:
            prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
@ -384,15 +363,13 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
                # Make sure y2 is still greater than y1
                if y2 <= y1:
                    y2 = min(y1 + 1, dims.padded_h)
-        
+
        # Update previous box
        new_previous_box = [x1, y1, x2, y2]

        # Map to original coordinates
-        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
-            x1, y1, x2, y2, dims
-        )
-        
+        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims)
+
        return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
    except Exception as e:
        print(f"process_coordinates error: {str(e)}")
@ -403,10 +380,10 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo

 def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
    """Load and prepare image with padding while maintaining aspect ratio
-    
+
    Args:
        image: PIL image
-        
+
    Returns:
        tuple: (padded_image, image_dimensions)
    """
@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
        right = max_size - original_w - left

        # Apply padding
-        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
-                                cv2.BORDER_CONSTANT, value=(0, 0, 0))
+        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))

        padded_h, padded_w = padded_image.shape[:2]
-        
-        dimensions = ImageDimensions(
-            original_w=original_w,
-            original_h=original_h,
-            padded_w=padded_w,
-            padded_h=padded_h
-        )
-        
+
+        dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
+
        return padded_image, dimensions
    except Exception as e:
        print(f"prepare_image error: {str(e)}")
        # Create a minimal valid image and dimensions
        h, w = image.height, image.width
-        dimensions = ImageDimensions(
-            original_w=w,
-            original_h=h,
-            padded_w=w,
-            padded_h=h
-        )
+        dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
        # Return a black image of the same size
        return np.zeros((h, w, 3), dtype=np.uint8), dimensions

@ -484,7 +450,7 @@ def crop_margin(img: Image.Image) -> Image.Image:
        if width == 0 or height == 0:
            print("Warning: Image has zero width or height")
            return img
-            
+
        data = np.array(img.convert("L"))
        data = data.astype(np.uint8)
        max_val = data.max()
@ -498,13 +464,13 @@ def crop_margin(img: Image.Image) -> Image.Image:
        if coords is None:
            return img
        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
-        
+
        # Ensure crop coordinates are within image bounds
        a = max(0, a)
        b = max(0, b)
        w = min(w, width - a)
        h = min(h, height - b)
-        
+
        # Only crop if we have a valid region
        if w > 0 and h > 0:
            return img.crop((a, b, a + w, b + h))