remove 'albumentations'

2025-06-26 19:45:12 +08:00 · 2025-06-26 19:45:12 +08:00 · 4edac82fc3
commit 4edac82fc3
parent 98b8ccc38d
3 changed files with 84 additions and 112 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,4 +1,3 @@
-albumentations==1.4.0
 numpy==1.24.4
 omegaconf==2.3.0
 opencv-python==4.11.0.86
--- a/utils/processor.py
+++ b/utils/processor.py
@ -1,4 +1,4 @@
-""" 
+"""
 Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 SPDX-License-Identifier: MIT
 """
@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT
 import numpy as np
 import torch
 from PIL import ImageOps
 from torchvision import transforms
 from torchvision.transforms.functional import resize
-from utils.utils import *
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
 class DolphinProcessor:
@ -34,6 +37,10 @@ class DolphinProcessor:
        self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
        self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)
+        self.transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
+        )
    def process_prompt_for_inference(self, prompt):
        prompt = prompt.replace("<image>\n", "")
        if not prompt.startswith("<s>"):
@ -60,5 +67,5 @@ class DolphinProcessor:
        )
        image = ImageOps.expand(image, padding)
        if return_img_size:
-            return test_transform(image).unsqueeze(0), (origin_w, origin_h)
+            return self.transform(image).unsqueeze(0), (origin_w, origin_h)
-        return test_transform(image).unsqueeze(0)
+        return self.transform(image).unsqueeze(0)
--- a/utils/utils.py
+++ b/utils/utils.py
@ -1,37 +1,33 @@
-""" 
+"""
 Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 SPDX-License-Identifier: MIT
 """
 import copy
 import io
 import json
 import os
 import io
 import re
 from dataclasses import dataclass
 from typing import List, Tuple
-import albumentations as alb
 import cv2
 import numpy as np
-from albumentations.pytorch import ToTensorV2
 import pymupdf
 from PIL import Image
 from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from torchvision.transforms.functional import resize
 from utils.markdown_utils import MarkdownConverter
 def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
    """Save cropped figure to local file system
-    
+
    Args:
        pil_crop: PIL Image object of the cropped figure
        save_dir: Base directory to save results
        image_name: Name of the source image/document
        reading_order: Reading order of the figure in the document
-        
+
    Returns:
        str: Filename of the saved figure
    """
@ -39,17 +35,17 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
        # Create figures directory if it doesn't exist
        figures_dir = os.path.join(save_dir, "markdown", "figures")
        # os.makedirs(figures_dir, exist_ok=True)
-        
+
        # Generate figure filename
        figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
        figure_path = os.path.join(figures_dir, figure_filename)
-        
+
        # Save the figure
        pil_crop.save(figure_path, format="PNG", quality=95)
-        
+
        # print(f"Saved figure: {figure_filename}")
        return figure_filename
-        
+
    except Exception as e:
        print(f"Error saving figure: {str(e)}")
        # Return a fallback filename
@ -58,38 +54,38 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
 def convert_pdf_to_images(pdf_path, target_size=896):
    """Convert PDF pages to images
-    
+
    Args:
        pdf_path: Path to PDF file
        target_size: Target size for the longest dimension
-        
+
    Returns:
        List of PIL Images
    """
    images = []
    try:
        doc = pymupdf.open(pdf_path)
-        
+
        for page_num in range(len(doc)):
            page = doc[page_num]
-            
+
            # Calculate scale to make longest dimension equal to target_size
            rect = page.rect
            scale = target_size / max(rect.width, rect.height)
-            
+
            # Render page as image
            mat = pymupdf.Matrix(scale, scale)
            pix = page.get_pixmap(matrix=mat)
-            
+
            # Convert to PIL Image
            img_data = pix.tobytes("png")
            pil_image = Image.open(io.BytesIO(img_data))
            images.append(pil_image)
-        
+
        doc.close()
        print(f"Successfully converted {len(images)} pages from PDF")
        return images
-        
+
    except Exception as e:
        print(f"Error converting PDF to images: {str(e)}")
        return []
@ -97,42 +93,38 @@ def convert_pdf_to_images(pdf_path, target_size=896):
 def is_pdf_file(file_path):
    """Check if file is a PDF"""
-    return file_path.lower().endswith('.pdf')
+    return file_path.lower().endswith(".pdf")
 def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
    """Save combined results for multi-page PDF with both JSON and Markdown
-    
+
    Args:
        all_page_results: List of results for all pages
        pdf_path: Path to original PDF file
        save_dir: Directory to save results
-        
+
    Returns:
        Path to saved combined JSON file
    """
    # Create output filename based on PDF name
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
-    
+
    # Prepare combined results
-    combined_results = {
+    combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results}
-        "source_file": pdf_path,
+
-        "total_pages": len(all_page_results),
-        "pages": all_page_results
-    }
    # Save combined JSON results
    json_filename = f"{base_name}.json"
    json_path = os.path.join(save_dir, "recognition_json", json_filename)
    os.makedirs(os.path.dirname(json_path), exist_ok=True)
-    
+
-    with open(json_path, 'w', encoding='utf-8') as f:
+    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(combined_results, f, indent=2, ensure_ascii=False)
-    
+
    # Generate and save combined markdown
    try:
        markdown_converter = MarkdownConverter()
-        
+
        # Combine all page results into a single list for markdown conversion
        all_elements = []
        for page_data in all_page_results:
@ -140,52 +132,33 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
            if page_elements:
                # Add page separator if not the first page
                if all_elements:
-                    all_elements.append({
+                    all_elements.append(
-                        "label": "page_separator",
+                        {"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)}
-                        "text": f"\n\n---\n\n",
+                    )
-                        "reading_order": len(all_elements)
-                    })
                all_elements.extend(page_elements)
-        
+
        # Generate markdown content
        markdown_content = markdown_converter.convert(all_elements)
-        
+
        # Save markdown file
        markdown_filename = f"{base_name}.md"
        markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
        os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
-        
+
-        with open(markdown_path, 'w', encoding='utf-8') as f:
+        with open(markdown_path, "w", encoding="utf-8") as f:
            f.write(markdown_content)
-            
+
        # print(f"Combined markdown saved to: {markdown_path}")
-        
+
    except ImportError:
        print("MarkdownConverter not available, skipping markdown generation")
    except Exception as e:
        print(f"Error generating markdown: {e}")
-    
+
    # print(f"Combined JSON results saved to: {json_path}")
    return json_path
-def alb_wrapper(transform):
-    def f(im):
-        return transform(image=np.asarray(im))["image"]
-    return f
-test_transform = alb_wrapper(
-    alb.Compose(
-        [
-            alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
-            ToTensorV2(),
-        ]
-    )
-)
 def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
    # print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
    if x2 <= x1 or y2 <= y1:
@ -195,12 +168,12 @@ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
    if not abs_coord:
        if x2 > 1 or y2 > 1:
            return False, f"[{x1}, {y1}, {x2}, {y2}]"
-    elif image_size is not None: # has image size
+    elif image_size is not None:  # has image size
        if x2 > image_size[0] or y2 > image_size[1]:
            return False, f"[{x1}, {y1}, {x2}, {y2}]"
    return True, None
-    
+
 def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
    """
    Image: cv2.image object, or Path
@ -276,6 +249,7 @@ def parse_layout_string(bbox_str):
@dataclass
 class ImageDimensions:
    """Class to store image dimensions"""
    original_w: int
    original_h: int
    padded_w: int
@ -284,11 +258,11 @@ class ImageDimensions:
 def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
    """Map coordinates from padded image back to original image
-    
+
    Args:
        x1, y1, x2, y2: Coordinates in padded image
        dims: Image dimensions object
-        
+
    Returns:
        tuple: (x1, y1, x2, y2) coordinates in original image
    """
@ -296,19 +270,19 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
        # Calculate padding offsets
        top = (dims.padded_h - dims.original_h) // 2
        left = (dims.padded_w - dims.original_w) // 2
-        
+
        # Map back to original coordinates
        orig_x1 = max(0, x1 - left)
        orig_y1 = max(0, y1 - top)
        orig_x2 = min(dims.original_w, x2 - left)
        orig_y2 = min(dims.original_h, y2 - top)
-        
+
        # Ensure we have a valid box (width and height > 0)
        if orig_x2 <= orig_x1:
            orig_x2 = min(orig_x1 + 1, dims.original_w)
        if orig_y2 <= orig_y1:
            orig_y2 = min(orig_y1 + 1, dims.original_h)
-            
+
        return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
    except Exception as e:
        print(f"map_to_original_coordinates error: {str(e)}")
@ -318,12 +292,17 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
 def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
    """
-        From absolute coordinates to relevant coordinates
+    From absolute coordinates to relevant coordinates
-        e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
+    e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
    """
    try:
        x1, y1, x2, y2 = abs_coords
-        return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
+        return (
+            round(x1 / dims.original_w, 3),
+            round(y1 / dims.original_h, 3),
+            round(x2 / dims.original_w, 3),
+            round(y2 / dims.original_h, 3),
+        )
    except Exception as e:
        print(f"map_to_relevant_coordinates error: {str(e)}")
        return 0.0, 0.0, 1.0, 1.0  # Return full image coordinates
@ -331,13 +310,13 @@ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
 def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
    """Process and adjust coordinates
-    
+
    Args:
        coords: Normalized coordinates [x1, y1, x2, y2]
        padded_image: Padded image
        dims: Image dimensions object
        previous_box: Previous box coordinates for overlap adjustment
-    
+
    Returns:
        tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
    """
@ -345,35 +324,35 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
        # Convert normalized coordinates to absolute coordinates
        x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
        x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
-        
+
        # Ensure coordinates are within image bounds before adjustment
        x1 = max(0, min(x1, dims.padded_w - 1))
        y1 = max(0, min(y1, dims.padded_h - 1))
        x2 = max(0, min(x2, dims.padded_w))
        y2 = max(0, min(y2, dims.padded_h))
-        
+
        # Ensure width and height are at least 1 pixel
        if x2 <= x1:
            x2 = min(x1 + 1, dims.padded_w)
        if y2 <= y1:
            y2 = min(y1 + 1, dims.padded_h)
-        
+
        # Extend box boundaries
        new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
        x1, y1, x2, y2 = new_boxes[0]
-        
+
        # Ensure coordinates are still within image bounds after adjustment
        x1 = max(0, min(x1, dims.padded_w - 1))
        y1 = max(0, min(y1, dims.padded_h - 1))
        x2 = max(0, min(x2, dims.padded_w))
        y2 = max(0, min(y2, dims.padded_h))
-        
+
        # Ensure width and height are at least 1 pixel after adjustment
        if x2 <= x1:
            x2 = min(x1 + 1, dims.padded_w)
        if y2 <= y1:
            y2 = min(y1 + 1, dims.padded_h)
-        
+
        # Check for overlap with previous box and adjust
        if previous_box is not None:
            prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
@ -384,15 +363,13 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
                # Make sure y2 is still greater than y1
                if y2 <= y1:
                    y2 = min(y1 + 1, dims.padded_h)
-        
+
        # Update previous box
        new_previous_box = [x1, y1, x2, y2]
        # Map to original coordinates
-        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
+        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims)
-            x1, y1, x2, y2, dims
+
-        )
        return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
    except Exception as e:
        print(f"process_coordinates error: {str(e)}")
@ -403,10 +380,10 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
 def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
    """Load and prepare image with padding while maintaining aspect ratio
-    
+
    Args:
        image: PIL image
-        
+
    Returns:
        tuple: (padded_image, image_dimensions)
    """
@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
        right = max_size - original_w - left
        # Apply padding
-        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
+        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
-                                cv2.BORDER_CONSTANT, value=(0, 0, 0))
        padded_h, padded_w = padded_image.shape[:2]
-        
+
-        dimensions = ImageDimensions(
+        dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
-            original_w=original_w,
+
-            original_h=original_h,
-            padded_w=padded_w,
-            padded_h=padded_h
-        )
        return padded_image, dimensions
    except Exception as e:
        print(f"prepare_image error: {str(e)}")
        # Create a minimal valid image and dimensions
        h, w = image.height, image.width
-        dimensions = ImageDimensions(
+        dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
-            original_w=w,
-            original_h=h,
-            padded_w=w,
-            padded_h=h
-        )
        # Return a black image of the same size
        return np.zeros((h, w, 3), dtype=np.uint8), dimensions
@ -484,7 +450,7 @@ def crop_margin(img: Image.Image) -> Image.Image:
        if width == 0 or height == 0:
            print("Warning: Image has zero width or height")
            return img
-            
+
        data = np.array(img.convert("L"))
        data = data.astype(np.uint8)
        max_val = data.max()
@ -498,13 +464,13 @@ def crop_margin(img: Image.Image) -> Image.Image:
        if coords is None:
            return img
        a, b, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
-        
+
        # Ensure crop coordinates are within image bounds
        a = max(0, a)
        b = max(0, b)
        w = min(w, width - a)
        h = min(h, height - b)
-        
+
        # Only crop if we have a valid region
        if w > 0 and h > 0:
            return img.crop((a, b, a + w, b + h))