Merge pull request #90 from hanyd2010/feat/remove_albumentations

remove 'albumentations'
This commit is contained in:
Hao Feng 2025-06-26 20:01:35 +08:00 committed by GitHub
commit 0e4ead6717
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 84 additions and 112 deletions

View File

@ -1,4 +1,3 @@
albumentations==1.4.0
numpy==1.24.4
omegaconf==2.3.0
opencv-python==4.11.0.86

View File

@ -1,4 +1,4 @@
"""
"""
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
SPDX-License-Identifier: MIT
"""
@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT
import numpy as np
import torch
from PIL import ImageOps
from torchvision import transforms
from torchvision.transforms.functional import resize
from utils.utils import *
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
class DolphinProcessor:
@ -34,6 +37,10 @@ class DolphinProcessor:
self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)
self.transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
)
def process_prompt_for_inference(self, prompt):
prompt = prompt.replace("<image>\n", "")
if not prompt.startswith("<s>"):
@ -60,5 +67,5 @@ class DolphinProcessor:
)
image = ImageOps.expand(image, padding)
if return_img_size:
return test_transform(image).unsqueeze(0), (origin_w, origin_h)
return test_transform(image).unsqueeze(0)
return self.transform(image).unsqueeze(0), (origin_w, origin_h)
return self.transform(image).unsqueeze(0)

View File

@ -1,37 +1,33 @@
"""
"""
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
SPDX-License-Identifier: MIT
"""
import copy
import io
import json
import os
import io
import re
from dataclasses import dataclass
from typing import List, Tuple
import albumentations as alb
import cv2
import numpy as np
from albumentations.pytorch import ToTensorV2
import pymupdf
from PIL import Image
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torchvision.transforms.functional import resize
from utils.markdown_utils import MarkdownConverter
def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
"""Save cropped figure to local file system
Args:
pil_crop: PIL Image object of the cropped figure
save_dir: Base directory to save results
image_name: Name of the source image/document
reading_order: Reading order of the figure in the document
Returns:
str: Filename of the saved figure
"""
@ -39,17 +35,17 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
# Create figures directory if it doesn't exist
figures_dir = os.path.join(save_dir, "markdown", "figures")
# os.makedirs(figures_dir, exist_ok=True)
# Generate figure filename
figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
figure_path = os.path.join(figures_dir, figure_filename)
# Save the figure
pil_crop.save(figure_path, format="PNG", quality=95)
# print(f"Saved figure: {figure_filename}")
return figure_filename
except Exception as e:
print(f"Error saving figure: {str(e)}")
# Return a fallback filename
@ -58,38 +54,38 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
def convert_pdf_to_images(pdf_path, target_size=896):
    """Convert PDF pages to images

    Every page is rendered so that its longest side is ``target_size`` pixels,
    encoded as PNG in memory and re-opened as a PIL image.

    Args:
        pdf_path: Path to PDF file
        target_size: Target size for the longest dimension

    Returns:
        List of PIL Images, one per page. An empty list is returned if the
        document cannot be opened or any page fails to render (the error is
        printed, not raised).
    """
    images = []

    try:
        doc = pymupdf.open(pdf_path)
        try:
            for page in doc:
                # Calculate scale to make longest dimension equal to target_size
                rect = page.rect
                scale = target_size / max(rect.width, rect.height)

                # Render page as image
                pix = page.get_pixmap(matrix=pymupdf.Matrix(scale, scale))

                # Convert to PIL Image via an in-memory PNG
                png_bytes = pix.tobytes("png")
                images.append(Image.open(io.BytesIO(png_bytes)))
        finally:
            # Release the document handle even when a page fails to render;
            # previously close() was skipped on any mid-loop exception.
            doc.close()

        print(f"Successfully converted {len(images)} pages from PDF")
        return images

    except Exception as e:
        # Best-effort contract: callers get an empty list instead of an exception.
        print(f"Error converting PDF to images: {str(e)}")
        return []
@ -97,42 +93,38 @@ def convert_pdf_to_images(pdf_path, target_size=896):
def is_pdf_file(file_path):
    """Check if file is a PDF

    The check is purely name-based: it is true when the path ends with
    ``.pdf``, compared case-insensitively. The file is never opened.

    Args:
        file_path: Path as a ``str`` or any ``os.PathLike`` (e.g. ``pathlib.Path``)

    Returns:
        bool: True if the path has a ``.pdf`` extension
    """
    # os.fspath leaves str untouched and unwraps PathLike objects, so both
    # plain strings and pathlib paths are accepted.
    return os.fspath(file_path).lower().endswith(".pdf")
def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
"""Save combined results for multi-page PDF with both JSON and Markdown
Args:
all_page_results: List of results for all pages
pdf_path: Path to original PDF file
save_dir: Directory to save results
Returns:
Path to saved combined JSON file
"""
# Create output filename based on PDF name
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
# Prepare combined results
combined_results = {
"source_file": pdf_path,
"total_pages": len(all_page_results),
"pages": all_page_results
}
combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results}
# Save combined JSON results
json_filename = f"{base_name}.json"
json_path = os.path.join(save_dir, "recognition_json", json_filename)
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, 'w', encoding='utf-8') as f:
with open(json_path, "w", encoding="utf-8") as f:
json.dump(combined_results, f, indent=2, ensure_ascii=False)
# Generate and save combined markdown
try:
markdown_converter = MarkdownConverter()
# Combine all page results into a single list for markdown conversion
all_elements = []
for page_data in all_page_results:
@ -140,52 +132,33 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
if page_elements:
# Add page separator if not the first page
if all_elements:
all_elements.append({
"label": "page_separator",
"text": f"\n\n---\n\n",
"reading_order": len(all_elements)
})
all_elements.append(
{"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)}
)
all_elements.extend(page_elements)
# Generate markdown content
markdown_content = markdown_converter.convert(all_elements)
# Save markdown file
markdown_filename = f"{base_name}.md"
markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
with open(markdown_path, 'w', encoding='utf-8') as f:
with open(markdown_path, "w", encoding="utf-8") as f:
f.write(markdown_content)
# print(f"Combined markdown saved to: {markdown_path}")
except ImportError:
print("MarkdownConverter not available, skipping markdown generation")
except Exception as e:
print(f"Error generating markdown: {e}")
# print(f"Combined JSON results saved to: {json_path}")
return json_path
def alb_wrapper(transform):
def f(im):
return transform(image=np.asarray(im))["image"]
return f
test_transform = alb_wrapper(
alb.Compose(
[
alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
ToTensorV2(),
]
)
)
def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
# print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
if x2 <= x1 or y2 <= y1:
@ -195,12 +168,12 @@ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
if not abs_coord:
if x2 > 1 or y2 > 1:
return False, f"[{x1}, {y1}, {x2}, {y2}]"
elif image_size is not None: # has image size
elif image_size is not None: # has image size
if x2 > image_size[0] or y2 > image_size[1]:
return False, f"[{x1}, {y1}, {x2}, {y2}]"
return True, None
def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
"""
Image: cv2.image object, or Path
@ -276,6 +249,7 @@ def parse_layout_string(bbox_str):
@dataclass
class ImageDimensions:
"""Class to store image dimensions"""
original_w: int
original_h: int
padded_w: int
@ -284,11 +258,11 @@ class ImageDimensions:
def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
"""Map coordinates from padded image back to original image
Args:
x1, y1, x2, y2: Coordinates in padded image
dims: Image dimensions object
Returns:
tuple: (x1, y1, x2, y2) coordinates in original image
"""
@ -296,19 +270,19 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
# Calculate padding offsets
top = (dims.padded_h - dims.original_h) // 2
left = (dims.padded_w - dims.original_w) // 2
# Map back to original coordinates
orig_x1 = max(0, x1 - left)
orig_y1 = max(0, y1 - top)
orig_x2 = min(dims.original_w, x2 - left)
orig_y2 = min(dims.original_h, y2 - top)
# Ensure we have a valid box (width and height > 0)
if orig_x2 <= orig_x1:
orig_x2 = min(orig_x1 + 1, dims.original_w)
if orig_y2 <= orig_y1:
orig_y2 = min(orig_y1 + 1, dims.original_h)
return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
except Exception as e:
print(f"map_to_original_coordinates error: {str(e)}")
@ -318,12 +292,17 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
def map_to_relevant_coordinates(abs_coords, dims: "ImageDimensions"):
    """
    Convert absolute pixel coordinates to relative (normalized) coordinates.

    x values are divided by ``dims.original_w`` and y values by
    ``dims.original_h``; each result is rounded to 3 decimal places.

    e.g. with original_w=1000 and original_h=500:
        [100, 100, 200, 200] -> (0.1, 0.2, 0.2, 0.4)

    Args:
        abs_coords: Sequence of four numbers (x1, y1, x2, y2)
        dims: Object exposing ``original_w`` and ``original_h``

    Returns:
        tuple: (x1, y1, x2, y2) as fractions of the original image size.
        On any error (wrong number of coordinates, zero-sized image, ...) the
        error is printed and the full-image box (0.0, 0.0, 1.0, 1.0) is returned.
    """
    try:
        x1, y1, x2, y2 = abs_coords
        return (
            round(x1 / dims.original_w, 3),
            round(y1 / dims.original_h, 3),
            round(x2 / dims.original_w, 3),
            round(y2 / dims.original_h, 3),
        )
    except Exception as e:
        print(f"map_to_relevant_coordinates error: {str(e)}")
        return 0.0, 0.0, 1.0, 1.0  # Return full image coordinates
@ -331,13 +310,13 @@ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
"""Process and adjust coordinates
Args:
coords: Normalized coordinates [x1, y1, x2, y2]
padded_image: Padded image
dims: Image dimensions object
previous_box: Previous box coordinates for overlap adjustment
Returns:
tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
"""
@ -345,35 +324,35 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
# Convert normalized coordinates to absolute coordinates
x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
# Ensure coordinates are within image bounds before adjustment
x1 = max(0, min(x1, dims.padded_w - 1))
y1 = max(0, min(y1, dims.padded_h - 1))
x2 = max(0, min(x2, dims.padded_w))
y2 = max(0, min(y2, dims.padded_h))
# Ensure width and height are at least 1 pixel
if x2 <= x1:
x2 = min(x1 + 1, dims.padded_w)
if y2 <= y1:
y2 = min(y1 + 1, dims.padded_h)
# Extend box boundaries
new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
x1, y1, x2, y2 = new_boxes[0]
# Ensure coordinates are still within image bounds after adjustment
x1 = max(0, min(x1, dims.padded_w - 1))
y1 = max(0, min(y1, dims.padded_h - 1))
x2 = max(0, min(x2, dims.padded_w))
y2 = max(0, min(y2, dims.padded_h))
# Ensure width and height are at least 1 pixel after adjustment
if x2 <= x1:
x2 = min(x1 + 1, dims.padded_w)
if y2 <= y1:
y2 = min(y1 + 1, dims.padded_h)
# Check for overlap with previous box and adjust
if previous_box is not None:
prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
@ -384,15 +363,13 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
# Make sure y2 is still greater than y1
if y2 <= y1:
y2 = min(y1 + 1, dims.padded_h)
# Update previous box
new_previous_box = [x1, y1, x2, y2]
# Map to original coordinates
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
x1, y1, x2, y2, dims
)
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims)
return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
except Exception as e:
print(f"process_coordinates error: {str(e)}")
@ -403,10 +380,10 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
"""Load and prepare image with padding while maintaining aspect ratio
Args:
image: PIL image
Returns:
tuple: (padded_image, image_dimensions)
"""
@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
right = max_size - original_w - left
# Apply padding
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
cv2.BORDER_CONSTANT, value=(0, 0, 0))
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
padded_h, padded_w = padded_image.shape[:2]
dimensions = ImageDimensions(
original_w=original_w,
original_h=original_h,
padded_w=padded_w,
padded_h=padded_h
)
dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
return padded_image, dimensions
except Exception as e:
print(f"prepare_image error: {str(e)}")
# Create a minimal valid image and dimensions
h, w = image.height, image.width
dimensions = ImageDimensions(
original_w=w,
original_h=h,
padded_w=w,
padded_h=h
)
dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
# Return a black image of the same size
return np.zeros((h, w, 3), dtype=np.uint8), dimensions
@ -484,7 +450,7 @@ def crop_margin(img: Image.Image) -> Image.Image:
if width == 0 or height == 0:
print("Warning: Image has zero width or height")
return img
data = np.array(img.convert("L"))
data = data.astype(np.uint8)
max_val = data.max()
@ -498,13 +464,13 @@ def crop_margin(img: Image.Image) -> Image.Image:
if coords is None:
return img
a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
# Ensure crop coordinates are within image bounds
a = max(0, a)
b = max(0, b)
w = min(w, width - a)
h = min(h, height - b)
# Only crop if we have a valid region
if w > 0 and h > 0:
return img.crop((a, b, a + w, b + h))