Remove the 'albumentations' dependency (use torchvision transforms instead)

This commit is contained in:
yingdong.han 2025-06-26 19:45:12 +08:00
parent 98b8ccc38d
commit 4edac82fc3
3 changed files with 84 additions and 112 deletions

View File

@ -1,4 +1,3 @@
albumentations==1.4.0
numpy==1.24.4
omegaconf==2.3.0
opencv-python==4.11.0.86

View File

@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT
import numpy as np
import torch
from PIL import ImageOps
from torchvision import transforms
from torchvision.transforms.functional import resize
from utils.utils import *
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
class DolphinProcessor:
@ -34,6 +37,10 @@ class DolphinProcessor:
self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)
self.transform = transforms.Compose(
[transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
)
def process_prompt_for_inference(self, prompt):
prompt = prompt.replace("<image>\n", "")
if not prompt.startswith("<s>"):
@ -60,5 +67,5 @@ class DolphinProcessor:
)
image = ImageOps.expand(image, padding)
if return_img_size:
return test_transform(image).unsqueeze(0), (origin_w, origin_h)
return test_transform(image).unsqueeze(0)
return self.transform(image).unsqueeze(0), (origin_w, origin_h)
return self.transform(image).unsqueeze(0)

View File

@ -4,21 +4,17 @@ SPDX-License-Identifier: MIT
"""
import copy
import io
import json
import os
import io
import re
from dataclasses import dataclass
from typing import List, Tuple
import albumentations as alb
import cv2
import numpy as np
from albumentations.pytorch import ToTensorV2
import pymupdf
from PIL import Image
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torchvision.transforms.functional import resize
from utils.markdown_utils import MarkdownConverter
@ -97,7 +93,7 @@ def convert_pdf_to_images(pdf_path, target_size=896):
def is_pdf_file(file_path):
"""Check if file is a PDF"""
return file_path.lower().endswith('.pdf')
return file_path.lower().endswith(".pdf")
def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
@ -115,18 +111,14 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
# Prepare combined results
combined_results = {
"source_file": pdf_path,
"total_pages": len(all_page_results),
"pages": all_page_results
}
combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results}
# Save combined JSON results
json_filename = f"{base_name}.json"
json_path = os.path.join(save_dir, "recognition_json", json_filename)
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, 'w', encoding='utf-8') as f:
with open(json_path, "w", encoding="utf-8") as f:
json.dump(combined_results, f, indent=2, ensure_ascii=False)
# Generate and save combined markdown
@ -140,11 +132,9 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
if page_elements:
# Add page separator if not the first page
if all_elements:
all_elements.append({
"label": "page_separator",
"text": f"\n\n---\n\n",
"reading_order": len(all_elements)
})
all_elements.append(
{"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)}
)
all_elements.extend(page_elements)
# Generate markdown content
@ -155,7 +145,7 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
with open(markdown_path, 'w', encoding='utf-8') as f:
with open(markdown_path, "w", encoding="utf-8") as f:
f.write(markdown_content)
# print(f"Combined markdown saved to: {markdown_path}")
@ -169,23 +159,6 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
return json_path
def alb_wrapper(transform):
def f(im):
return transform(image=np.asarray(im))["image"]
return f
test_transform = alb_wrapper(
alb.Compose(
[
alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
ToTensorV2(),
]
)
)
def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
# print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
if x2 <= x1 or y2 <= y1:
@ -195,7 +168,7 @@ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
if not abs_coord:
if x2 > 1 or y2 > 1:
return False, f"[{x1}, {y1}, {x2}, {y2}]"
elif image_size is not None: # has image size
elif image_size is not None: # has image size
if x2 > image_size[0] or y2 > image_size[1]:
return False, f"[{x1}, {y1}, {x2}, {y2}]"
return True, None
@ -276,6 +249,7 @@ def parse_layout_string(bbox_str):
@dataclass
class ImageDimensions:
"""Class to store image dimensions"""
original_w: int
original_h: int
padded_w: int
@ -318,12 +292,17 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
"""
From absolute coordinates to relevant coordinates
e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
From absolute coordinates to relevant coordinates
e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
"""
try:
x1, y1, x2, y2 = abs_coords
return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
return (
round(x1 / dims.original_w, 3),
round(y1 / dims.original_h, 3),
round(x2 / dims.original_w, 3),
round(y2 / dims.original_h, 3),
)
except Exception as e:
print(f"map_to_relevant_coordinates error: {str(e)}")
return 0.0, 0.0, 1.0, 1.0 # Return full image coordinates
@ -389,9 +368,7 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
new_previous_box = [x1, y1, x2, y2]
# Map to original coordinates
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
x1, y1, x2, y2, dims
)
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims)
return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
except Exception as e:
@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
right = max_size - original_w - left
# Apply padding
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
cv2.BORDER_CONSTANT, value=(0, 0, 0))
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
padded_h, padded_w = padded_image.shape[:2]
dimensions = ImageDimensions(
original_w=original_w,
original_h=original_h,
padded_w=padded_w,
padded_h=padded_h
)
dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
return padded_image, dimensions
except Exception as e:
print(f"prepare_image error: {str(e)}")
# Create a minimal valid image and dimensions
h, w = image.height, image.width
dimensions = ImageDimensions(
original_w=w,
original_h=h,
padded_w=w,
padded_h=h
)
dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
# Return a black image of the same size
return np.zeros((h, w, 3), dtype=np.uint8), dimensions