remove 'albumentations'

yingdong.han 2025-06-26 19:45:12 +08:00
parent 98b8ccc38d
commit 4edac82fc3
3 changed files with 84 additions and 112 deletions

View File

@@ -1,4 +1,3 @@
-albumentations==1.4.0
 numpy==1.24.4
 omegaconf==2.3.0
 opencv-python==4.11.0.86

View File

@@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT
 import numpy as np
 import torch
 from PIL import ImageOps
+from torchvision import transforms
+from torchvision.transforms.functional import resize
 
-from utils.utils import *
+IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
 
 
 class DolphinProcessor:
@@ -34,6 +37,10 @@ class DolphinProcessor:
         self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
         self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)
+
+        self.transform = transforms.Compose(
+            [transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
+        )
 
     def process_prompt_for_inference(self, prompt):
         prompt = prompt.replace("<image>\n", "")
         if not prompt.startswith("<s>"):
@@ -60,5 +67,5 @@ class DolphinProcessor:
         )
         image = ImageOps.expand(image, padding)
         if return_img_size:
-            return test_transform(image).unsqueeze(0), (origin_w, origin_h)
-        return test_transform(image).unsqueeze(0)
+            return self.transform(image).unsqueeze(0), (origin_w, origin_h)
+        return self.transform(image).unsqueeze(0)
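The new torchvision pipeline added here should produce the same tensors as the albumentations-based test_transform deleted in the third file below: alb.Normalize divides by max_pixel_value=255 before applying mean/std, which matches transforms.ToTensor's [0, 1] scaling followed by transforms.Normalize. A minimal sanity-check sketch, not part of the commit — the random test image and tolerance are placeholders, and albumentations must still be installed to run it even though this commit drops it from requirements:

import numpy as np
import torch
import albumentations as alb
from albumentations.pytorch import ToTensorV2
from PIL import Image
from torchvision import transforms

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)

# Old pipeline (removed): Normalize divides by 255 internally, ToTensorV2 converts HWC -> CHW.
old_transform = alb.Compose([alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD), ToTensorV2()])

# New pipeline (added above): ToTensor scales to [0, 1] and converts to CHW, then Normalize.
new_transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
)

image = Image.fromarray(np.random.randint(0, 256, (896, 896, 3), dtype=np.uint8))
old_tensor = old_transform(image=np.asarray(image))["image"]
new_tensor = new_transform(image)
print(torch.allclose(old_tensor, new_tensor, atol=1e-5))  # expected: True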

View File

@@ -4,21 +4,17 @@ SPDX-License-Identifier: MIT
 """
 
 import copy
+import io
 import json
 import os
-import io
 import re
 from dataclasses import dataclass
 from typing import List, Tuple
 
-import albumentations as alb
 import cv2
 import numpy as np
-from albumentations.pytorch import ToTensorV2
 import pymupdf
 from PIL import Image
-from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
-from torchvision.transforms.functional import resize
 
 from utils.markdown_utils import MarkdownConverter
@@ -97,7 +93,7 @@ def convert_pdf_to_images(pdf_path, target_size=896):
 def is_pdf_file(file_path):
     """Check if file is a PDF"""
-    return file_path.lower().endswith('.pdf')
+    return file_path.lower().endswith(".pdf")
 
 
 def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
@@ -115,18 +111,14 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
     base_name = os.path.splitext(os.path.basename(pdf_path))[0]
 
     # Prepare combined results
-    combined_results = {
-        "source_file": pdf_path,
-        "total_pages": len(all_page_results),
-        "pages": all_page_results
-    }
+    combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results}
 
     # Save combined JSON results
     json_filename = f"{base_name}.json"
     json_path = os.path.join(save_dir, "recognition_json", json_filename)
     os.makedirs(os.path.dirname(json_path), exist_ok=True)
-    with open(json_path, 'w', encoding='utf-8') as f:
+    with open(json_path, "w", encoding="utf-8") as f:
         json.dump(combined_results, f, indent=2, ensure_ascii=False)
 
     # Generate and save combined markdown
@@ -140,11 +132,9 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
             if page_elements:
                 # Add page separator if not the first page
                 if all_elements:
-                    all_elements.append({
-                        "label": "page_separator",
-                        "text": f"\n\n---\n\n",
-                        "reading_order": len(all_elements)
-                    })
+                    all_elements.append(
+                        {"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)}
+                    )
                 all_elements.extend(page_elements)
 
         # Generate markdown content
@@ -155,7 +145,7 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
         markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
         os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
-        with open(markdown_path, 'w', encoding='utf-8') as f:
+        with open(markdown_path, "w", encoding="utf-8") as f:
             f.write(markdown_content)
 
         # print(f"Combined markdown saved to: {markdown_path}")
@@ -169,23 +159,6 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
     return json_path
 
 
-def alb_wrapper(transform):
-    def f(im):
-        return transform(image=np.asarray(im))["image"]
-
-    return f
-
-
-test_transform = alb_wrapper(
-    alb.Compose(
-        [
-            alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
-            ToTensorV2(),
-        ]
-    )
-)
-
-
 def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
     # print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
     if x2 <= x1 or y2 <= y1:
@@ -276,6 +249,7 @@ def parse_layout_string(bbox_str):
 @dataclass
 class ImageDimensions:
     """Class to store image dimensions"""
+
     original_w: int
     original_h: int
     padded_w: int
@@ -323,7 +297,12 @@ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
     """
     try:
         x1, y1, x2, y2 = abs_coords
-        return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
+        return (
+            round(x1 / dims.original_w, 3),
+            round(y1 / dims.original_h, 3),
+            round(x2 / dims.original_w, 3),
+            round(y2 / dims.original_h, 3),
+        )
     except Exception as e:
         print(f"map_to_relevant_coordinates error: {str(e)}")
         return 0.0, 0.0, 1.0, 1.0  # Return full image coordinates
@@ -389,9 +368,7 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box
         new_previous_box = [x1, y1, x2, y2]
 
         # Map to original coordinates
-        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
-            x1, y1, x2, y2, dims
-        )
+        orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims)
 
         return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
     except Exception as e:
@@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
         right = max_size - original_w - left
 
         # Apply padding
-        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
-                                          cv2.BORDER_CONSTANT, value=(0, 0, 0))
+        padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
 
         padded_h, padded_w = padded_image.shape[:2]
 
-        dimensions = ImageDimensions(
-            original_w=original_w,
-            original_h=original_h,
-            padded_w=padded_w,
-            padded_h=padded_h
-        )
+        dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
 
         return padded_image, dimensions
     except Exception as e:
         print(f"prepare_image error: {str(e)}")
         # Create a minimal valid image and dimensions
         h, w = image.height, image.width
-        dimensions = ImageDimensions(
-            original_w=w,
-            original_h=h,
-            padded_w=w,
-            padded_h=h
-        )
+        dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
 
         # Return a black image of the same size
         return np.zeros((h, w, 3), dtype=np.uint8), dimensions
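For reference, a minimal sketch of the padding arithmetic in prepare_image as it stands after this reformatting-only change. The top and left offsets are assumed to use the usual (max_size - dim) // 2 centering, since those lines fall outside the hunk shown above; the 1000x800 test image is a placeholder.

import cv2
import numpy as np

image = np.zeros((800, 1000, 3), dtype=np.uint8)  # H=800, W=1000 placeholder page
original_h, original_w = image.shape[:2]
max_size = max(original_h, original_w)           # 1000

top = (max_size - original_h) // 2               # 100 (assumed centering, not shown in the hunk)
bottom = max_size - original_h - top             # 100
left = (max_size - original_w) // 2              # 0
right = max_size - original_w - left             # 0 (matches the context line kept above)

padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
padded_h, padded_w = padded_image.shape[:2]
assert (padded_h, padded_w) == (1000, 1000)      # square canvas, content centered, black borders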