remove 'albumentations'
This commit is contained in:
parent
98b8ccc38d
commit
4edac82fc3
@ -1,4 +1,3 @@
|
|||||||
albumentations==1.4.0
|
|
||||||
numpy==1.24.4
|
numpy==1.24.4
|
||||||
omegaconf==2.3.0
|
omegaconf==2.3.0
|
||||||
opencv-python==4.11.0.86
|
opencv-python==4.11.0.86
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""
|
"""
|
||||||
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||||
SPDX-License-Identifier: MIT
|
SPDX-License-Identifier: MIT
|
||||||
"""
|
"""
|
||||||
@ -6,8 +6,11 @@ SPDX-License-Identifier: MIT
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from PIL import ImageOps
|
from PIL import ImageOps
|
||||||
|
from torchvision import transforms
|
||||||
|
from torchvision.transforms.functional import resize
|
||||||
|
|
||||||
from utils.utils import *
|
IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
|
||||||
|
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
|
||||||
|
|
||||||
|
|
||||||
class DolphinProcessor:
|
class DolphinProcessor:
|
||||||
@ -34,6 +37,10 @@ class DolphinProcessor:
|
|||||||
self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
|
self.prefix_answer_space_flag = dp_config.get("prefix_answer_space_flag", True)
|
||||||
self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)
|
self.suffix_prompt_space_flag = dp_config.get("suffix_prompt_space_flag", True)
|
||||||
|
|
||||||
|
self.transform = transforms.Compose(
|
||||||
|
[transforms.ToTensor(), transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD)]
|
||||||
|
)
|
||||||
|
|
||||||
def process_prompt_for_inference(self, prompt):
|
def process_prompt_for_inference(self, prompt):
|
||||||
prompt = prompt.replace("<image>\n", "")
|
prompt = prompt.replace("<image>\n", "")
|
||||||
if not prompt.startswith("<s>"):
|
if not prompt.startswith("<s>"):
|
||||||
@ -60,5 +67,5 @@ class DolphinProcessor:
|
|||||||
)
|
)
|
||||||
image = ImageOps.expand(image, padding)
|
image = ImageOps.expand(image, padding)
|
||||||
if return_img_size:
|
if return_img_size:
|
||||||
return test_transform(image).unsqueeze(0), (origin_w, origin_h)
|
return self.transform(image).unsqueeze(0), (origin_w, origin_h)
|
||||||
return test_transform(image).unsqueeze(0)
|
return self.transform(image).unsqueeze(0)
|
||||||
|
180
utils/utils.py
180
utils/utils.py
@ -1,37 +1,33 @@
|
|||||||
"""
|
"""
|
||||||
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
|
||||||
SPDX-License-Identifier: MIT
|
SPDX-License-Identifier: MIT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import io
|
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
import albumentations as alb
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from albumentations.pytorch import ToTensorV2
|
|
||||||
import pymupdf
|
import pymupdf
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
|
|
||||||
from torchvision.transforms.functional import resize
|
|
||||||
|
|
||||||
from utils.markdown_utils import MarkdownConverter
|
from utils.markdown_utils import MarkdownConverter
|
||||||
|
|
||||||
|
|
||||||
def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
|
def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
|
||||||
"""Save cropped figure to local file system
|
"""Save cropped figure to local file system
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pil_crop: PIL Image object of the cropped figure
|
pil_crop: PIL Image object of the cropped figure
|
||||||
save_dir: Base directory to save results
|
save_dir: Base directory to save results
|
||||||
image_name: Name of the source image/document
|
image_name: Name of the source image/document
|
||||||
reading_order: Reading order of the figure in the document
|
reading_order: Reading order of the figure in the document
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: Filename of the saved figure
|
str: Filename of the saved figure
|
||||||
"""
|
"""
|
||||||
@ -39,17 +35,17 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
|
|||||||
# Create figures directory if it doesn't exist
|
# Create figures directory if it doesn't exist
|
||||||
figures_dir = os.path.join(save_dir, "markdown", "figures")
|
figures_dir = os.path.join(save_dir, "markdown", "figures")
|
||||||
# os.makedirs(figures_dir, exist_ok=True)
|
# os.makedirs(figures_dir, exist_ok=True)
|
||||||
|
|
||||||
# Generate figure filename
|
# Generate figure filename
|
||||||
figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
|
figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
|
||||||
figure_path = os.path.join(figures_dir, figure_filename)
|
figure_path = os.path.join(figures_dir, figure_filename)
|
||||||
|
|
||||||
# Save the figure
|
# Save the figure
|
||||||
pil_crop.save(figure_path, format="PNG", quality=95)
|
pil_crop.save(figure_path, format="PNG", quality=95)
|
||||||
|
|
||||||
# print(f"Saved figure: {figure_filename}")
|
# print(f"Saved figure: {figure_filename}")
|
||||||
return figure_filename
|
return figure_filename
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error saving figure: {str(e)}")
|
print(f"Error saving figure: {str(e)}")
|
||||||
# Return a fallback filename
|
# Return a fallback filename
|
||||||
@ -58,38 +54,38 @@ def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
|
|||||||
|
|
||||||
def convert_pdf_to_images(pdf_path, target_size=896):
|
def convert_pdf_to_images(pdf_path, target_size=896):
|
||||||
"""Convert PDF pages to images
|
"""Convert PDF pages to images
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path: Path to PDF file
|
pdf_path: Path to PDF file
|
||||||
target_size: Target size for the longest dimension
|
target_size: Target size for the longest dimension
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of PIL Images
|
List of PIL Images
|
||||||
"""
|
"""
|
||||||
images = []
|
images = []
|
||||||
try:
|
try:
|
||||||
doc = pymupdf.open(pdf_path)
|
doc = pymupdf.open(pdf_path)
|
||||||
|
|
||||||
for page_num in range(len(doc)):
|
for page_num in range(len(doc)):
|
||||||
page = doc[page_num]
|
page = doc[page_num]
|
||||||
|
|
||||||
# Calculate scale to make longest dimension equal to target_size
|
# Calculate scale to make longest dimension equal to target_size
|
||||||
rect = page.rect
|
rect = page.rect
|
||||||
scale = target_size / max(rect.width, rect.height)
|
scale = target_size / max(rect.width, rect.height)
|
||||||
|
|
||||||
# Render page as image
|
# Render page as image
|
||||||
mat = pymupdf.Matrix(scale, scale)
|
mat = pymupdf.Matrix(scale, scale)
|
||||||
pix = page.get_pixmap(matrix=mat)
|
pix = page.get_pixmap(matrix=mat)
|
||||||
|
|
||||||
# Convert to PIL Image
|
# Convert to PIL Image
|
||||||
img_data = pix.tobytes("png")
|
img_data = pix.tobytes("png")
|
||||||
pil_image = Image.open(io.BytesIO(img_data))
|
pil_image = Image.open(io.BytesIO(img_data))
|
||||||
images.append(pil_image)
|
images.append(pil_image)
|
||||||
|
|
||||||
doc.close()
|
doc.close()
|
||||||
print(f"Successfully converted {len(images)} pages from PDF")
|
print(f"Successfully converted {len(images)} pages from PDF")
|
||||||
return images
|
return images
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error converting PDF to images: {str(e)}")
|
print(f"Error converting PDF to images: {str(e)}")
|
||||||
return []
|
return []
|
||||||
@ -97,42 +93,38 @@ def convert_pdf_to_images(pdf_path, target_size=896):
|
|||||||
|
|
||||||
def is_pdf_file(file_path):
|
def is_pdf_file(file_path):
|
||||||
"""Check if file is a PDF"""
|
"""Check if file is a PDF"""
|
||||||
return file_path.lower().endswith('.pdf')
|
return file_path.lower().endswith(".pdf")
|
||||||
|
|
||||||
|
|
||||||
def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
|
def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
|
||||||
"""Save combined results for multi-page PDF with both JSON and Markdown
|
"""Save combined results for multi-page PDF with both JSON and Markdown
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
all_page_results: List of results for all pages
|
all_page_results: List of results for all pages
|
||||||
pdf_path: Path to original PDF file
|
pdf_path: Path to original PDF file
|
||||||
save_dir: Directory to save results
|
save_dir: Directory to save results
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Path to saved combined JSON file
|
Path to saved combined JSON file
|
||||||
"""
|
"""
|
||||||
# Create output filename based on PDF name
|
# Create output filename based on PDF name
|
||||||
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
|
|
||||||
# Prepare combined results
|
# Prepare combined results
|
||||||
combined_results = {
|
combined_results = {"source_file": pdf_path, "total_pages": len(all_page_results), "pages": all_page_results}
|
||||||
"source_file": pdf_path,
|
|
||||||
"total_pages": len(all_page_results),
|
|
||||||
"pages": all_page_results
|
|
||||||
}
|
|
||||||
|
|
||||||
# Save combined JSON results
|
# Save combined JSON results
|
||||||
json_filename = f"{base_name}.json"
|
json_filename = f"{base_name}.json"
|
||||||
json_path = os.path.join(save_dir, "recognition_json", json_filename)
|
json_path = os.path.join(save_dir, "recognition_json", json_filename)
|
||||||
os.makedirs(os.path.dirname(json_path), exist_ok=True)
|
os.makedirs(os.path.dirname(json_path), exist_ok=True)
|
||||||
|
|
||||||
with open(json_path, 'w', encoding='utf-8') as f:
|
with open(json_path, "w", encoding="utf-8") as f:
|
||||||
json.dump(combined_results, f, indent=2, ensure_ascii=False)
|
json.dump(combined_results, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
# Generate and save combined markdown
|
# Generate and save combined markdown
|
||||||
try:
|
try:
|
||||||
markdown_converter = MarkdownConverter()
|
markdown_converter = MarkdownConverter()
|
||||||
|
|
||||||
# Combine all page results into a single list for markdown conversion
|
# Combine all page results into a single list for markdown conversion
|
||||||
all_elements = []
|
all_elements = []
|
||||||
for page_data in all_page_results:
|
for page_data in all_page_results:
|
||||||
@ -140,52 +132,33 @@ def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
|
|||||||
if page_elements:
|
if page_elements:
|
||||||
# Add page separator if not the first page
|
# Add page separator if not the first page
|
||||||
if all_elements:
|
if all_elements:
|
||||||
all_elements.append({
|
all_elements.append(
|
||||||
"label": "page_separator",
|
{"label": "page_separator", "text": f"\n\n---\n\n", "reading_order": len(all_elements)}
|
||||||
"text": f"\n\n---\n\n",
|
)
|
||||||
"reading_order": len(all_elements)
|
|
||||||
})
|
|
||||||
all_elements.extend(page_elements)
|
all_elements.extend(page_elements)
|
||||||
|
|
||||||
# Generate markdown content
|
# Generate markdown content
|
||||||
markdown_content = markdown_converter.convert(all_elements)
|
markdown_content = markdown_converter.convert(all_elements)
|
||||||
|
|
||||||
# Save markdown file
|
# Save markdown file
|
||||||
markdown_filename = f"{base_name}.md"
|
markdown_filename = f"{base_name}.md"
|
||||||
markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
|
markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
|
||||||
os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
|
os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
|
||||||
|
|
||||||
with open(markdown_path, 'w', encoding='utf-8') as f:
|
with open(markdown_path, "w", encoding="utf-8") as f:
|
||||||
f.write(markdown_content)
|
f.write(markdown_content)
|
||||||
|
|
||||||
# print(f"Combined markdown saved to: {markdown_path}")
|
# print(f"Combined markdown saved to: {markdown_path}")
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("MarkdownConverter not available, skipping markdown generation")
|
print("MarkdownConverter not available, skipping markdown generation")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error generating markdown: {e}")
|
print(f"Error generating markdown: {e}")
|
||||||
|
|
||||||
# print(f"Combined JSON results saved to: {json_path}")
|
# print(f"Combined JSON results saved to: {json_path}")
|
||||||
return json_path
|
return json_path
|
||||||
|
|
||||||
|
|
||||||
def alb_wrapper(transform):
|
|
||||||
def f(im):
|
|
||||||
return transform(image=np.asarray(im))["image"]
|
|
||||||
|
|
||||||
return f
|
|
||||||
|
|
||||||
|
|
||||||
test_transform = alb_wrapper(
|
|
||||||
alb.Compose(
|
|
||||||
[
|
|
||||||
alb.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
|
|
||||||
ToTensorV2(),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
|
def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
|
||||||
# print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
|
# print(f"check_coord_valid: {x1}, {y1}, {x2}, {y2}, {image_size}, {abs_coord}")
|
||||||
if x2 <= x1 or y2 <= y1:
|
if x2 <= x1 or y2 <= y1:
|
||||||
@ -195,12 +168,12 @@ def check_coord_valid(x1, y1, x2, y2, image_size=None, abs_coord=True):
|
|||||||
if not abs_coord:
|
if not abs_coord:
|
||||||
if x2 > 1 or y2 > 1:
|
if x2 > 1 or y2 > 1:
|
||||||
return False, f"[{x1}, {y1}, {x2}, {y2}]"
|
return False, f"[{x1}, {y1}, {x2}, {y2}]"
|
||||||
elif image_size is not None: # has image size
|
elif image_size is not None: # has image size
|
||||||
if x2 > image_size[0] or y2 > image_size[1]:
|
if x2 > image_size[0] or y2 > image_size[1]:
|
||||||
return False, f"[{x1}, {y1}, {x2}, {y2}]"
|
return False, f"[{x1}, {y1}, {x2}, {y2}]"
|
||||||
return True, None
|
return True, None
|
||||||
|
|
||||||
|
|
||||||
def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
|
def adjust_box_edges(image, boxes: List[List[float]], max_pixels=15, threshold=0.2):
|
||||||
"""
|
"""
|
||||||
Image: cv2.image object, or Path
|
Image: cv2.image object, or Path
|
||||||
@ -276,6 +249,7 @@ def parse_layout_string(bbox_str):
|
|||||||
@dataclass
|
@dataclass
|
||||||
class ImageDimensions:
|
class ImageDimensions:
|
||||||
"""Class to store image dimensions"""
|
"""Class to store image dimensions"""
|
||||||
|
|
||||||
original_w: int
|
original_w: int
|
||||||
original_h: int
|
original_h: int
|
||||||
padded_w: int
|
padded_w: int
|
||||||
@ -284,11 +258,11 @@ class ImageDimensions:
|
|||||||
|
|
||||||
def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
|
def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[int, int, int, int]:
|
||||||
"""Map coordinates from padded image back to original image
|
"""Map coordinates from padded image back to original image
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
x1, y1, x2, y2: Coordinates in padded image
|
x1, y1, x2, y2: Coordinates in padded image
|
||||||
dims: Image dimensions object
|
dims: Image dimensions object
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple: (x1, y1, x2, y2) coordinates in original image
|
tuple: (x1, y1, x2, y2) coordinates in original image
|
||||||
"""
|
"""
|
||||||
@ -296,19 +270,19 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
|
|||||||
# Calculate padding offsets
|
# Calculate padding offsets
|
||||||
top = (dims.padded_h - dims.original_h) // 2
|
top = (dims.padded_h - dims.original_h) // 2
|
||||||
left = (dims.padded_w - dims.original_w) // 2
|
left = (dims.padded_w - dims.original_w) // 2
|
||||||
|
|
||||||
# Map back to original coordinates
|
# Map back to original coordinates
|
||||||
orig_x1 = max(0, x1 - left)
|
orig_x1 = max(0, x1 - left)
|
||||||
orig_y1 = max(0, y1 - top)
|
orig_y1 = max(0, y1 - top)
|
||||||
orig_x2 = min(dims.original_w, x2 - left)
|
orig_x2 = min(dims.original_w, x2 - left)
|
||||||
orig_y2 = min(dims.original_h, y2 - top)
|
orig_y2 = min(dims.original_h, y2 - top)
|
||||||
|
|
||||||
# Ensure we have a valid box (width and height > 0)
|
# Ensure we have a valid box (width and height > 0)
|
||||||
if orig_x2 <= orig_x1:
|
if orig_x2 <= orig_x1:
|
||||||
orig_x2 = min(orig_x1 + 1, dims.original_w)
|
orig_x2 = min(orig_x1 + 1, dims.original_w)
|
||||||
if orig_y2 <= orig_y1:
|
if orig_y2 <= orig_y1:
|
||||||
orig_y2 = min(orig_y1 + 1, dims.original_h)
|
orig_y2 = min(orig_y1 + 1, dims.original_h)
|
||||||
|
|
||||||
return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
|
return int(orig_x1), int(orig_y1), int(orig_x2), int(orig_y2)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"map_to_original_coordinates error: {str(e)}")
|
print(f"map_to_original_coordinates error: {str(e)}")
|
||||||
@ -318,12 +292,17 @@ def map_to_original_coordinates(x1, y1, x2, y2, dims: ImageDimensions) -> Tuple[
|
|||||||
|
|
||||||
def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
|
def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
|
||||||
"""
|
"""
|
||||||
From absolute coordinates to relevant coordinates
|
From absolute coordinates to relevant coordinates
|
||||||
e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
|
e.g. [100, 100, 200, 200] -> [0.1, 0.2, 0.3, 0.4]
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
x1, y1, x2, y2 = abs_coords
|
x1, y1, x2, y2 = abs_coords
|
||||||
return round(x1 / dims.original_w, 3), round(y1 / dims.original_h, 3), round(x2 / dims.original_w, 3), round(y2 / dims.original_h, 3)
|
return (
|
||||||
|
round(x1 / dims.original_w, 3),
|
||||||
|
round(y1 / dims.original_h, 3),
|
||||||
|
round(x2 / dims.original_w, 3),
|
||||||
|
round(y2 / dims.original_h, 3),
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"map_to_relevant_coordinates error: {str(e)}")
|
print(f"map_to_relevant_coordinates error: {str(e)}")
|
||||||
return 0.0, 0.0, 1.0, 1.0 # Return full image coordinates
|
return 0.0, 0.0, 1.0, 1.0 # Return full image coordinates
|
||||||
@ -331,13 +310,13 @@ def map_to_relevant_coordinates(abs_coords, dims: ImageDimensions):
|
|||||||
|
|
||||||
def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
|
def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_box=None):
|
||||||
"""Process and adjust coordinates
|
"""Process and adjust coordinates
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
coords: Normalized coordinates [x1, y1, x2, y2]
|
coords: Normalized coordinates [x1, y1, x2, y2]
|
||||||
padded_image: Padded image
|
padded_image: Padded image
|
||||||
dims: Image dimensions object
|
dims: Image dimensions object
|
||||||
previous_box: Previous box coordinates for overlap adjustment
|
previous_box: Previous box coordinates for overlap adjustment
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
|
tuple: (x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box)
|
||||||
"""
|
"""
|
||||||
@ -345,35 +324,35 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
|
|||||||
# Convert normalized coordinates to absolute coordinates
|
# Convert normalized coordinates to absolute coordinates
|
||||||
x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
|
x1, y1 = int(coords[0] * dims.padded_w), int(coords[1] * dims.padded_h)
|
||||||
x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
|
x2, y2 = int(coords[2] * dims.padded_w), int(coords[3] * dims.padded_h)
|
||||||
|
|
||||||
# Ensure coordinates are within image bounds before adjustment
|
# Ensure coordinates are within image bounds before adjustment
|
||||||
x1 = max(0, min(x1, dims.padded_w - 1))
|
x1 = max(0, min(x1, dims.padded_w - 1))
|
||||||
y1 = max(0, min(y1, dims.padded_h - 1))
|
y1 = max(0, min(y1, dims.padded_h - 1))
|
||||||
x2 = max(0, min(x2, dims.padded_w))
|
x2 = max(0, min(x2, dims.padded_w))
|
||||||
y2 = max(0, min(y2, dims.padded_h))
|
y2 = max(0, min(y2, dims.padded_h))
|
||||||
|
|
||||||
# Ensure width and height are at least 1 pixel
|
# Ensure width and height are at least 1 pixel
|
||||||
if x2 <= x1:
|
if x2 <= x1:
|
||||||
x2 = min(x1 + 1, dims.padded_w)
|
x2 = min(x1 + 1, dims.padded_w)
|
||||||
if y2 <= y1:
|
if y2 <= y1:
|
||||||
y2 = min(y1 + 1, dims.padded_h)
|
y2 = min(y1 + 1, dims.padded_h)
|
||||||
|
|
||||||
# Extend box boundaries
|
# Extend box boundaries
|
||||||
new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
|
new_boxes = adjust_box_edges(padded_image, [[x1, y1, x2, y2]])
|
||||||
x1, y1, x2, y2 = new_boxes[0]
|
x1, y1, x2, y2 = new_boxes[0]
|
||||||
|
|
||||||
# Ensure coordinates are still within image bounds after adjustment
|
# Ensure coordinates are still within image bounds after adjustment
|
||||||
x1 = max(0, min(x1, dims.padded_w - 1))
|
x1 = max(0, min(x1, dims.padded_w - 1))
|
||||||
y1 = max(0, min(y1, dims.padded_h - 1))
|
y1 = max(0, min(y1, dims.padded_h - 1))
|
||||||
x2 = max(0, min(x2, dims.padded_w))
|
x2 = max(0, min(x2, dims.padded_w))
|
||||||
y2 = max(0, min(y2, dims.padded_h))
|
y2 = max(0, min(y2, dims.padded_h))
|
||||||
|
|
||||||
# Ensure width and height are at least 1 pixel after adjustment
|
# Ensure width and height are at least 1 pixel after adjustment
|
||||||
if x2 <= x1:
|
if x2 <= x1:
|
||||||
x2 = min(x1 + 1, dims.padded_w)
|
x2 = min(x1 + 1, dims.padded_w)
|
||||||
if y2 <= y1:
|
if y2 <= y1:
|
||||||
y2 = min(y1 + 1, dims.padded_h)
|
y2 = min(y1 + 1, dims.padded_h)
|
||||||
|
|
||||||
# Check for overlap with previous box and adjust
|
# Check for overlap with previous box and adjust
|
||||||
if previous_box is not None:
|
if previous_box is not None:
|
||||||
prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
|
prev_x1, prev_y1, prev_x2, prev_y2 = previous_box
|
||||||
@ -384,15 +363,13 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
|
|||||||
# Make sure y2 is still greater than y1
|
# Make sure y2 is still greater than y1
|
||||||
if y2 <= y1:
|
if y2 <= y1:
|
||||||
y2 = min(y1 + 1, dims.padded_h)
|
y2 = min(y1 + 1, dims.padded_h)
|
||||||
|
|
||||||
# Update previous box
|
# Update previous box
|
||||||
new_previous_box = [x1, y1, x2, y2]
|
new_previous_box = [x1, y1, x2, y2]
|
||||||
|
|
||||||
# Map to original coordinates
|
# Map to original coordinates
|
||||||
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(
|
orig_x1, orig_y1, orig_x2, orig_y2 = map_to_original_coordinates(x1, y1, x2, y2, dims)
|
||||||
x1, y1, x2, y2, dims
|
|
||||||
)
|
|
||||||
|
|
||||||
return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
|
return x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, new_previous_box
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"process_coordinates error: {str(e)}")
|
print(f"process_coordinates error: {str(e)}")
|
||||||
@ -403,10 +380,10 @@ def process_coordinates(coords, padded_image, dims: ImageDimensions, previous_bo
|
|||||||
|
|
||||||
def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
|
def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
|
||||||
"""Load and prepare image with padding while maintaining aspect ratio
|
"""Load and prepare image with padding while maintaining aspect ratio
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image: PIL image
|
image: PIL image
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple: (padded_image, image_dimensions)
|
tuple: (padded_image, image_dimensions)
|
||||||
"""
|
"""
|
||||||
@ -423,29 +400,18 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
|
|||||||
right = max_size - original_w - left
|
right = max_size - original_w - left
|
||||||
|
|
||||||
# Apply padding
|
# Apply padding
|
||||||
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right,
|
padded_image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(0, 0, 0))
|
||||||
cv2.BORDER_CONSTANT, value=(0, 0, 0))
|
|
||||||
|
|
||||||
padded_h, padded_w = padded_image.shape[:2]
|
padded_h, padded_w = padded_image.shape[:2]
|
||||||
|
|
||||||
dimensions = ImageDimensions(
|
dimensions = ImageDimensions(original_w=original_w, original_h=original_h, padded_w=padded_w, padded_h=padded_h)
|
||||||
original_w=original_w,
|
|
||||||
original_h=original_h,
|
|
||||||
padded_w=padded_w,
|
|
||||||
padded_h=padded_h
|
|
||||||
)
|
|
||||||
|
|
||||||
return padded_image, dimensions
|
return padded_image, dimensions
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"prepare_image error: {str(e)}")
|
print(f"prepare_image error: {str(e)}")
|
||||||
# Create a minimal valid image and dimensions
|
# Create a minimal valid image and dimensions
|
||||||
h, w = image.height, image.width
|
h, w = image.height, image.width
|
||||||
dimensions = ImageDimensions(
|
dimensions = ImageDimensions(original_w=w, original_h=h, padded_w=w, padded_h=h)
|
||||||
original_w=w,
|
|
||||||
original_h=h,
|
|
||||||
padded_w=w,
|
|
||||||
padded_h=h
|
|
||||||
)
|
|
||||||
# Return a black image of the same size
|
# Return a black image of the same size
|
||||||
return np.zeros((h, w, 3), dtype=np.uint8), dimensions
|
return np.zeros((h, w, 3), dtype=np.uint8), dimensions
|
||||||
|
|
||||||
@ -484,7 +450,7 @@ def crop_margin(img: Image.Image) -> Image.Image:
|
|||||||
if width == 0 or height == 0:
|
if width == 0 or height == 0:
|
||||||
print("Warning: Image has zero width or height")
|
print("Warning: Image has zero width or height")
|
||||||
return img
|
return img
|
||||||
|
|
||||||
data = np.array(img.convert("L"))
|
data = np.array(img.convert("L"))
|
||||||
data = data.astype(np.uint8)
|
data = data.astype(np.uint8)
|
||||||
max_val = data.max()
|
max_val = data.max()
|
||||||
@ -498,13 +464,13 @@ def crop_margin(img: Image.Image) -> Image.Image:
|
|||||||
if coords is None:
|
if coords is None:
|
||||||
return img
|
return img
|
||||||
a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
|
a, b, w, h = cv2.boundingRect(coords) # Find minimum spanning bounding box
|
||||||
|
|
||||||
# Ensure crop coordinates are within image bounds
|
# Ensure crop coordinates are within image bounds
|
||||||
a = max(0, a)
|
a = max(0, a)
|
||||||
b = max(0, b)
|
b = max(0, b)
|
||||||
w = min(w, width - a)
|
w = min(w, width - a)
|
||||||
h = min(h, height - b)
|
h = min(h, height - b)
|
||||||
|
|
||||||
# Only crop if we have a valid region
|
# Only crop if we have a valid region
|
||||||
if w > 0 and h > 0:
|
if w > 0 and h > 0:
|
||||||
return img.crop((a, b, a + w, b + h))
|
return img.crop((a, b, a + w, b + h))
|
||||||
|
Loading…
Reference in New Issue
Block a user