add pdf parsing

This commit is contained in:
parent 49f51871c6
commit 10b017a62b

README.md (23 changed lines)
@@ -9,7 +9,7 @@
   <a href="https://huggingface.co/ByteDance/Dolphin">
     <img src="https://img.shields.io/badge/HuggingFace-Dolphin-yellow">
   </a>
-  <a href="http://115.190.42.15:8888/dolphin/">
+  <a href="https://huggingface.co/spaces/ByteDance/Dolphin">
    <img src="https://img.shields.io/badge/Demo-Dolphin-blue">
   </a>
   <a href="https://github.com/bytedance/Dolphin">
@@ -50,6 +50,7 @@ Try our demo on [Demo-Dolphin](http://115.190.42.15:8888/dolphin/).
 
 ## 📅 Changelog
+- 🔥 **2025.06.13** Added multi-page PDF document parsing capability.
 - 🔥 **2025.05.21** Our demo is released at [link](http://115.190.42.15:8888/dolphin/). Check it out!
 - 🔥 **2025.05.20** The pretrained model and inference code of Dolphin are released.
 - 🔥 **2025.05.16** Our paper has been accepted by ACL 2025. Paper link: [arXiv](https://arxiv.org/abs/2505.14059).
@@ -88,7 +89,7 @@ Try our demo on [Demo-Dolphin](http://115.190.42.15:8888/dolphin/).
 ## ⚡ Inference
 
 Dolphin provides two inference frameworks with support for two parsing granularities:
-- **Page-level Parsing**: Parse the entire document image into a structured JSON and Markdown format
+- **Page-level Parsing**: Parse the entire document page into a structured JSON and Markdown format
 - **Element-level Parsing**: Parse individual document elements (text, table, formula)
 
 ### 📄 Page-level Parsing
@@ -99,7 +100,10 @@ Dolphin provides two inference frameworks with support for two parsing granularities:
 # Process a single document image
 python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs/page_1.jpeg --save_dir ./results
 
-# Process all document images in a directory
+# Process a single document pdf
+python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs/page_6.pdf --save_dir ./results
+
+# Process all documents in a directory
 python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs --save_dir ./results
 
 # Process with custom batch size for parallel element decoding
@@ -112,7 +116,10 @@ python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs
 # Process a single document image
 python demo_page_hf.py --model_path ./hf_model --input_path ./demo/page_imgs/page_1.jpeg --save_dir ./results
 
-# Process all document images in a directory
+# Process a single document pdf
+python demo_page_hf.py --model_path ./hf_model --input_path ./demo/page_imgs/page_6.pdf --save_dir ./results
+
+# Process all documents in a directory
 python demo_page_hf.py --model_path ./hf_model --input_path ./demo/page_imgs --save_dir ./results
 
 # Process with custom batch size for parallel element decoding
@@ -175,11 +182,11 @@ We would like to acknowledge the following open-source projects that provided in
 If you find this code useful for your research, please use the following BibTeX entry.
 
 ```bibtex
-@inproceedings{dolphin2025,
+@article{feng2025dolphin,
   title={Dolphin: Document Image Parsing via Heterogeneous Anchor Prompting},
-  author={Feng, Hao and Wei, Shu and Fei, Xiang and Shi, Wei and Han, Yingdong and Liao, Lei and Lu, Jinghui and Wu, Binghong and Liu, Qi and Lin, Chunhui and Tang, Jingqun and Liu, Hao and Huang, Can},
-  year={2025},
-  booktitle={Proceedings of the 65rd Annual Meeting of the Association for Computational Linguistics (ACL)}
+  author={Feng, Hao and Wei, Shu and Fei, Xiang and Shi, Wei and Han, Yingdong and Liao, Lei and Lu, Jinghui and Wu, Binghong and Liu, Qi and Lin, Chunhui and others},
+  journal={arXiv preprint arXiv:2505.14059},
+  year={2025}
 }
 ```
demo/page_imgs/page_6.pdf (new file, 19707 lines; file diff suppressed because it is too large)

demo/page_imgs/page_7.jpeg (new binary file, 1.2 MiB; binary file not shown)
demo_page.py (131 changed lines)
@@ -15,23 +15,83 @@ from chat import DOLPHIN
 from utils.utils import *
 
 
-def process_page(image_path, model, save_dir, max_batch_size):
-    """Parse document images with two stages"""
+def process_document(document_path, model, save_dir, max_batch_size):
+    """Parse documents - Handles both images and PDFs"""
+    file_ext = os.path.splitext(document_path)[1].lower()
+
+    if file_ext == '.pdf':
+        # Process PDF file
+        # Convert PDF to images
+        images = convert_pdf_to_images(document_path)
+        if not images:
+            raise Exception(f"Failed to convert PDF {document_path} to images")
+
+        all_results = []
+
+        # Process each page
+        for page_idx, pil_image in enumerate(images):
+            print(f"Processing page {page_idx + 1}/{len(images)}")
+
+            # Generate output name for this page
+            base_name = os.path.splitext(os.path.basename(document_path))[0]
+            page_name = f"{base_name}_page_{page_idx + 1:03d}"
+
+            # Process this page (don't save individual page results)
+            json_path, recognition_results = process_single_image(
+                pil_image, model, save_dir, page_name, max_batch_size, save_individual=False
+            )
+
+            # Add page information to results
+            page_results = {
+                "page_number": page_idx + 1,
+                "elements": recognition_results
+            }
+            all_results.append(page_results)
+
+        # Save combined results for multi-page PDF
+        combined_json_path = save_combined_pdf_results(all_results, document_path, save_dir)
+
+        return combined_json_path, all_results
+
+    else:
+        # Process regular image file
+        pil_image = Image.open(document_path).convert("RGB")
+        base_name = os.path.splitext(os.path.basename(document_path))[0]
+        return process_single_image(pil_image, model, save_dir, base_name, max_batch_size)
+
+
+def process_single_image(image, model, save_dir, image_name, max_batch_size, save_individual=True):
+    """Process a single image (either from file or converted from PDF page)
+
+    Args:
+        image: PIL Image object
+        model: DOLPHIN model instance
+        save_dir: Directory to save results
+        image_name: Name for the output file
+        max_batch_size: Maximum batch size for processing
+        save_individual: Whether to save individual results (False for PDF pages)
+
+    Returns:
+        Tuple of (json_path, recognition_results)
+    """
     # Stage 1: Page-level layout and reading order parsing
-    pil_image = Image.open(image_path).convert("RGB")
-    layout_output = model.chat("Parse the reading order of this document.", pil_image)
+    layout_output = model.chat("Parse the reading order of this document.", image)
 
     # Stage 2: Element-level content parsing
-    padded_image, dims = prepare_image(pil_image)
-    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)
+    padded_image, dims = prepare_image(image)
+    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name)
 
-    # Save outputs
-    json_path = save_outputs(recognition_results, image_path, save_dir)
+    # Save outputs only if requested (skip for PDF pages)
+    json_path = None
+    if save_individual:
+        # Create a dummy image path for save_outputs function
+        dummy_image_path = f"{image_name}.jpg"  # Extension doesn't matter, only basename is used
+        json_path = save_outputs(recognition_results, dummy_image_path, save_dir)
 
     return json_path, recognition_results
 
 
-def process_elements(layout_results, padded_image, dims, model, max_batch_size):
+def process_elements(layout_results, padded_image, dims, model, max_batch_size, save_dir=None, image_name=None):
     """Parse all document elements with parallel decoding"""
     layout_results = parse_layout_string(layout_results)
 
@@ -52,12 +112,18 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size):
                 cropped = padded_image[y1:y2, x1:x2]
                 if cropped.size > 0:
                     if label == "fig":
-                        # For figure regions, add empty text result immediately
                         pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
+
+                        # Changed: save the figure to a local file instead of base64
+                        figure_filename = save_figure_to_local(pil_crop, save_dir, image_name, reading_order)
+
+                        # For figure regions, store relative path instead of base64
                         figure_results.append(
                             {
                                 "label": label,
+                                "text": f"![Figure](figures/{figure_filename})",  # relative path
+                                "figure_path": f"figures/{figure_filename}",  # dedicated path field
                                 "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
-                                "text": "",
                                 "reading_order": reading_order,
                             }
                         )
@@ -109,9 +175,9 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
+    parser = argparse.ArgumentParser(description="Document parsing based on DOLPHIN")
     parser.add_argument("--config", default="./config/Dolphin.yaml", help="Path to configuration file")
-    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
+    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image/PDF or directory of files")
     parser.add_argument(
         "--save_dir",
         type=str,
@@ -130,31 +196,42 @@ def main():
     config = OmegaConf.load(args.config)
     model = DOLPHIN(config)
 
-    # Collect Document Images
+    # Collect Document Files (images and PDFs)
     if os.path.isdir(args.input_path):
-        image_files = []
-        for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
-            image_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
-        image_files = sorted(image_files)
+        # Support both image and PDF files
+        file_extensions = [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG", ".pdf", ".PDF"]
+
+        document_files = []
+        for ext in file_extensions:
+            document_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
+        document_files = sorted(document_files)
     else:
         if not os.path.exists(args.input_path):
             raise FileNotFoundError(f"Input path {args.input_path} does not exist")
-        image_files = [args.input_path]
+
+        # Check if it's a supported file type
+        file_ext = os.path.splitext(args.input_path)[1].lower()
+        supported_exts = ['.jpg', '.jpeg', '.png', '.pdf']
+
+        if file_ext not in supported_exts:
+            raise ValueError(f"Unsupported file type: {file_ext}. Supported types: {supported_exts}")
+
+        document_files = [args.input_path]
 
     save_dir = args.save_dir or (
         args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
     )
     setup_output_dirs(save_dir)
 
-    total_samples = len(image_files)
-    print(f"\nTotal samples to process: {total_samples}")
+    total_samples = len(document_files)
+    print(f"\nTotal files to process: {total_samples}")
 
-    # Process All Document Images
-    for image_path in image_files:
-        print(f"\nProcessing {image_path}")
+    # Process All Document Files
+    for file_path in document_files:
+        print(f"\nProcessing {file_path}")
         try:
-            json_path, recognition_results = process_page(
-                image_path=image_path,
+            json_path, recognition_results = process_document(
+                document_path=file_path,
                 model=model,
                 save_dir=save_dir,
                 max_batch_size=args.max_batch_size,
@@ -163,7 +240,7 @@ def main():
             print(f"Processing completed. Results saved to {save_dir}")
 
         except Exception as e:
-            print(f"Error processing {image_path}: {str(e)}")
+            print(f"Error processing {file_path}: {str(e)}")
             continue
 
 
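A quick way to read the new entry point outside of the diff: the sketch below drives `process_document` directly from Python. It is a hedged example, not part of the commit; it assumes the repo root is importable, that `./config/Dolphin.yaml` and the pretrained weights are in place, and the input/output paths are placeholders.

```python
# Hypothetical driver: assumes demo_page.py is importable from the repo root
# and that the config file and model weights exist at the placeholder paths.
from omegaconf import OmegaConf

from chat import DOLPHIN
from demo_page import process_document
from utils.utils import setup_output_dirs

config = OmegaConf.load("./config/Dolphin.yaml")
model = DOLPHIN(config)

save_dir = "./results"
setup_output_dirs(save_dir)  # creates markdown/, markdown/figures/ and recognition_json/

# A .pdf input is rendered page by page and merged into one combined JSON + Markdown;
# any other supported input goes through the original single-image path.
json_path, results = process_document(
    "./demo/page_imgs/page_6.pdf", model, save_dir, max_batch_size=16
)
print(json_path)     # e.g. ./results/recognition_json/page_6.json
print(len(results))  # number of pages for a PDF, number of elements for a single image
```

The same calling pattern applies to the Hugging Face variant below, except that the model is constructed from a model path (`DOLPHIN(args.model_path)` in `demo_page_hf.py`) instead of a config.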
demo_page_hf.py (131 changed lines)
@@ -104,23 +104,83 @@ class DOLPHIN:
         return results
 
 
-def process_page(image_path, model, save_dir, max_batch_size=None):
-    """Parse document images with two stages"""
+def process_document(document_path, model, save_dir, max_batch_size=None):
+    """Parse documents with two stages - Handles both images and PDFs"""
+    file_ext = os.path.splitext(document_path)[1].lower()
+
+    if file_ext == '.pdf':
+        # Process PDF file
+        # Convert PDF to images
+        images = convert_pdf_to_images(document_path)
+        if not images:
+            raise Exception(f"Failed to convert PDF {document_path} to images")
+
+        all_results = []
+
+        # Process each page
+        for page_idx, pil_image in enumerate(images):
+            print(f"Processing page {page_idx + 1}/{len(images)}")
+
+            # Generate output name for this page
+            base_name = os.path.splitext(os.path.basename(document_path))[0]
+            page_name = f"{base_name}_page_{page_idx + 1:03d}"
+
+            # Process this page (don't save individual page results)
+            json_path, recognition_results = process_single_image(
+                pil_image, model, save_dir, page_name, max_batch_size, save_individual=False
+            )
+
+            # Add page information to results
+            page_results = {
+                "page_number": page_idx + 1,
+                "elements": recognition_results
+            }
+            all_results.append(page_results)
+
+        # Save combined results for multi-page PDF
+        combined_json_path = save_combined_pdf_results(all_results, document_path, save_dir)
+
+        return combined_json_path, all_results
+
+    else:
+        # Process regular image file
+        pil_image = Image.open(document_path).convert("RGB")
+        base_name = os.path.splitext(os.path.basename(document_path))[0]
+        return process_single_image(pil_image, model, save_dir, base_name, max_batch_size)
+
+
+def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True):
+    """Process a single image (either from file or converted from PDF page)
+
+    Args:
+        image: PIL Image object
+        model: DOLPHIN model instance
+        save_dir: Directory to save results
+        image_name: Name for the output file
+        max_batch_size: Maximum batch size for processing
+        save_individual: Whether to save individual results (False for PDF pages)
+
+    Returns:
+        Tuple of (json_path, recognition_results)
+    """
     # Stage 1: Page-level layout and reading order parsing
-    pil_image = Image.open(image_path).convert("RGB")
-    layout_output = model.chat("Parse the reading order of this document.", pil_image)
+    layout_output = model.chat("Parse the reading order of this document.", image)
 
     # Stage 2: Element-level content parsing
-    padded_image, dims = prepare_image(pil_image)
-    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)
+    padded_image, dims = prepare_image(image)
+    recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name)
 
-    # Save outputs
-    json_path = save_outputs(recognition_results, image_path, save_dir)
+    # Save outputs only if requested (skip for PDF pages)
+    json_path = None
+    if save_individual:
+        # Create a dummy image path for save_outputs function
+        dummy_image_path = f"{image_name}.jpg"  # Extension doesn't matter, only basename is used
+        json_path = save_outputs(recognition_results, dummy_image_path, save_dir)
 
     return json_path, recognition_results
 
 
-def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
+def process_elements(layout_results, padded_image, dims, model, max_batch_size, save_dir=None, image_name=None):
     """Parse all document elements with parallel decoding"""
     layout_results = parse_layout_string(layout_results)
 
@@ -143,12 +203,18 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
                 cropped = padded_image[y1:y2, x1:x2]
                 if cropped.size > 0:
                     if label == "fig":
-                        # For figure regions, add empty text result immediately
                        pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
+
+                        # Changed: save the figure to a local file instead of base64
+                        figure_filename = save_figure_to_local(pil_crop, save_dir, image_name, reading_order)
+
+                        # For figure regions, store relative path instead of base64
                         figure_results.append(
                             {
                                 "label": label,
+                                "text": f"![Figure](figures/{figure_filename})",  # relative path
+                                "figure_path": f"figures/{figure_filename}",  # dedicated path field
                                 "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
-                                "text": "",
                                 "reading_order": reading_order,
                             }
                         )
@@ -227,9 +293,9 @@ def process_element_batch(elements, model, prompt, max_batch_size=None):
 
 
 def main():
-    parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
+    parser = argparse.ArgumentParser(description="Document parsing based on DOLPHIN")
     parser.add_argument("--model_path", default="./hf_model", help="Path to Hugging Face model")
-    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
+    parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image/PDF or directory of files")
     parser.add_argument(
         "--save_dir",
         type=str,
@@ -247,31 +313,42 @@ def main():
     # Load Model
     model = DOLPHIN(args.model_path)
 
-    # Collect Document Images
+    # Collect Document Files (images and PDFs)
    if os.path.isdir(args.input_path):
-        image_files = []
-        for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
-            image_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
-        image_files = sorted(image_files)
+        # Support both image and PDF files
+        file_extensions = [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG", ".pdf", ".PDF"]
+
+        document_files = []
+        for ext in file_extensions:
+            document_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
+        document_files = sorted(document_files)
     else:
         if not os.path.exists(args.input_path):
             raise FileNotFoundError(f"Input path {args.input_path} does not exist")
-        image_files = [args.input_path]
+
+        # Check if it's a supported file type
+        file_ext = os.path.splitext(args.input_path)[1].lower()
+        supported_exts = ['.jpg', '.jpeg', '.png', '.pdf']
+
+        if file_ext not in supported_exts:
+            raise ValueError(f"Unsupported file type: {file_ext}. Supported types: {supported_exts}")
+
+        document_files = [args.input_path]
 
     save_dir = args.save_dir or (
         args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
     )
     setup_output_dirs(save_dir)
 
-    total_samples = len(image_files)
-    print(f"\nTotal samples to process: {total_samples}")
+    total_samples = len(document_files)
+    print(f"\nTotal files to process: {total_samples}")
 
-    # Process All Document Images
-    for image_path in image_files:
-        print(f"\nProcessing {image_path}")
+    # Process All Document Files
+    for file_path in document_files:
+        print(f"\nProcessing {file_path}")
         try:
-            json_path, recognition_results = process_page(
-                image_path=image_path,
+            json_path, recognition_results = process_document(
+                document_path=file_path,
                 model=model,
                 save_dir=save_dir,
                 max_batch_size=args.max_batch_size,
@@ -280,7 +357,7 @@ def main():
             print(f"Processing completed. Results saved to {save_dir}")
 
         except Exception as e:
-            print(f"Error processing {image_path}: {str(e)}")
+            print(f"Error processing {file_path}: {str(e)}")
             continue
 
 
@@ -8,4 +8,5 @@ timm==0.5.4
 torch==2.1.0
 torchvision==0.16.0
 transformers==4.47.0
-accelerate==1.6.0
+accelerate==1.6.0
+pymupdf==1.26
@@ -223,21 +223,31 @@ class MarkdownConverter:
     def _handle_figure(self, text: str, section_count: int) -> str:
         """
-        Convert base64 encoded image to markdown image syntax
+        Handle figure content
         """
         try:
-            # Determine image format (assuming PNG if not specified)
-            img_format = "png"
+            # Check if it's a file path starting with "figures/"
+            if text.startswith("figures/"):
+                # Convert to relative path from markdown directory to figures directory
+                relative_path = f"../{text}"
+                return f"![Figure]({relative_path})\n\n"
+
+            # Check if it's already a markdown format image link
+            if text.startswith("!["):
+                # Already in markdown format, return directly
+                return f"{text}\n\n"
 
+            # If it's still base64 format, maintain original logic
             if text.startswith("data:image/"):
                 # Extract format from data URI
                 img_format = text.split(";")[0].split("/")[1]
+                return f"![Figure]({text})\n\n"
             elif ";" in text and "," in text:
                 # Already in data URI format
                 return f"![Figure]({text})\n\n"
             else:
-                # Raw base64, convert to data URI
+                # Assume it's raw base64, convert to data URI
+                img_format = "png"
                 data_uri = f"data:image/{img_format};base64,{text}"
                 return f"![Figure]({data_uri})\n\n"
 
         except Exception as e:
             print(f"_handle_figure error: {str(e)}")
             return f"*[Error processing figure: {str(e)}]*\n\n"
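The branching above is easier to follow outside the diff. Below is a simplified, standalone sketch of the same dispatch; it is illustrative only (the helper name and the "Figure" alt text are not from the repository, and the two data-URI branches are collapsed into one), and the markdown strings mirror the reconstruction above rather than a verified listing.

```python
# Standalone illustration of the _handle_figure dispatch; not the repository code.
def figure_to_markdown(text: str) -> str:
    if text.startswith("figures/"):
        # Saved figure file: prefix the stored path with ../,
        # mirroring the relative_path construction above.
        return f"![Figure](../{text})\n\n"
    if text.startswith("!["):
        # Already a markdown image link: pass it through unchanged.
        return f"{text}\n\n"
    if text.startswith("data:image/") or ("," in text and ";" in text):
        # A complete data URI: embed it directly.
        return f"![Figure]({text})\n\n"
    # Otherwise treat the string as raw base64 and wrap it in a PNG data URI.
    return f"![Figure](data:image/png;base64,{text})\n\n"

print(figure_to_markdown("figures/page_6_page_001_figure_002.png"), end="")
```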
utils/utils.py (151 changed lines)
@@ -6,6 +6,7 @@ SPDX-License-Identifier: MIT
 import copy
 import json
 import os
+import io
 import re
 from dataclasses import dataclass
 from typing import List, Tuple
@@ -14,6 +15,7 @@ import albumentations as alb
 import cv2
 import numpy as np
 from albumentations.pytorch import ToTensorV2
+import pymupdf
 from PIL import Image
 from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from torchvision.transforms.functional import resize
@@ -21,6 +23,152 @@ from torchvision.transforms.functional import resize
 from utils.markdown_utils import MarkdownConverter
 
 
+def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
+    """Save cropped figure to local file system
+
+    Args:
+        pil_crop: PIL Image object of the cropped figure
+        save_dir: Base directory to save results
+        image_name: Name of the source image/document
+        reading_order: Reading order of the figure in the document
+
+    Returns:
+        str: Filename of the saved figure
+    """
+    try:
+        # Create figures directory if it doesn't exist
+        figures_dir = os.path.join(save_dir, "markdown", "figures")
+        # os.makedirs(figures_dir, exist_ok=True)
+
+        # Generate figure filename
+        figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
+        figure_path = os.path.join(figures_dir, figure_filename)
+
+        # Save the figure
+        pil_crop.save(figure_path, format="PNG", quality=95)
+
+        # print(f"Saved figure: {figure_filename}")
+        return figure_filename
+
+    except Exception as e:
+        print(f"Error saving figure: {str(e)}")
+        # Return a fallback filename
+        return f"{image_name}_figure_{reading_order:03d}_error.png"
+
+
+def convert_pdf_to_images(pdf_path, target_size=896):
+    """Convert PDF pages to images
+
+    Args:
+        pdf_path: Path to PDF file
+        target_size: Target size for the longest dimension
+
+    Returns:
+        List of PIL Images
+    """
+    images = []
+    try:
+        doc = pymupdf.open(pdf_path)
+
+        for page_num in range(len(doc)):
+            page = doc[page_num]
+
+            # Calculate scale to make longest dimension equal to target_size
+            rect = page.rect
+            scale = target_size / max(rect.width, rect.height)
+
+            # Render page as image
+            mat = pymupdf.Matrix(scale, scale)
+            pix = page.get_pixmap(matrix=mat)
+
+            # Convert to PIL Image
+            img_data = pix.tobytes("png")
+            pil_image = Image.open(io.BytesIO(img_data))
+            images.append(pil_image)
+
+        doc.close()
+        print(f"Successfully converted {len(images)} pages from PDF")
+        return images
+
+    except Exception as e:
+        print(f"Error converting PDF to images: {str(e)}")
+        return []
+
+
+def is_pdf_file(file_path):
+    """Check if file is a PDF"""
+    return file_path.lower().endswith('.pdf')
+
+
+def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
+    """Save combined results for multi-page PDF with both JSON and Markdown
+
+    Args:
+        all_page_results: List of results for all pages
+        pdf_path: Path to original PDF file
+        save_dir: Directory to save results
+
+    Returns:
+        Path to saved combined JSON file
+    """
+    # Create output filename based on PDF name
+    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
+
+    # Prepare combined results
+    combined_results = {
+        "source_file": pdf_path,
+        "total_pages": len(all_page_results),
+        "pages": all_page_results
+    }
+
+    # Save combined JSON results
+    json_filename = f"{base_name}.json"
+    json_path = os.path.join(save_dir, "recognition_json", json_filename)
+    os.makedirs(os.path.dirname(json_path), exist_ok=True)
+
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(combined_results, f, indent=2, ensure_ascii=False)
+
+    # Generate and save combined markdown
+    try:
+        markdown_converter = MarkdownConverter()
+
+        # Combine all page results into a single list for markdown conversion
+        all_elements = []
+        for page_data in all_page_results:
+            page_elements = page_data.get("elements", [])
+            if page_elements:
+                # Add page separator if not the first page
+                if all_elements:
+                    all_elements.append({
+                        "label": "page_separator",
+                        "text": f"\n\n---\n\n",
+                        "reading_order": len(all_elements)
+                    })
+                all_elements.extend(page_elements)
+
+        # Generate markdown content
+        markdown_content = markdown_converter.convert(all_elements)
+
+        # Save markdown file
+        markdown_filename = f"{base_name}.md"
+        markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
+        os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
+
+        with open(markdown_path, 'w', encoding='utf-8') as f:
+            f.write(markdown_content)
+
+        # print(f"Combined markdown saved to: {markdown_path}")
+
+    except ImportError:
+        print("MarkdownConverter not available, skipping markdown generation")
+    except Exception as e:
+        print(f"Error generating markdown: {e}")
+
+    # print(f"Combined JSON results saved to: {json_path}")
+    return json_path
+
+
 def alb_wrapper(transform):
     def f(im):
         return transform(image=np.asarray(im))["image"]
@@ -302,13 +450,12 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
         return np.zeros((h, w, 3), dtype=np.uint8), dimensions
 
 
-
-
 def setup_output_dirs(save_dir):
     """Create necessary output directories"""
     os.makedirs(save_dir, exist_ok=True)
     os.makedirs(os.path.join(save_dir, "markdown"), exist_ok=True)
     os.makedirs(os.path.join(save_dir, "recognition_json"), exist_ok=True)
+    os.makedirs(os.path.join(save_dir, "markdown", "figures"), exist_ok=True)
 
 
 def save_outputs(recognition_results, image_path, save_dir):
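For reference, the combined output written by `save_combined_pdf_results` has roughly the following shape. This is an illustrative example only: the field names come from the functions above, but the element values are made up, and the figure element's `text` value mirrors the reconstructed entry in `demo_page.py` rather than a verified listing.

```python
# Illustrative structure of <save_dir>/recognition_json/<pdf_name>.json for a 2-page PDF.
# Field names follow save_combined_pdf_results and process_document; values are made up.
combined_results = {
    "source_file": "./demo/page_imgs/page_6.pdf",
    "total_pages": 2,
    "pages": [
        {
            "page_number": 1,
            "elements": [
                {
                    "label": "fig",
                    "text": "![Figure](figures/page_6_page_001_figure_000.png)",
                    "figure_path": "figures/page_6_page_001_figure_000.png",
                    "bbox": [64, 120, 820, 480],
                    "reading_order": 0,
                },
                # ... text / table / formula elements as produced by process_elements ...
            ],
        },
        {"page_number": 2, "elements": []},
    ],
}
```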