Add PDF parsing

fenghao.2019 2025-06-13 16:45:28 +08:00
parent 49f51871c6
commit 10b017a62b
8 changed files with 20098 additions and 72 deletions

README.md

@ -9,7 +9,7 @@
<a href="https://huggingface.co/ByteDance/Dolphin">
<img src="https://img.shields.io/badge/HuggingFace-Dolphin-yellow">
</a>
<a href="http://115.190.42.15:8888/dolphin/">
<a href="https://huggingface.co/spaces/ByteDance/Dolphin">
<img src="https://img.shields.io/badge/Demo-Dolphin-blue">
</a>
<a href="https://github.com/bytedance/Dolphin">
@ -50,6 +50,7 @@ Try our demo on [Demo-Dolphin](http://115.190.42.15:8888/dolphin/).
## 📅 Changelog
- 🔥 **2025.06.13** Added multi-page PDF document parsing capability.
- 🔥 **2025.05.21** Our demo is released at [link](http://115.190.42.15:8888/dolphin/). Check it out!
- 🔥 **2025.05.20** The pretrained model and inference code of Dolphin are released.
- 🔥 **2025.05.16** Our paper has been accepted by ACL 2025. Paper link: [arXiv](https://arxiv.org/abs/2505.14059).
@ -88,7 +89,7 @@ Try our demo on [Demo-Dolphin](http://115.190.42.15:8888/dolphin/).
## ⚡ Inference
Dolphin provides two inference frameworks with support for two parsing granularities:
- **Page-level Parsing**: Parse the entire document image into a structured JSON and Markdown format
- **Page-level Parsing**: Parse the entire document page into a structured JSON and Markdown format
- **Element-level Parsing**: Parse individual document elements (text, table, formula)
### 📄 Page-level Parsing
@ -99,7 +100,10 @@ Dolphin provides two inference frameworks with support for two parsing granulari
# Process a single document image
python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs/page_1.jpeg --save_dir ./results
# Process all document images in a directory
# Process a single PDF document
python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs/page_6.pdf --save_dir ./results
# Process all documents in a directory
python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs --save_dir ./results
# Process with custom batch size for parallel element decoding
@ -112,7 +116,10 @@ python demo_page.py --config ./config/Dolphin.yaml --input_path ./demo/page_imgs
# Process a single document image
python demo_page_hf.py --model_path ./hf_model --input_path ./demo/page_imgs/page_1.jpeg --save_dir ./results
# Process all document images in a directory
# Process a single PDF document
python demo_page_hf.py --model_path ./hf_model --input_path ./demo/page_imgs/page_6.pdf --save_dir ./results
# Process all documents in a directory
python demo_page_hf.py --model_path ./hf_model --input_path ./demo/page_imgs --save_dir ./results
# Process with custom batch size for parallel element decoding
@ -175,11 +182,11 @@ We would like to acknowledge the following open-source projects that provided in
If you find this code useful for your research, please use the following BibTeX entry.
```bibtex
@inproceedings{dolphin2025,
@article{feng2025dolphin,
title={Dolphin: Document Image Parsing via Heterogeneous Anchor Prompting},
author={Feng, Hao and Wei, Shu and Fei, Xiang and Shi, Wei and Han, Yingdong and Liao, Lei and Lu, Jinghui and Wu, Binghong and Liu, Qi and Lin, Chunhui and Tang, Jingqun and Liu, Hao and Huang, Can},
year={2025},
booktitle={Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (ACL)}
author={Feng, Hao and Wei, Shu and Fei, Xiang and Shi, Wei and Han, Yingdong and Liao, Lei and Lu, Jinghui and Wu, Binghong and Liu, Qi and Lin, Chunhui and others},
journal={arXiv preprint arXiv:2505.14059},
year={2025}
}
```

demo/page_imgs/page_6.pdf (new file, 19707 lines): diff suppressed because it is too large.

demo/page_imgs/page_7.jpeg (new binary file, 1.2 MiB): binary file not shown.

demo_page.py

@ -15,23 +15,83 @@ from chat import DOLPHIN
from utils.utils import *
def process_page(image_path, model, save_dir, max_batch_size):
"""Parse document images with two stages"""
def process_document(document_path, model, save_dir, max_batch_size):
"""Parse documents - Handles both images and PDFs"""
file_ext = os.path.splitext(document_path)[1].lower()
if file_ext == '.pdf':
# Process PDF file
# Convert PDF to images
images = convert_pdf_to_images(document_path)
if not images:
raise Exception(f"Failed to convert PDF {document_path} to images")
all_results = []
# Process each page
for page_idx, pil_image in enumerate(images):
print(f"Processing page {page_idx + 1}/{len(images)}")
# Generate output name for this page
base_name = os.path.splitext(os.path.basename(document_path))[0]
page_name = f"{base_name}_page_{page_idx + 1:03d}"
# Process this page (don't save individual page results)
json_path, recognition_results = process_single_image(
pil_image, model, save_dir, page_name, max_batch_size, save_individual=False
)
# Add page information to results
page_results = {
"page_number": page_idx + 1,
"elements": recognition_results
}
all_results.append(page_results)
# Save combined results for multi-page PDF
combined_json_path = save_combined_pdf_results(all_results, document_path, save_dir)
return combined_json_path, all_results
else:
# Process regular image file
pil_image = Image.open(document_path).convert("RGB")
base_name = os.path.splitext(os.path.basename(document_path))[0]
return process_single_image(pil_image, model, save_dir, base_name, max_batch_size)
def process_single_image(image, model, save_dir, image_name, max_batch_size, save_individual=True):
"""Process a single image (either from file or converted from PDF page)
Args:
image: PIL Image object
model: DOLPHIN model instance
save_dir: Directory to save results
image_name: Name for the output file
max_batch_size: Maximum batch size for processing
save_individual: Whether to save individual results (False for PDF pages)
Returns:
Tuple of (json_path, recognition_results)
"""
# Stage 1: Page-level layout and reading order parsing
pil_image = Image.open(image_path).convert("RGB")
layout_output = model.chat("Parse the reading order of this document.", pil_image)
layout_output = model.chat("Parse the reading order of this document.", image)
# Stage 2: Element-level content parsing
padded_image, dims = prepare_image(pil_image)
recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)
padded_image, dims = prepare_image(image)
recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name)
# Save outputs
json_path = save_outputs(recognition_results, image_path, save_dir)
# Save outputs only if requested (skip for PDF pages)
json_path = None
if save_individual:
# Create a dummy image path for save_outputs function
dummy_image_path = f"{image_name}.jpg" # Extension doesn't matter, only basename is used
json_path = save_outputs(recognition_results, dummy_image_path, save_dir)
return json_path, recognition_results
def process_elements(layout_results, padded_image, dims, model, max_batch_size):
def process_elements(layout_results, padded_image, dims, model, max_batch_size, save_dir=None, image_name=None):
"""Parse all document elements with parallel decoding"""
layout_results = parse_layout_string(layout_results)
@ -52,12 +112,18 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size):
cropped = padded_image[y1:y2, x1:x2]
if cropped.size > 0:
if label == "fig":
# For figure regions, add empty text result immediately
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
# Save the figure to a local file instead of embedding it as base64
figure_filename = save_figure_to_local(pil_crop, save_dir, image_name, reading_order)
# For figure regions, store relative path instead of base64
figure_results.append(
{
"label": label,
"text": f"![Figure](figures/{figure_filename})", # 相对路径
"figure_path": f"figures/{figure_filename}", # 添加专门的路径字段
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
"text": "",
"reading_order": reading_order,
}
)
@ -109,9 +175,9 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size):
def main():
parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
parser = argparse.ArgumentParser(description="Document parsing based on DOLPHIN")
parser.add_argument("--config", default="./config/Dolphin.yaml", help="Path to configuration file")
parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image/PDF or directory of files")
parser.add_argument(
"--save_dir",
type=str,
@ -130,31 +196,42 @@ def main():
config = OmegaConf.load(args.config)
model = DOLPHIN(config)
# Collect Document Images
# Collect Document Files (images and PDFs)
if os.path.isdir(args.input_path):
image_files = []
for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
image_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
image_files = sorted(image_files)
# Support both image and PDF files
file_extensions = [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG", ".pdf", ".PDF"]
document_files = []
for ext in file_extensions:
document_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
document_files = sorted(document_files)
else:
if not os.path.exists(args.input_path):
raise FileNotFoundError(f"Input path {args.input_path} does not exist")
image_files = [args.input_path]
# Check if it's a supported file type
file_ext = os.path.splitext(args.input_path)[1].lower()
supported_exts = ['.jpg', '.jpeg', '.png', '.pdf']
if file_ext not in supported_exts:
raise ValueError(f"Unsupported file type: {file_ext}. Supported types: {supported_exts}")
document_files = [args.input_path]
save_dir = args.save_dir or (
args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
)
setup_output_dirs(save_dir)
total_samples = len(image_files)
print(f"\nTotal samples to process: {total_samples}")
total_samples = len(document_files)
print(f"\nTotal files to process: {total_samples}")
# Process All Document Images
for image_path in image_files:
print(f"\nProcessing {image_path}")
# Process All Document Files
for file_path in document_files:
print(f"\nProcessing {file_path}")
try:
json_path, recognition_results = process_page(
image_path=image_path,
json_path, recognition_results = process_document(
document_path=file_path,
model=model,
save_dir=save_dir,
max_batch_size=args.max_batch_size,
@ -163,7 +240,7 @@ def main():
print(f"Processing completed. Results saved to {save_dir}")
except Exception as e:
print(f"Error processing {image_path}: {str(e)}")
print(f"Error processing {file_path}: {str(e)}")
continue

demo_page_hf.py

@ -104,23 +104,83 @@ class DOLPHIN:
return results
def process_page(image_path, model, save_dir, max_batch_size=None):
"""Parse document images with two stages"""
def process_document(document_path, model, save_dir, max_batch_size=None):
"""Parse documents with two stages - Handles both images and PDFs"""
file_ext = os.path.splitext(document_path)[1].lower()
if file_ext == '.pdf':
# Process PDF file
# Convert PDF to images
images = convert_pdf_to_images(document_path)
if not images:
raise Exception(f"Failed to convert PDF {document_path} to images")
all_results = []
# Process each page
for page_idx, pil_image in enumerate(images):
print(f"Processing page {page_idx + 1}/{len(images)}")
# Generate output name for this page
base_name = os.path.splitext(os.path.basename(document_path))[0]
page_name = f"{base_name}_page_{page_idx + 1:03d}"
# Process this page (don't save individual page results)
json_path, recognition_results = process_single_image(
pil_image, model, save_dir, page_name, max_batch_size, save_individual=False
)
# Add page information to results
page_results = {
"page_number": page_idx + 1,
"elements": recognition_results
}
all_results.append(page_results)
# Save combined results for multi-page PDF
combined_json_path = save_combined_pdf_results(all_results, document_path, save_dir)
return combined_json_path, all_results
else:
# Process regular image file
pil_image = Image.open(document_path).convert("RGB")
base_name = os.path.splitext(os.path.basename(document_path))[0]
return process_single_image(pil_image, model, save_dir, base_name, max_batch_size)
def process_single_image(image, model, save_dir, image_name, max_batch_size=None, save_individual=True):
"""Process a single image (either from file or converted from PDF page)
Args:
image: PIL Image object
model: DOLPHIN model instance
save_dir: Directory to save results
image_name: Name for the output file
max_batch_size: Maximum batch size for processing
save_individual: Whether to save individual results (False for PDF pages)
Returns:
Tuple of (json_path, recognition_results)
"""
# Stage 1: Page-level layout and reading order parsing
pil_image = Image.open(image_path).convert("RGB")
layout_output = model.chat("Parse the reading order of this document.", pil_image)
layout_output = model.chat("Parse the reading order of this document.", image)
# Stage 2: Element-level content parsing
padded_image, dims = prepare_image(pil_image)
recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size)
padded_image, dims = prepare_image(image)
recognition_results = process_elements(layout_output, padded_image, dims, model, max_batch_size, save_dir, image_name)
# Save outputs
json_path = save_outputs(recognition_results, image_path, save_dir)
# Save outputs only if requested (skip for PDF pages)
json_path = None
if save_individual:
# Create a dummy image path for save_outputs function
dummy_image_path = f"{image_name}.jpg" # Extension doesn't matter, only basename is used
json_path = save_outputs(recognition_results, dummy_image_path, save_dir)
return json_path, recognition_results
def process_elements(layout_results, padded_image, dims, model, max_batch_size=None):
def process_elements(layout_results, padded_image, dims, model, max_batch_size, save_dir=None, image_name=None):
"""Parse all document elements with parallel decoding"""
layout_results = parse_layout_string(layout_results)
@ -143,12 +203,18 @@ def process_elements(layout_results, padded_image, dims, model, max_batch_size=N
cropped = padded_image[y1:y2, x1:x2]
if cropped.size > 0:
if label == "fig":
# For figure regions, add empty text result immediately
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
# Save the figure to a local file instead of embedding it as base64
figure_filename = save_figure_to_local(pil_crop, save_dir, image_name, reading_order)
# For figure regions, store relative path instead of base64
figure_results.append(
{
"label": label,
"text": f"![Figure](figures/{figure_filename})", # 相对路径
"figure_path": f"figures/{figure_filename}", # 添加专门的路径字段
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
"text": "",
"reading_order": reading_order,
}
)
@ -227,9 +293,9 @@ def process_element_batch(elements, model, prompt, max_batch_size=None):
def main():
parser = argparse.ArgumentParser(description="Document processing tool using DOLPHIN model")
parser = argparse.ArgumentParser(description="Document parsing based on DOLPHIN")
parser.add_argument("--model_path", default="./hf_model", help="Path to Hugging Face model")
parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image or directory of images")
parser.add_argument("--input_path", type=str, default="./demo", help="Path to input image/PDF or directory of files")
parser.add_argument(
"--save_dir",
type=str,
@ -247,31 +313,42 @@ def main():
# Load Model
model = DOLPHIN(args.model_path)
# Collect Document Images
# Collect Document Files (images and PDFs)
if os.path.isdir(args.input_path):
image_files = []
for ext in [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"]:
image_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
image_files = sorted(image_files)
# Support both image and PDF files
file_extensions = [".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG", ".pdf", ".PDF"]
document_files = []
for ext in file_extensions:
document_files.extend(glob.glob(os.path.join(args.input_path, f"*{ext}")))
document_files = sorted(document_files)
else:
if not os.path.exists(args.input_path):
raise FileNotFoundError(f"Input path {args.input_path} does not exist")
image_files = [args.input_path]
# Check if it's a supported file type
file_ext = os.path.splitext(args.input_path)[1].lower()
supported_exts = ['.jpg', '.jpeg', '.png', '.pdf']
if file_ext not in supported_exts:
raise ValueError(f"Unsupported file type: {file_ext}. Supported types: {supported_exts}")
document_files = [args.input_path]
save_dir = args.save_dir or (
args.input_path if os.path.isdir(args.input_path) else os.path.dirname(args.input_path)
)
setup_output_dirs(save_dir)
total_samples = len(image_files)
print(f"\nTotal samples to process: {total_samples}")
total_samples = len(document_files)
print(f"\nTotal files to process: {total_samples}")
# Process All Document Images
for image_path in image_files:
print(f"\nProcessing {image_path}")
# Process All Document Files
for file_path in document_files:
print(f"\nProcessing {file_path}")
try:
json_path, recognition_results = process_page(
image_path=image_path,
json_path, recognition_results = process_document(
document_path=file_path,
model=model,
save_dir=save_dir,
max_batch_size=args.max_batch_size,
@ -280,7 +357,7 @@ def main():
print(f"Processing completed. Results saved to {save_dir}")
except Exception as e:
print(f"Error processing {image_path}: {str(e)}")
print(f"Error processing {file_path}: {str(e)}")
continue

requirements.txt

@ -8,4 +8,5 @@ timm==0.5.4
torch==2.1.0
torchvision==0.16.0
transformers==4.47.0
accelerate==1.6.0
accelerate==1.6.0
pymupdf==1.26
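The new `pymupdf` pin is what backs the PDF rasterization added in `utils/utils.py`. As a quick sanity check, a minimal sketch (assuming the demo PDF shipped with this commit is available at the path below) that the pinned package imports under the `pymupdf` name and can render a page:

```python
# Dependency sanity check (a sketch; the path points at the demo asset added in this commit).
import pymupdf

doc = pymupdf.open("./demo/page_imgs/page_6.pdf")
pix = doc[0].get_pixmap(matrix=pymupdf.Matrix(2, 2))  # render the first page at 2x scale
print(f"{doc.page_count} page(s); first page rendered at {pix.width}x{pix.height}px")
doc.close()
```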

utils/markdown_utils.py

@ -223,21 +223,31 @@ class MarkdownConverter:
def _handle_figure(self, text: str, section_count: int) -> str:
"""
Convert base64 encoded image to markdown image syntax
Handle figure content: a relative file path, a markdown image link, or base64 data
"""
try:
# Determine image format (assuming PNG if not specified)
img_format = "png"
# Check if it's a file path starting with "figures/"
if text.startswith("figures/"):
# Convert to relative path from markdown directory to figures directory
relative_path = f"../{text}"
return f"![Figure {section_count}]({relative_path})\n\n"
# Check if it's already a markdown format image link
if text.startswith("!["):
# Already in markdown format, return directly
return f"{text}\n\n"
# If it's still base64 format, maintain original logic
if text.startswith("data:image/"):
# Extract format from data URI
img_format = text.split(";")[0].split("/")[1]
return f"![Figure {section_count}]({text})\n\n"
elif ";" in text and "," in text:
# Already in data URI format
return f"![Figure {section_count}]({text})\n\n"
else:
# Raw base64, convert to data URI
# Assume it's raw base64, convert to data URI
img_format = "png"
data_uri = f"data:image/{img_format};base64,{text}"
return f"![Figure {section_count}]({data_uri})\n\n"
except Exception as e:
print(f"_handle_figure error: {str(e)}")
return f"*[Error processing figure: {str(e)}]*\n\n"

utils/utils.py

@ -6,6 +6,7 @@ SPDX-License-Identifier: MIT
import copy
import json
import os
import io
import re
from dataclasses import dataclass
from typing import List, Tuple
@ -14,6 +15,7 @@ import albumentations as alb
import cv2
import numpy as np
from albumentations.pytorch import ToTensorV2
import pymupdf
from PIL import Image
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from torchvision.transforms.functional import resize
@ -21,6 +23,152 @@ from torchvision.transforms.functional import resize
from utils.markdown_utils import MarkdownConverter
def save_figure_to_local(pil_crop, save_dir, image_name, reading_order):
"""Save cropped figure to local file system
Args:
pil_crop: PIL Image object of the cropped figure
save_dir: Base directory to save results
image_name: Name of the source image/document
reading_order: Reading order of the figure in the document
Returns:
str: Filename of the saved figure
"""
try:
# Create figures directory if it doesn't exist
figures_dir = os.path.join(save_dir, "markdown", "figures")
# os.makedirs(figures_dir, exist_ok=True)
# Generate figure filename
figure_filename = f"{image_name}_figure_{reading_order:03d}.png"
figure_path = os.path.join(figures_dir, figure_filename)
# Save the figure
pil_crop.save(figure_path, format="PNG", quality=95)
# print(f"Saved figure: {figure_filename}")
return figure_filename
except Exception as e:
print(f"Error saving figure: {str(e)}")
# Return a fallback filename
return f"{image_name}_figure_{reading_order:03d}_error.png"
def convert_pdf_to_images(pdf_path, target_size=896):
"""Convert PDF pages to images
Args:
pdf_path: Path to PDF file
target_size: Target size for the longest dimension
Returns:
List of PIL Images
"""
images = []
try:
doc = pymupdf.open(pdf_path)
for page_num in range(len(doc)):
page = doc[page_num]
# Calculate scale to make longest dimension equal to target_size
rect = page.rect
scale = target_size / max(rect.width, rect.height)
# Render page as image
mat = pymupdf.Matrix(scale, scale)
pix = page.get_pixmap(matrix=mat)
# Convert to PIL Image
img_data = pix.tobytes("png")
pil_image = Image.open(io.BytesIO(img_data))
images.append(pil_image)
doc.close()
print(f"Successfully converted {len(images)} pages from PDF")
return images
except Exception as e:
print(f"Error converting PDF to images: {str(e)}")
return []
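A usage sketch against the demo PDF added in this commit (the preview directory is a placeholder):

```python
import os

from utils.utils import convert_pdf_to_images

os.makedirs("./results", exist_ok=True)
pages = convert_pdf_to_images("./demo/page_imgs/page_6.pdf", target_size=896)
for idx, page in enumerate(pages, start=1):
    # Each entry is a PIL Image whose longest side has been scaled to 896 px.
    page.save(f"./results/page_6_preview_{idx:03d}.png")
```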
def is_pdf_file(file_path):
"""Check if file is a PDF"""
return file_path.lower().endswith('.pdf')
def save_combined_pdf_results(all_page_results, pdf_path, save_dir):
"""Save combined results for multi-page PDF with both JSON and Markdown
Args:
all_page_results: List of results for all pages
pdf_path: Path to original PDF file
save_dir: Directory to save results
Returns:
Path to saved combined JSON file
"""
# Create output filename based on PDF name
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
# Prepare combined results
combined_results = {
"source_file": pdf_path,
"total_pages": len(all_page_results),
"pages": all_page_results
}
# Save combined JSON results
json_filename = f"{base_name}.json"
json_path = os.path.join(save_dir, "recognition_json", json_filename)
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, 'w', encoding='utf-8') as f:
json.dump(combined_results, f, indent=2, ensure_ascii=False)
# Generate and save combined markdown
try:
markdown_converter = MarkdownConverter()
# Combine all page results into a single list for markdown conversion
all_elements = []
for page_data in all_page_results:
page_elements = page_data.get("elements", [])
if page_elements:
# Add page separator if not the first page
if all_elements:
all_elements.append({
"label": "page_separator",
"text": f"\n\n---\n\n",
"reading_order": len(all_elements)
})
all_elements.extend(page_elements)
# Generate markdown content
markdown_content = markdown_converter.convert(all_elements)
# Save markdown file
markdown_filename = f"{base_name}.md"
markdown_path = os.path.join(save_dir, "markdown", markdown_filename)
os.makedirs(os.path.dirname(markdown_path), exist_ok=True)
with open(markdown_path, 'w', encoding='utf-8') as f:
f.write(markdown_content)
# print(f"Combined markdown saved to: {markdown_path}")
except ImportError:
print("MarkdownConverter not available, skipping markdown generation")
except Exception as e:
print(f"Error generating markdown: {e}")
# print(f"Combined JSON results saved to: {json_path}")
return json_path
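The combined JSON written above has the following shape; a sketch with placeholder values, where each entry in `elements` carries the fields produced by `process_elements` (the `fig` record mirrors the figure entries added in this commit):

```python
combined_results = {
    "source_file": "./demo/page_imgs/page_6.pdf",
    "total_pages": 2,
    "pages": [
        {
            "page_number": 1,
            "elements": [
                {
                    "label": "fig",
                    "text": "![Figure](figures/page_6_page_001_figure_000.png)",
                    "figure_path": "figures/page_6_page_001_figure_000.png",
                    "bbox": [96, 120, 512, 360],
                    "reading_order": 0,
                },
                # ... further elements as returned by process_elements ...
            ],
        },
        {"page_number": 2, "elements": []},
    ],
}
```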
def alb_wrapper(transform):
def f(im):
return transform(image=np.asarray(im))["image"]
@ -302,13 +450,12 @@ def prepare_image(image) -> Tuple[np.ndarray, ImageDimensions]:
return np.zeros((h, w, 3), dtype=np.uint8), dimensions
def setup_output_dirs(save_dir):
"""Create necessary output directories"""
os.makedirs(save_dir, exist_ok=True)
os.makedirs(os.path.join(save_dir, "markdown"), exist_ok=True)
os.makedirs(os.path.join(save_dir, "recognition_json"), exist_ok=True)
os.makedirs(os.path.join(save_dir, "markdown", "figures"), exist_ok=True)
def save_outputs(recognition_results, image_path, save_dir):