Dolphin/deployment/tensorrt_llm/convert_dolphin.sh

#!/usr/bin/env bash
# Strict mode: stop at the first failing command (-e), on any reference to an
# unset variable (-u) and on a failure in any pipeline stage (pipefail).
# -x traces every command so each build step shows up in the log.
set -euxo pipefail

############################################################################################
# Reference: https://github.com/NVIDIA/TensorRT-LLM/tree/v0.18.2/examples/multimodal#nougat
############################################################################################

# Prepend the TensorRT and cuDNN library directories from the Python 3.10
# site-packages tree to the dynamic loader search path.
# The inherited value is appended only when it is set and non-empty: an
# unconditional ":$LD_LIBRARY_PATH" leaves a trailing ':' (an empty entry) when
# the variable is unset, and the loader resolves an empty entry to the current
# working directory.
site_packages=/usr/local/lib/python3.10/site-packages
export LD_LIBRARY_PATH="${site_packages}/tensorrt_libs/:${site_packages}/nvidia/cudnn/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 1. Download Huggingface weights
export MODEL_NAME="Dolphin"
hf_model_dir="tmp/hf_models/${MODEL_NAME}"

# NOTE(review): Hugging Face model repos normally store weight files in Git LFS;
# without git-lfs the clone would presumably contain only pointer files.
# This is a warning only, so the original behaviour is otherwise unchanged.
if ! git lfs version >/dev/null 2>&1; then
  printf 'warning: git-lfs not found; cloned weights may be LFS pointer files only\n' >&2
fi

# `git clone` refuses to write into an existing non-empty directory, which
# would abort every re-run of this script. Reuse a previous download instead;
# delete the directory to force a fresh clone.
if [[ -d "${hf_model_dir}" && -n "$(ls -A -- "${hf_model_dir}")" ]]; then
  printf 'Reusing existing weights in %s\n' "${hf_model_dir}" >&2
else
  git clone "https://huggingface.co/Bytedance/${MODEL_NAME}" "${hf_model_dir}"
fi


# Size limits shared by the engine-build steps in this script.
#   MAX_BATCH_SIZE        - passed as --max_batch_size to both engine builds
#   MAX_SEQ_LEN           - passed as --max_seq_len to trtllm-build
#   MAX_INPUT_LEN         - passed as --max_input_len to trtllm-build
#   MAX_ENCODER_INPUT_LEN - number of visual features; multiplied by
#                           MAX_BATCH_SIZE to form --max_encoder_input_len
MAX_BATCH_SIZE=16
MAX_SEQ_LEN=4096
MAX_INPUT_LEN=10
MAX_ENCODER_INPUT_LEN=784
export MAX_BATCH_SIZE MAX_SEQ_LEN MAX_INPUT_LEN MAX_ENCODER_INPUT_LEN

# 2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in examples/enc_dec
# Reads the cloned weights from tmp/hf_models/ and writes the bfloat16
# checkpoint under tmp/trt_models/; the build step that follows consumes its
# decoder/ subdirectory. Single-GPU layout (tp_size 1, pp_size 1).
# Path arguments are quoted so that a MODEL_NAME containing whitespace or glob
# characters still expands to exactly one argument.
python ./convert/convert_checkpoint.py --model_type bart \
    --model_dir "tmp/hf_models/${MODEL_NAME}" \
    --output_dir "tmp/trt_models/${MODEL_NAME}/bfloat16" \
    --tp_size 1 \
    --pp_size 1 \
    --dtype bfloat16 \
    --nougat


# Build the decoder TensorRT engine from the converted bfloat16 checkpoint.
# --max_encoder_input_len is MAX_BATCH_SIZE (max_batch_size) *
# MAX_ENCODER_INPUT_LEN (num_visual_features).
# Every expansion is quoted so each option receives exactly one argument.
trtllm-build --checkpoint_dir "tmp/trt_models/${MODEL_NAME}/bfloat16/decoder" \
    --output_dir "tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16/decoder" \
    --paged_kv_cache disable \
    --moe_plugin disable \
    --gemm_plugin bfloat16 \
    --bert_attention_plugin bfloat16 \
    --gpt_attention_plugin bfloat16 \
    --remove_input_padding enable \
    --max_beam_width 1 \
    --max_batch_size "${MAX_BATCH_SIZE}" \
    --max_seq_len "${MAX_SEQ_LEN}" \
    --max_input_len "${MAX_INPUT_LEN}" \
    --max_encoder_input_len "$(( MAX_BATCH_SIZE * MAX_ENCODER_INPUT_LEN ))"

# 3. Generate TensorRT engines for visual components and combine everything into final pipeline.
# Uses the same Huggingface checkpoint directory and the same MAX_BATCH_SIZE as
# the decoder build. Expansions are quoted so each option receives exactly one
# argument.
python ./convert/build_visual_engine.py --model_type nougat \
    --model_path "tmp/hf_models/${MODEL_NAME}" \
    --max_batch_size "${MAX_BATCH_SIZE}"