#!/usr/bin/env bash
#
# Convert the Dolphin Hugging Face checkpoint into TensorRT-LLM engines.
#
# Steps:
#   1. Download the Hugging Face weights into tmp/hf_models/.
#   2. Convert the weights into a TRT-LLM checkpoint and build the decoder
#      engine with trtllm-build.
#   3. Generate the TensorRT engines for the visual components.
#
# Every path used here (tmp/..., ./convert/...) is relative to the current
# working directory, so run the script from the directory containing convert/.
#
############################################################################################
# Reference: https://github.com/NVIDIA/TensorRT-LLM/tree/v0.18.2/examples/multimodal#nougat
############################################################################################

# -e: abort on the first failing step      -u: treat unset variables as errors
# -x: trace every command as it runs       pipefail: a pipeline fails if any stage fails
set -euxo pipefail

# Prepend the tensorrt_libs and nvidia/cudnn library directories from the
# Python 3.10 site-packages tree. The ${VAR:+...} form appends the previous
# value only when it is non-empty: a bare trailing ':' would add an empty
# entry, which the dynamic loader interprets as the current directory.
export LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/tensorrt_libs/:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# 1. Download Huggingface weights
export MODEL_NAME="Dolphin"

hf_model_dir="tmp/hf_models/${MODEL_NAME}"
checkpoint_dir="tmp/trt_models/${MODEL_NAME}/bfloat16"
engine_dir="tmp/trt_engines/${MODEL_NAME}/1-gpu/bfloat16"

# git clone refuses to write into an existing non-empty directory, which would
# abort the whole script under `set -e` on a re-run; reuse a previous clone.
if [[ ! -d "${hf_model_dir}/.git" ]]; then
  git clone "https://huggingface.co/Bytedance/${MODEL_NAME}" "${hf_model_dir}"
fi

# Engine build limits, exported so child processes can see them as well.
export MAX_BATCH_SIZE=16
export MAX_SEQ_LEN=4096
export MAX_INPUT_LEN=10
export MAX_ENCODER_INPUT_LEN=784

# Value passed to --max_encoder_input_len:
# MAX_BATCH_SIZE (max_batch_size) * MAX_ENCODER_INPUT_LEN (num_visual_features)
total_encoder_input_len=$((MAX_BATCH_SIZE * MAX_ENCODER_INPUT_LEN))

# 2. Convert Huggingface weights into TRT-LLM checkpoints and build TRT engines using scripts in examples/enc_dec
python ./convert/convert_checkpoint.py --model_type bart \
  --model_dir "${hf_model_dir}" \
  --output_dir "${checkpoint_dir}" \
  --tp_size 1 \
  --pp_size 1 \
  --dtype bfloat16 \
  --nougat

trtllm-build --checkpoint_dir "${checkpoint_dir}/decoder" \
  --output_dir "${engine_dir}/decoder" \
  --paged_kv_cache disable \
  --moe_plugin disable \
  --gemm_plugin bfloat16 \
  --bert_attention_plugin bfloat16 \
  --gpt_attention_plugin bfloat16 \
  --remove_input_padding enable \
  --max_beam_width 1 \
  --max_batch_size "${MAX_BATCH_SIZE}" \
  --max_seq_len "${MAX_SEQ_LEN}" \
  --max_input_len "${MAX_INPUT_LEN}" \
  --max_encoder_input_len "${total_encoder_input_len}"

# 3. Generate TensorRT engines for visual components and combine everything into final pipeline.
python ./convert/build_visual_engine.py --model_type nougat \
  --model_path "${hf_model_dir}" \
  --max_batch_size "${MAX_BATCH_SIZE}"