diff --git a/deployment/tensorrt_llm/ReadMe.md b/deployment/tensorrt_llm/ReadMe.md
index 240ca94..ffc8c42 100644
--- a/deployment/tensorrt_llm/ReadMe.md
+++ b/deployment/tensorrt_llm/ReadMe.md
@@ -5,7 +5,9 @@
 
 ## ✅ Introduction
 The Dolphin model employs a **Swin Encoder + MBart Decoder** architecture. In the HuggingFace Transformers [Config](https://huggingface.co/ByteDance/Dolphin/blob/main/config.json), its architectures field is specified as "VisionEncoderDecoderModel". Dolphin, Nougat, and Donut share the same model architecture. TensorRT-LLM has already supported the Nougat model.
-Following Nougat's conversion script, we have successfully implemented Dolphin on TensorRT-LLM. Note: input_ids MUST be of int32 type, otherwise TensorRT-LLM will produce incorrect results.
+Following Nougat's conversion script, we have successfully implemented Dolphin on TensorRT-LLM.
+
+**Note:** prompt_ids MUST be of **int32** type, otherwise TensorRT-LLM will produce incorrect results.
 
 ## 🛠️ Installation
 > We only test TensorRT-LLM 0.18.1 on Linux.
diff --git a/deployment/tensorrt_llm/dolphin_runner.py b/deployment/tensorrt_llm/dolphin_runner.py
index ec6da0e..6985330 100644
--- a/deployment/tensorrt_llm/dolphin_runner.py
+++ b/deployment/tensorrt_llm/dolphin_runner.py
@@ -115,8 +115,9 @@ class DolphinRunner(MultimodalModelRunner):
         prompts = [f"{text.strip()} " for text in input_texts]
         images = self.processor(input_images, return_tensors="pt")['pixel_values'].to("cuda")
         prompt_ids = self.tokenizer(prompts, add_special_tokens=False, return_tensors="pt").input_ids.to("cuda")
-        prompt_ids = prompt_ids.to(
-            torch.int32)  # Important! If the type of prompt_ids is not int32, the output will be wrong.
+
+        # 🚨🚨🚨 Important! If the type of prompt_ids is not int32, the output will be wrong. 🚨🚨🚨
+        prompt_ids = prompt_ids.to(torch.int32)
 
         logger.info("---------------------------------------------------------")
         logger.info(f"images size: {images.size()}")