Export & deploy updates (part I) (NVIDIA#10941)
* Update vLLMExporter docstring

Signed-off-by: Jan Lasek <[email protected]>

* No need to create empty kwargs here

Signed-off-by: Jan Lasek <[email protected]>

* Use debug from command line

Signed-off-by: Jan Lasek <[email protected]>

* Param save_engine for both vLLM and TRT-LLM

Signed-off-by: Jan Lasek <[email protected]>

* Unused backend param in run_trt_llm_inference

Signed-off-by: Jan Lasek <[email protected]>

* Reindent files for non-existent checkpoint check

Signed-off-by: Jan Lasek <[email protected]>

* Docs for lora_checkpoints

Signed-off-by: Jan Lasek <[email protected]>

* Improve config readability

Signed-off-by: Jan Lasek <[email protected]>

* Raise error directly in get_vllm_deployable

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

* Revert "Reindent files for non-existent checkpoint check"

This reverts commit 8499d50.

Signed-off-by: Jan Lasek <[email protected]>

* Cut off prompt for real

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

---------

Signed-off-by: Jan Lasek <[email protected]>
Signed-off-by: janekl <[email protected]>
Co-authored-by: janekl <[email protected]>
Signed-off-by: Hainan Xu <[email protected]>
2 people authored and Hainan Xu committed Nov 5, 2024
1 parent 16405bc commit d7296a5
Showing 4 changed files with 50 additions and 53 deletions.
25 changes: 15 additions & 10 deletions nemo/export/vllm_exporter.py
@@ -52,26 +52,28 @@ def wrapper(*args, **kwargs):

class vLLMExporter(ITritonDeployable):
"""
The Exporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
The vLLMExporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
loading the model in vLLM, and binding that model to a Triton server.
Example:
from nemo.export.vllm import Exporter
from nemo.export.vllm_exporter import vLLMExporter
from nemo.deploy import DeployPyTriton
exporter = Exporter()
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint='/path/to/checkpoint.nemo',
model_dir='/path/to/temp_dir',
model_type='llama')
model_type='llama',
)
server = DeployPyTriton(
model=exporter,
triton_model_name='LLAMA')
triton_model_name='LLAMA',
)
server.deploy()
server.serve()
server.stop()
"""

def __init__(self):
@@ -86,7 +88,7 @@ def export(
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: int = None,
lora_checkpoints: List[str] = [],
lora_checkpoints: Optional[List[str]] = None,
dtype: str = 'auto',
seed: int = 0,
log_stats: bool = True,
@@ -110,6 +112,7 @@
pipeline_parallel_size (int): pipeline parallelism.
Values over 1 are not currently supported by vLLM.
max_model_len (int): model context length.
lora_checkpoints (List[str]): paths to LoRA checkpoints.
dtype (str): data type for model weights and activations.
Possible choices: auto, half, float16, bfloat16, float, float32
"auto" will use FP16 precision for FP32 and FP16 models,
@@ -161,7 +164,7 @@ def export(
# vllm/huggingface doesn't like the absence of a config file. Place config in load dir.
if model_config.model and not os.path.exists(os.path.join(model_config.model, 'config.json')):
with open(os.path.join(model_config.model, 'config.json'), "w") as f:
json.dump(model_config.hf_text_config.to_dict(), f)
json.dump(model_config.hf_text_config.to_dict(), f, indent=2)

# Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
if quantization is not None and weight_storage in {'auto', 'memory'}:
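
Note: the `indent=2` change above just makes the generated config.json human-readable. A tiny standalone illustration (not NeMo code) of the difference:

```python
import json

config = {"model_type": "llama", "hidden_size": 4096}

print(json.dumps(config))            # single compact line: {"model_type": "llama", ...}
print(json.dumps(config, indent=2))  # pretty-printed, one key per line with 2-space indent
```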
@@ -277,10 +280,12 @@ def export(
log_stats=log_stats,
)

def _prepare_lora_checkpoints(self, model_dir: str, lora_checkpoints: List[str], dtype) -> LoRAConfig:
def _prepare_lora_checkpoints(
self, model_dir: str, lora_checkpoints: Optional[List[str]], dtype: str
) -> LoRAConfig:
self.lora_checkpoints = []

if lora_checkpoints is None or len(lora_checkpoints) == 0:
if not lora_checkpoints:
return None

index = 0
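Note: switching the default from `lora_checkpoints: List[str] = []` to `Optional[List[str]] = None` (with the `if not lora_checkpoints` guard) avoids Python's shared mutable default pitfall. A minimal standalone sketch (not NeMo code) of why the mutable default is risky:

```python
from typing import List, Optional

def append_bad(item: str, items: List[str] = []) -> List[str]:
    items.append(item)  # mutates the single list created at function-definition time
    return items

def append_good(item: str, items: Optional[List[str]] = None) -> List[str]:
    items = [] if items is None else items  # fresh list per call when the argument is omitted
    items.append(item)
    return items

print(append_bad("a"), append_bad("b"))    # ['a', 'b'] ['a', 'b'] -- the default list is shared
print(append_good("a"), append_good("b"))  # ['a'] ['b']
```

The `if not lora_checkpoints:` check also treats both `None` and an empty list as "no LoRA checkpoints", which is why it can replace the earlier explicit `is None or len(...) == 0` test.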
32 changes: 14 additions & 18 deletions scripts/deploy/nlp/deploy_vllm_triton.py
@@ -104,24 +104,20 @@ def get_args(argv):


def get_vllm_deployable(args, model_dir):

try:
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint=args.nemo_checkpoint,
model_dir=model_dir,
model_type=args.model_type,
tensor_parallel_size=args.tensor_parallelism_size,
max_model_len=args.max_model_len,
lora_checkpoints=args.lora_ckpt,
dtype=args.dtype,
weight_storage=args.weight_storage,
gpu_memory_utilization=args.gpu_memory_utilization,
quantization=args.quantization,
)
return exporter
except Exception as error:
raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint=args.nemo_checkpoint,
model_dir=model_dir,
model_type=args.model_type,
tensor_parallel_size=args.tensor_parallelism_size,
max_model_len=args.max_model_len,
lora_checkpoints=args.lora_ckpt,
dtype=args.dtype,
weight_storage=args.weight_storage,
gpu_memory_utilization=args.gpu_memory_utilization,
quantization=args.quantization,
)
return exporter


def nemo_deploy(argv):
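Note: dropping the try/except in `get_vllm_deployable` lets the original exception surface with its full traceback instead of being flattened into a generic `RuntimeError`. A standalone sketch contrasting the two styles (hypothetical helper names, not the script's code):

```python
def export_wrapped(export_fn):
    # Old style: the root cause is reduced to a message string and the original
    # traceback ends up buried behind a new RuntimeError.
    try:
        return export_fn()
    except Exception as error:
        raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))

def export_direct(export_fn):
    # New style: let the exception propagate untouched. If wrapping were still wanted,
    # "raise RuntimeError(...) from error" would at least preserve the exception chain.
    return export_fn()
```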
23 changes: 11 additions & 12 deletions tests/deploy/nemo_deploy.py
@@ -180,8 +180,7 @@ def run_trt_llm_inference(
stop_words_list=None,
test_deployment=False,
test_data_path=None,
backend="TensorRT-LLM",
save_trt_engine=False,
save_engine=False,
):
if Path(checkpoint_path).exists():
if n_gpu > torch.cuda.device_count():
@@ -319,14 +318,14 @@
if test_deployment:
nm.stop()

if not save_trt_engine:
if not save_engine:
shutil.rmtree(trt_llm_model_dir)
return result

if test_deployment:
nm.stop()

if not save_trt_engine:
if not save_engine:
shutil.rmtree(trt_llm_model_dir)

return None, None, None, None, None
@@ -368,7 +367,7 @@ def run_existing_checkpoints(
stop_words_list=None,
test_data_path=None,
backend="tensorrt-llm",
save_trt_engine=False,
save_engine=False,
):
if n_gpus > torch.cuda.device_count():
print("Skipping the test due to not enough number of GPUs")
@@ -433,7 +432,7 @@
stop_words_list=stop_words_list,
test_deployment=test_deployment,
test_data_path=test_data_path,
save_trt_engine=save_trt_engine,
save_engine=save_engine,
)


@@ -573,7 +572,7 @@ def get_args():
help="Different options to deploy nemo model.",
)
parser.add_argument(
"--save_trt_engine",
"--save_engine",
type=str,
default="False",
)
@@ -587,10 +586,10 @@ def run_inference_tests(args):
else:
args.test_deployment = False

if args.save_trt_engine == "True":
args.save_trt_engine = True
if args.save_engine == "True":
args.save_engine = True
else:
args.save_trt_engine = False
args.save_engine = False

if args.run_accuracy == "True":
args.run_accuracy = True
@@ -621,7 +620,7 @@ def run_inference_tests(args):
run_accuracy=args.run_accuracy,
test_data_path=args.test_data_path,
backend=args.backend.lower(),
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
)

n_gpus = n_gpus * 2
@@ -658,7 +657,7 @@
streaming=args.streaming,
test_deployment=args.test_deployment,
test_data_path=args.test_data_path,
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
)
else:
result_dic[n_gpus] = run_in_framework_inference(
23 changes: 10 additions & 13 deletions tests/export/nemo_export.py
@@ -241,7 +241,7 @@ def run_inference(
test_cpp_runtime=False,
test_deployment=False,
test_data_path=None,
save_trt_engine=False,
save_engine=False,
fp8_quantized=False,
fp8_kvcache=False,
trt_llm_export_kwargs=None,
@@ -442,7 +442,7 @@ def run_inference(
if test_deployment:
nm.stop()

if not save_trt_engine and model_dir:
if not save_engine and model_dir:
shutil.rmtree(model_dir)

return (functional_result, accuracy_result)
@@ -464,7 +464,7 @@
test_deployment=False,
stop_words_list=None,
test_data_path=None,
save_trt_engine=False,
save_engine=False,
in_framework=False,
fp8_quantized=False,
fp8_kvcache=False,
@@ -497,9 +497,6 @@
else:
use_embedding_sharing = False

if trt_llm_export_kwargs is None:
trt_llm_export_kwargs = {}

if in_framework:
return run_in_framework_inference(
model_name=model_name,
@@ -542,7 +539,7 @@
test_cpp_runtime=test_cpp_runtime,
test_deployment=test_deployment,
test_data_path=test_data_path,
save_trt_engine=save_trt_engine,
save_engine=save_engine,
fp8_quantized=fp8_quantized,
fp8_kvcache=fp8_kvcache,
trt_llm_export_kwargs=trt_llm_export_kwargs,
@@ -591,7 +588,7 @@ def run_in_framework_inference(
output_deployed = output_deployed["sentences"]
# MegatronLLMDeployable will return the prompt + generated output, so cut off the prompt
for i, output in enumerate(output_deployed):
output = output[len(prompts[i]) :]
output_deployed[i, :] = output[0][len(prompts[i]) :]

# Unwrap the generator if needed
output_deployed = list(output_deployed)
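
Note: the "Cut off prompt for real" fix writes the sliced text back into the container; the previous version only rebound a loop-local variable, so the prompt was never actually removed. A minimal standalone illustration (toy strings, not the test's data):

```python
prompts = ["What is the capital of France?"]
outputs = ["What is the capital of France? Paris is the capital of France."]

# Ineffective: rebinding the loop variable leaves `outputs` unchanged.
for i, out in enumerate(outputs):
    out = out[len(prompts[i]):]

# Effective: assign the sliced string back into the list.
for i, out in enumerate(outputs):
    outputs[i] = out[len(prompts[i]):]

print(outputs)  # [' Paris is the capital of France.']
```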
@@ -744,7 +741,7 @@ def get_args():
default=None,
)
parser.add_argument(
"--save_trt_engine",
"--save_engine",
type=str,
default="False",
)
@@ -811,7 +808,7 @@ def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
args.test_cpp_runtime = str_to_bool("test_cpp_runtime", args.test_cpp_runtime)
args.test_deployment = str_to_bool("test_deployment", args.test_deployment)
args.functional_test = str_to_bool("functional_test", args.functional_test)
args.save_trt_engine = str_to_bool("save_trt_engin", args.save_trt_engine)
args.save_engine = str_to_bool("save_engine", args.save_engine)
args.run_accuracy = str_to_bool("run_accuracy", args.run_accuracy)
args.use_vllm = str_to_bool("use_vllm", args.use_vllm)
args.lora = str_to_bool("lora", args.lora)
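
Note: the change above both renames the attribute (save_trt_engine -> save_engine) and fixes the typo in the name string passed to `str_to_bool` ("save_trt_engin"). For context, a minimal sketch of what a helper with the signature shown in this hunk might look like; the body below is an assumption, not the file's actual implementation:

```python
from typing import Optional

def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
    """Parse a 'True'/'False' command-line string; return None when optional and unset."""
    value = s.strip().lower()
    if optional and value in ("", "none"):
        return None
    if value == "true":
        return True
    if value == "false":
        return False
    raise ValueError(f"Invalid boolean value '{s}' for argument '{name}'")
```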
@@ -871,7 +868,7 @@ def run_inference_tests(args):
test_cpp_runtime=args.test_cpp_runtime,
run_accuracy=args.run_accuracy,
test_data_path=args.test_data_path,
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
in_framework=args.in_framework,
fp8_quantized=args.export_fp8_quantized,
fp8_kvcache=args.use_fp8_kv_cache,
@@ -900,7 +897,7 @@
top_p=args.top_p,
temperature=args.temperature,
run_accuracy=args.run_accuracy,
debug=True,
debug=args.debug,
test_data_path=args.test_data_path,
)
else:
@@ -932,7 +929,7 @@
test_deployment=args.test_deployment,
test_cpp_runtime=args.test_cpp_runtime,
test_data_path=args.test_data_path,
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
fp8_quantized=args.export_fp8_quantized,
fp8_kvcache=args.use_fp8_kv_cache,
trt_llm_export_kwargs=args.trt_llm_export_kwargs,
