
Export & deploy updates (part I) #10941

Merged: 14 commits, Oct 29, 2024
nemo/export/vllm_exporter.py (15 additions, 10 deletions)
@@ -52,26 +52,28 @@ def wrapper(*args, **kwargs):

class vLLMExporter(ITritonDeployable):
"""
- The Exporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
+ The vLLMExporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
loading the model in vLLM, and binding that model to a Triton server.

Example:
- from nemo.export.vllm import Exporter
+ from nemo.export.vllm_exporter import vLLMExporter
from nemo.deploy import DeployPyTriton

- exporter = Exporter()
+ exporter = vLLMExporter()

exporter.export(
nemo_checkpoint='/path/to/checkpoint.nemo',
model_dir='/path/to/temp_dir',
- model_type='llama')
+ model_type='llama',
+ )

server = DeployPyTriton(
model=exporter,
- triton_model_name='LLAMA')
+ triton_model_name='LLAMA',
+ )

server.deploy()
server.serve()
server.stop()
"""

def __init__(self):
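
As a companion to the docstring example above, the deployed Triton model can be queried from a separate client process. A minimal sketch, assuming the NemoQueryLLM helper from nemo.deploy.nlp and Triton's default HTTP port; exact names and arguments may differ between NeMo releases:

# Client-side sketch (assumes nemo.deploy.nlp.NemoQueryLLM and port 8000).
from nemo.deploy.nlp import NemoQueryLLM

# model_name must match the triton_model_name passed to DeployPyTriton above.
nq = NemoQueryLLM(url="localhost:8000", model_name="LLAMA")
output = nq.query_llm(prompts=["What does vLLM do?"], max_output_len=64)
print(output)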
@@ -86,7 +88,7 @@ def export(
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: int = None,
- lora_checkpoints: List[str] = [],
+ lora_checkpoints: Optional[List[str]] = None,
dtype: str = 'auto',
seed: int = 0,
log_stats: bool = True,
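
A note on the lora_checkpoints default changed above: a mutable default such as [] is created once at function definition time and shared across calls, so state can leak between invocations; the Optional[List[str]] = None idiom avoids this. A small standalone illustration, unrelated to the exporter itself:

from typing import List, Optional

def bad(item: str, acc: List[str] = []) -> List[str]:
    acc.append(item)          # mutates the single shared default list
    return acc

def good(item: str, acc: Optional[List[str]] = None) -> List[str]:
    acc = [] if acc is None else acc   # fresh list on every call when omitted
    acc.append(item)
    return acc

print(bad("a"), bad("b"))     # ['a', 'b'] ['a', 'b']  <- values carry over
print(good("a"), good("b"))   # ['a'] ['b']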
@@ -110,6 +112,7 @@
pipeline_parallel_size (int): pipeline parallelism.
Values over 1 are not currently supported by vLLM.
max_model_len (int): model context length.
+ lora_checkpoints (List[str]): paths to LoRA checkpoints.
dtype (str): data type for model weights and activations.
Possible choices: auto, half, float16, bfloat16, float, float32
"auto" will use FP16 precision for FP32 and FP16 models,
@@ -161,7 +164,7 @@
# vllm/huggingface doesn't like the absence of a config file. Place config in load dir.
if model_config.model and not os.path.exists(os.path.join(model_config.model, 'config.json')):
with open(os.path.join(model_config.model, 'config.json'), "w") as f:
- json.dump(model_config.hf_text_config.to_dict(), f)
+ json.dump(model_config.hf_text_config.to_dict(), f, indent=2)

# Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
if quantization is not None and weight_storage in {'auto', 'memory'}:
@@ -277,10 +280,12 @@ def export(
log_stats=log_stats,
)

- def _prepare_lora_checkpoints(self, model_dir: str, lora_checkpoints: List[str], dtype) -> LoRAConfig:
+ def _prepare_lora_checkpoints(
+ self, model_dir: str, lora_checkpoints: Optional[List[str]], dtype: str
+ ) -> LoRAConfig:
self.lora_checkpoints = []

- if lora_checkpoints is None or len(lora_checkpoints) == 0:
+ if not lora_checkpoints:
return None

index = 0
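For context on where the prepared adapters end up: vLLM attaches LoRA weights per request rather than merging them into the base model. The exporter builds a LoRAConfig internally; the snippet below is only a rough sketch of vLLM's offline LoRA API with placeholder paths and names, not the exporter's actual wiring:

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# enable_lora tells vLLM to reserve resources for adapter weights.
llm = LLM(model="/path/to/converted_model", enable_lora=True)
params = SamplingParams(max_tokens=64)

# Each adapter is identified by a name, an integer id, and a local path.
adapter = LoRARequest("my_adapter", 1, "/path/to/lora_checkpoint")
outputs = llm.generate(["Hello"], params, lora_request=adapter)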
scripts/deploy/nlp/deploy_vllm_triton.py (14 additions, 18 deletions)
@@ -104,24 +104,20 @@ def get_args(argv):


def get_vllm_deployable(args, model_dir):

- try:
- exporter = vLLMExporter()
- exporter.export(
- nemo_checkpoint=args.nemo_checkpoint,
- model_dir=model_dir,
- model_type=args.model_type,
- tensor_parallel_size=args.tensor_parallelism_size,
- max_model_len=args.max_model_len,
- lora_checkpoints=args.lora_ckpt,
- dtype=args.dtype,
- weight_storage=args.weight_storage,
- gpu_memory_utilization=args.gpu_memory_utilization,
- quantization=args.quantization,
- )
- return exporter
- except Exception as error:
- raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
Review comment (collaborator, author): Suggestion: just propagate the original exception (hence removing try / catch block)

+ exporter = vLLMExporter()
+ exporter.export(
+ nemo_checkpoint=args.nemo_checkpoint,
+ model_dir=model_dir,
+ model_type=args.model_type,
+ tensor_parallel_size=args.tensor_parallelism_size,
+ max_model_len=args.max_model_len,
+ lora_checkpoints=args.lora_ckpt,
+ dtype=args.dtype,
+ weight_storage=args.weight_storage,
+ gpu_memory_utilization=args.gpu_memory_utilization,
+ quantization=args.quantization,
+ )
+ return exporter


def nemo_deploy(argv):
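Background for the review suggestion above: re-raising everything as a generic RuntimeError hides the original exception type and pushes the real traceback one level down, whereas letting the exception propagate (or chaining it with raise ... from) keeps the actual cause visible to the caller. A small sketch of the three patterns, independent of any NeMo API:

def export_swallowing(fn):
    try:
        return fn()
    except Exception as error:
        # Original type is lost; callers only ever see a RuntimeError.
        raise RuntimeError("An error has occurred during the model export: " + str(error))

def export_chaining(fn):
    try:
        return fn()
    except Exception as error:
        # Keeps the original exception attached as __cause__.
        raise RuntimeError("Model export failed") from error

def export_propagating(fn):
    # Simplest option, as in the updated script: let the real exception surface.
    return fn()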
tests/deploy/nemo_deploy.py (11 additions, 12 deletions)
@@ -180,8 +180,7 @@ def run_trt_llm_inference(
stop_words_list=None,
test_deployment=False,
test_data_path=None,
backend="TensorRT-LLM",
save_trt_engine=False,
save_engine=False,
):
if Path(checkpoint_path).exists():
if n_gpu > torch.cuda.device_count():
@@ -319,14 +318,14 @@ def run_trt_llm_inference(
if test_deployment:
nm.stop()

- if not save_trt_engine:
+ if not save_engine:
shutil.rmtree(trt_llm_model_dir)
return result

if test_deployment:
nm.stop()

- if not save_trt_engine:
+ if not save_engine:
shutil.rmtree(trt_llm_model_dir)

return None, None, None, None, None
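
The save_engine flag controls whether the exported engine directory is kept after the run or removed with shutil.rmtree. When an engine is always throwaway, a context-managed temporary directory gives equivalent cleanup automatically, even on exceptions; a minimal sketch of that alternative, not how this test script is organized:

import tempfile

def run_with_engine_dir(save_engine: bool, run_fn, keep_dir: str = "/tmp/engine"):
    if save_engine:
        return run_fn(keep_dir)          # keep the engine for inspection or reuse
    with tempfile.TemporaryDirectory() as tmp_dir:
        return run_fn(tmp_dir)           # directory removed when the block exits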
@@ -368,7 +367,7 @@ def run_existing_checkpoints(
stop_words_list=None,
test_data_path=None,
backend="tensorrt-llm",
- save_trt_engine=False,
+ save_engine=False,
):
if n_gpus > torch.cuda.device_count():
print("Skipping the test due to not enough number of GPUs")
@@ -433,7 +432,7 @@ def run_existing_checkpoints(
stop_words_list=stop_words_list,
test_deployment=test_deployment,
test_data_path=test_data_path,
- save_trt_engine=save_trt_engine,
+ save_engine=save_engine,
)


@@ -573,7 +572,7 @@ def get_args():
help="Different options to deploy nemo model.",
)
parser.add_argument(
"--save_trt_engine",
"--save_engine",
type=str,
default="False",
)
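
The flag is declared as a plain string with default "False" and converted later by string comparison (see run_inference_tests below). An alternative that moves the conversion into the parser itself is sketched here; the scripts in this PR keep the string-based approach:

import argparse

def parse_bool(value: str) -> bool:
    # Accept common spellings instead of comparing against the literal "True".
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--save_engine", type=parse_bool, default=False)
print(parser.parse_args(["--save_engine", "true"]).save_engine)  # True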
@@ -587,10 +586,10 @@ def run_inference_tests(args):
else:
args.test_deployment = False

- if args.save_trt_engine == "True":
- args.save_trt_engine = True
+ if args.save_engine == "True":
+ args.save_engine = True
else:
- args.save_trt_engine = False
+ args.save_engine = False

if args.run_accuracy == "True":
args.run_accuracy = True
@@ -621,7 +620,7 @@ def run_inference_tests(args):
run_accuracy=args.run_accuracy,
test_data_path=args.test_data_path,
backend=args.backend.lower(),
- save_trt_engine=args.save_trt_engine,
+ save_engine=args.save_engine,
)

n_gpus = n_gpus * 2
@@ -658,7 +657,7 @@ def run_inference_tests(args):
streaming=args.streaming,
test_deployment=args.test_deployment,
test_data_path=args.test_data_path,
- save_trt_engine=args.save_trt_engine,
+ save_engine=args.save_engine,
)
else:
result_dic[n_gpus] = run_in_framework_inference(
tests/export/nemo_export.py (9 additions, 12 deletions)
@@ -241,7 +241,7 @@ def run_inference(
test_cpp_runtime=False,
test_deployment=False,
test_data_path=None,
- save_trt_engine=False,
+ save_engine=False,
fp8_quantized=False,
fp8_kvcache=False,
trt_llm_export_kwargs=None,
@@ -442,7 +442,7 @@ def run_inference(
if test_deployment:
nm.stop()

- if not save_trt_engine and model_dir:
+ if not save_engine and model_dir:
shutil.rmtree(model_dir)

return (functional_result, accuracy_result)
@@ -464,7 +464,7 @@ def run_existing_checkpoints(
test_deployment=False,
stop_words_list=None,
test_data_path=None,
- save_trt_engine=False,
+ save_engine=False,
in_framework=False,
fp8_quantized=False,
fp8_kvcache=False,
@@ -497,9 +497,6 @@
else:
use_embedding_sharing = False

- if trt_llm_export_kwargs is None:
- trt_llm_export_kwargs = {}
Review comment (collaborator, author): Not needed here


if in_framework:
return run_in_framework_inference(
model_name=model_name,
@@ -542,7 +539,7 @@
test_cpp_runtime=test_cpp_runtime,
test_deployment=test_deployment,
test_data_path=test_data_path,
- save_trt_engine=save_trt_engine,
+ save_engine=save_engine,
fp8_quantized=fp8_quantized,
fp8_kvcache=fp8_kvcache,
trt_llm_export_kwargs=trt_llm_export_kwargs,
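
On the "Not needed here" comment above: run_existing_checkpoints only forwards trt_llm_export_kwargs, so defaulting None to {} at this level is redundant as long as the consumer normalizes it. One plausible reading of that contract, sketched with hypothetical names rather than the real call chain:

from typing import Optional

def forwarder(trt_llm_export_kwargs: Optional[dict] = None):
    # No normalization here; just pass the value through unchanged.
    return consumer(trt_llm_export_kwargs=trt_llm_export_kwargs)

def consumer(trt_llm_export_kwargs: Optional[dict] = None):
    kwargs = trt_llm_export_kwargs or {}   # single place that handles None
    return kwargs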
@@ -744,7 +741,7 @@ def get_args():
default=None,
)
parser.add_argument(
"--save_trt_engine",
"--save_engine",
type=str,
default="False",
)
@@ -811,7 +808,7 @@ def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
args.test_cpp_runtime = str_to_bool("test_cpp_runtime", args.test_cpp_runtime)
args.test_deployment = str_to_bool("test_deployment", args.test_deployment)
args.functional_test = str_to_bool("functional_test", args.functional_test)
- args.save_trt_engine = str_to_bool("save_trt_engin", args.save_trt_engine)
+ args.save_engine = str_to_bool("save_engine", args.save_engine)
Review comment (collaborator, author): Renamed as this flag is for both TRT-LLM and vLLM engines.

args.run_accuracy = str_to_bool("run_accuracy", args.run_accuracy)
args.use_vllm = str_to_bool("use_vllm", args.use_vllm)
args.lora = str_to_bool("lora", args.lora)
@@ -871,7 +868,7 @@ def run_inference_tests(args):
test_cpp_runtime=args.test_cpp_runtime,
run_accuracy=args.run_accuracy,
test_data_path=args.test_data_path,
- save_trt_engine=args.save_trt_engine,
+ save_engine=args.save_engine,
in_framework=args.in_framework,
fp8_quantized=args.export_fp8_quantized,
fp8_kvcache=args.use_fp8_kv_cache,
@@ -900,7 +897,7 @@ def run_inference_tests(args):
top_p=args.top_p,
temperature=args.temperature,
run_accuracy=args.run_accuracy,
- debug=True,
+ debug=args.debug,
test_data_path=args.test_data_path,
)
else:
@@ -932,7 +929,7 @@ def run_inference_tests(args):
test_deployment=args.test_deployment,
test_cpp_runtime=args.test_cpp_runtime,
test_data_path=args.test_data_path,
- save_trt_engine=args.save_trt_engine,
+ save_engine=args.save_engine,
fp8_quantized=args.export_fp8_quantized,
fp8_kvcache=args.use_fp8_kv_cache,
trt_llm_export_kwargs=args.trt_llm_export_kwargs,