Export & deploy updates (part I) (NVIDIA#10941)
* Update vLLMExporter docstring

Signed-off-by: Jan Lasek <[email protected]>

* No need to create empty kwargs here

Signed-off-by: Jan Lasek <[email protected]>

* Use debug from command line

Signed-off-by: Jan Lasek <[email protected]>

* Param save_engine for both vLLM and TRT-LLM

Signed-off-by: Jan Lasek <[email protected]>

* Unused backend param in run_trt_llm_inference

Signed-off-by: Jan Lasek <[email protected]>

* Reindent files for non-existent checkpoint check

Signed-off-by: Jan Lasek <[email protected]>

* Docs for lora_checkpoints

Signed-off-by: Jan Lasek <[email protected]>

* Improve config readability

Signed-off-by: Jan Lasek <[email protected]>

* Raise error directly in get_vllm_deployable

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

* Revert "Reindent files for non-existent checkpoint check"

This reverts commit 8499d50.

Signed-off-by: Jan Lasek <[email protected]>

* Cut off prompt for real

Signed-off-by: Jan Lasek <[email protected]>

* Apply isort and black reformatting

Signed-off-by: janekl <[email protected]>

---------

Signed-off-by: Jan Lasek <[email protected]>
Signed-off-by: janekl <[email protected]>
Co-authored-by: janekl <[email protected]>
Signed-off-by: Hainan Xu <[email protected]>
2 people authored and Hainan Xu committed Nov 5, 2024
1 parent 16405bc commit d7296a5
Showing 4 changed files with 50 additions and 53 deletions.
25 changes: 15 additions & 10 deletions nemo/export/vllm_exporter.py
@@ -52,26 +52,28 @@ def wrapper(*args, **kwargs):

class vLLMExporter(ITritonDeployable):
"""
The Exporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
The vLLMExporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
loading the model in vLLM, and binding that model to a Triton server.
Example:
from nemo.export.vllm import Exporter
from nemo.export.vllm_exporter import vLLMExporter
from nemo.deploy import DeployPyTriton
exporter = Exporter()
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint='/path/to/checkpoint.nemo',
model_dir='/path/to/temp_dir',
model_type='llama')
model_type='llama',
)
server = DeployPyTriton(
model=exporter,
triton_model_name='LLAMA')
triton_model_name='LLAMA',
)
server.deploy()
server.serve()
server.stop()
"""

def __init__(self):
@@ -86,7 +88,7 @@ def export(
tensor_parallel_size: int = 1,
pipeline_parallel_size: int = 1,
max_model_len: int = None,
lora_checkpoints: List[str] = [],
lora_checkpoints: Optional[List[str]] = None,
dtype: str = 'auto',
seed: int = 0,
log_stats: bool = True,
@@ -110,6 +112,7 @@
pipeline_parallel_size (int): pipeline parallelism.
Values over 1 are not currently supported by vLLM.
max_model_len (int): model context length.
lora_checkpoints (List[str]): paths to LoRA checkpoints.
dtype (str): data type for model weights and activations.
Possible choices: auto, half, float16, bfloat16, float, float32
"auto" will use FP16 precision for FP32 and FP16 models,
@@ -161,7 +164,7 @@ def export(
# vllm/huggingface doesn't like the absence of a config file. Place config in load dir.
if model_config.model and not os.path.exists(os.path.join(model_config.model, 'config.json')):
with open(os.path.join(model_config.model, 'config.json'), "w") as f:
json.dump(model_config.hf_text_config.to_dict(), f)
json.dump(model_config.hf_text_config.to_dict(), f, indent=2)

# Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
if quantization is not None and weight_storage in {'auto', 'memory'}:
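
Note: the `indent=2` change above just makes the generated config.json human-readable. A tiny standalone illustration (not NeMo code) of the difference:

```python
import json

config = {"model_type": "llama", "hidden_size": 4096}

print(json.dumps(config))            # single compact line: {"model_type": "llama", ...}
print(json.dumps(config, indent=2))  # pretty-printed, one key per line with 2-space indent
```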
@@ -277,10 +280,12 @@ def export(
log_stats=log_stats,
)

def _prepare_lora_checkpoints(self, model_dir: str, lora_checkpoints: List[str], dtype) -> LoRAConfig:
def _prepare_lora_checkpoints(
self, model_dir: str, lora_checkpoints: Optional[List[str]], dtype: str
) -> LoRAConfig:
self.lora_checkpoints = []

if lora_checkpoints is None or len(lora_checkpoints) == 0:
if not lora_checkpoints:
return None

index = 0
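Note: switching the default from `lora_checkpoints: List[str] = []` to `Optional[List[str]] = None` (with the `if not lora_checkpoints` guard) avoids Python's shared mutable default pitfall. A minimal standalone sketch (not NeMo code) of why the mutable default is risky:

```python
from typing import List, Optional

def append_bad(item: str, items: List[str] = []) -> List[str]:
    items.append(item)  # mutates the single list created at function-definition time
    return items

def append_good(item: str, items: Optional[List[str]] = None) -> List[str]:
    items = [] if items is None else items  # fresh list per call when the argument is omitted
    items.append(item)
    return items

print(append_bad("a"), append_bad("b"))    # ['a', 'b'] ['a', 'b'] -- the default list is shared
print(append_good("a"), append_good("b"))  # ['a'] ['b']
```

The `if not lora_checkpoints:` check also treats both `None` and an empty list as "no LoRA checkpoints", which is why it can replace the earlier explicit `is None or len(...) == 0` test.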
32 changes: 14 additions & 18 deletions scripts/deploy/nlp/deploy_vllm_triton.py
@@ -104,24 +104,20 @@ def get_args(argv):


def get_vllm_deployable(args, model_dir):

try:
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint=args.nemo_checkpoint,
model_dir=model_dir,
model_type=args.model_type,
tensor_parallel_size=args.tensor_parallelism_size,
max_model_len=args.max_model_len,
lora_checkpoints=args.lora_ckpt,
dtype=args.dtype,
weight_storage=args.weight_storage,
gpu_memory_utilization=args.gpu_memory_utilization,
quantization=args.quantization,
)
return exporter
except Exception as error:
raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
exporter = vLLMExporter()
exporter.export(
nemo_checkpoint=args.nemo_checkpoint,
model_dir=model_dir,
model_type=args.model_type,
tensor_parallel_size=args.tensor_parallelism_size,
max_model_len=args.max_model_len,
lora_checkpoints=args.lora_ckpt,
dtype=args.dtype,
weight_storage=args.weight_storage,
gpu_memory_utilization=args.gpu_memory_utilization,
quantization=args.quantization,
)
return exporter


def nemo_deploy(argv):
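Note: dropping the try/except in `get_vllm_deployable` lets the original exception surface with its full traceback instead of being flattened into a generic `RuntimeError`. A standalone sketch contrasting the two styles (hypothetical helper names, not the script's code):

```python
def export_wrapped(export_fn):
    # Old style: the root cause is reduced to a message string and the original
    # traceback ends up buried behind a new RuntimeError.
    try:
        return export_fn()
    except Exception as error:
        raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))

def export_direct(export_fn):
    # New style: let the exception propagate untouched. If wrapping were still wanted,
    # "raise RuntimeError(...) from error" would at least preserve the exception chain.
    return export_fn()
```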
23 changes: 11 additions & 12 deletions tests/deploy/nemo_deploy.py
@@ -180,8 +180,7 @@ def run_trt_llm_inference(
stop_words_list=None,
test_deployment=False,
test_data_path=None,
backend="TensorRT-LLM",
save_trt_engine=False,
save_engine=False,
):
if Path(checkpoint_path).exists():
if n_gpu > torch.cuda.device_count():
@@ -319,14 +318,14 @@
if test_deployment:
nm.stop()

if not save_trt_engine:
if not save_engine:
shutil.rmtree(trt_llm_model_dir)
return result

if test_deployment:
nm.stop()

if not save_trt_engine:
if not save_engine:
shutil.rmtree(trt_llm_model_dir)

return None, None, None, None, None
@@ -368,7 +367,7 @@ def run_existing_checkpoints(
stop_words_list=None,
test_data_path=None,
backend="tensorrt-llm",
save_trt_engine=False,
save_engine=False,
):
if n_gpus > torch.cuda.device_count():
print("Skipping the test due to not enough number of GPUs")
@@ -433,7 +432,7 @@
stop_words_list=stop_words_list,
test_deployment=test_deployment,
test_data_path=test_data_path,
save_trt_engine=save_trt_engine,
save_engine=save_engine,
)


@@ -573,7 +572,7 @@ def get_args():
help="Different options to deploy nemo model.",
)
parser.add_argument(
"--save_trt_engine",
"--save_engine",
type=str,
default="False",
)
@@ -587,10 +586,10 @@ def run_inference_tests(args):
else:
args.test_deployment = False

if args.save_trt_engine == "True":
args.save_trt_engine = True
if args.save_engine == "True":
args.save_engine = True
else:
args.save_trt_engine = False
args.save_engine = False

if args.run_accuracy == "True":
args.run_accuracy = True
@@ -621,7 +620,7 @@ def run_inference_tests(args):
run_accuracy=args.run_accuracy,
test_data_path=args.test_data_path,
backend=args.backend.lower(),
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
)

n_gpus = n_gpus * 2
@@ -658,7 +657,7 @@
streaming=args.streaming,
test_deployment=args.test_deployment,
test_data_path=args.test_data_path,
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
)
else:
result_dic[n_gpus] = run_in_framework_inference(
23 changes: 10 additions & 13 deletions tests/export/nemo_export.py
@@ -241,7 +241,7 @@ def run_inference(
test_cpp_runtime=False,
test_deployment=False,
test_data_path=None,
save_trt_engine=False,
save_engine=False,
fp8_quantized=False,
fp8_kvcache=False,
trt_llm_export_kwargs=None,
@@ -442,7 +442,7 @@ def run_inference(
if test_deployment:
nm.stop()

if not save_trt_engine and model_dir:
if not save_engine and model_dir:
shutil.rmtree(model_dir)

return (functional_result, accuracy_result)
@@ -464,7 +464,7 @@
test_deployment=False,
stop_words_list=None,
test_data_path=None,
save_trt_engine=False,
save_engine=False,
in_framework=False,
fp8_quantized=False,
fp8_kvcache=False,
@@ -497,9 +497,6 @@
else:
use_embedding_sharing = False

if trt_llm_export_kwargs is None:
trt_llm_export_kwargs = {}

if in_framework:
return run_in_framework_inference(
model_name=model_name,
@@ -542,7 +539,7 @@
test_cpp_runtime=test_cpp_runtime,
test_deployment=test_deployment,
test_data_path=test_data_path,
save_trt_engine=save_trt_engine,
save_engine=save_engine,
fp8_quantized=fp8_quantized,
fp8_kvcache=fp8_kvcache,
trt_llm_export_kwargs=trt_llm_export_kwargs,
@@ -591,7 +588,7 @@ def run_in_framework_inference(
output_deployed = output_deployed["sentences"]
# MegatronLLMDeployable will return the prompt + generated output, so cut off the prompt
for i, output in enumerate(output_deployed):
output = output[len(prompts[i]) :]
output_deployed[i, :] = output[0][len(prompts[i]) :]

# Unwrap the generator if needed
output_deployed = list(output_deployed)
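
Note: the "Cut off prompt for real" fix writes the sliced text back into the container; the previous version only rebound a loop-local variable, so the prompt was never actually removed. A minimal standalone illustration (toy strings, not the test's data):

```python
prompts = ["What is the capital of France?"]
outputs = ["What is the capital of France? Paris is the capital of France."]

# Ineffective: rebinding the loop variable leaves `outputs` unchanged.
for i, out in enumerate(outputs):
    out = out[len(prompts[i]):]

# Effective: assign the sliced string back into the list.
for i, out in enumerate(outputs):
    outputs[i] = out[len(prompts[i]):]

print(outputs)  # [' Paris is the capital of France.']
```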
@@ -744,7 +741,7 @@ def get_args():
default=None,
)
parser.add_argument(
"--save_trt_engine",
"--save_engine",
type=str,
default="False",
)
@@ -811,7 +808,7 @@ def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
args.test_cpp_runtime = str_to_bool("test_cpp_runtime", args.test_cpp_runtime)
args.test_deployment = str_to_bool("test_deployment", args.test_deployment)
args.functional_test = str_to_bool("functional_test", args.functional_test)
args.save_trt_engine = str_to_bool("save_trt_engin", args.save_trt_engine)
args.save_engine = str_to_bool("save_engine", args.save_engine)
args.run_accuracy = str_to_bool("run_accuracy", args.run_accuracy)
args.use_vllm = str_to_bool("use_vllm", args.use_vllm)
args.lora = str_to_bool("lora", args.lora)
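
Note: the change above both renames the attribute (save_trt_engine -> save_engine) and fixes the typo in the name string passed to `str_to_bool` ("save_trt_engin"). For context, a minimal sketch of what a helper with the signature shown in this hunk might look like; the body below is an assumption, not the file's actual implementation:

```python
from typing import Optional

def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
    """Parse a 'True'/'False' command-line string; return None when optional and unset."""
    value = s.strip().lower()
    if optional and value in ("", "none"):
        return None
    if value == "true":
        return True
    if value == "false":
        return False
    raise ValueError(f"Invalid boolean value '{s}' for argument '{name}'")
```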
@@ -871,7 +868,7 @@ def run_inference_tests(args):
test_cpp_runtime=args.test_cpp_runtime,
run_accuracy=args.run_accuracy,
test_data_path=args.test_data_path,
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
in_framework=args.in_framework,
fp8_quantized=args.export_fp8_quantized,
fp8_kvcache=args.use_fp8_kv_cache,
@@ -900,7 +897,7 @@
top_p=args.top_p,
temperature=args.temperature,
run_accuracy=args.run_accuracy,
debug=True,
debug=args.debug,
test_data_path=args.test_data_path,
)
else:
@@ -932,7 +929,7 @@
test_deployment=args.test_deployment,
test_cpp_runtime=args.test_cpp_runtime,
test_data_path=args.test_data_path,
save_trt_engine=args.save_trt_engine,
save_engine=args.save_engine,
fp8_quantized=args.export_fp8_quantized,
fp8_kvcache=args.use_fp8_kv_cache,
trt_llm_export_kwargs=args.trt_llm_export_kwargs,
