[WIP] Deepseek V2 MLA #10927

Draft · wants to merge 16 commits into base: main
12 changes: 6 additions & 6 deletions .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -23,29 +23,29 @@ usage() {

 while getopts "m:b:l:f:t:" OPT; do
   case ${OPT} in
     m )
       MODEL="$OPTARG"
       ;;
     b )
       BATCH_SIZE="$OPTARG"
       ;;
     l )
       LIMIT="$OPTARG"
       ;;
     f )
       FEWSHOT="$OPTARG"
       ;;
     t )
       TP_SIZE="$OPTARG"
       ;;
     \? )
       usage
       exit 1
       ;;
   esac
 done

 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096,enforce_eager=true" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"
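Not part of the diff above: for reference, the same GSM8K baseline can also be driven from Python via lm-eval's `simple_evaluate` API rather than the CLI wrapper. This is a hedged sketch; the keyword names assume a recent lm-evaluation-harness release, and the model, limit, and fewshot values are placeholders rather than values taken from this PR.

```python
# Sketch only: mirrors the CLI call above using lm-eval's Python API
# (lm_eval.simple_evaluate). Argument names assume a recent
# lm-evaluation-harness release; values are placeholders, not from this PR.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=deepseek-ai/DeepSeek-V2-Lite-Chat,"
        "tensor_parallel_size=1,trust_remote_code=true,"
        "max_model_len=4096,enforce_eager=true"
    ),
    tasks=["gsm8k"],
    num_fewshot=5,      # corresponds to the script's -f flag
    limit=250,          # corresponds to -l; placeholder value
    batch_size="auto",  # corresponds to -b
)
print(results["results"]["gsm8k"])
```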
21 changes: 18 additions & 3 deletions examples/offline_inference/basic.py
@@ -8,15 +8,30 @@
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=512,
)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(
model="deepseek-ai/DeepSeek-V2-Lite-Chat",
# model="deepseek-ai/DeepSeek-V2.5",
tensor_parallel_size=1,
trust_remote_code=True,
max_model_len=4096,
# dtype="float16",
enforce_eager=True,
# max_num_seqs=1,
# block_size=128,
# disable_mla=True,
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")