Performance degradation on dGPU Arc A770 after loading more than one LLM model #12660
```python
import os

WHISPER_SAMPLING_RATE = 16000

# Function bodies are omitted here; see the attached test script.
def test_chatglm(llm_model, llm_tokenizer, report, is_report): ...
def test_sd(sd_model, report, is_report): ...
def test_minicpm(model, tokenizer, report, is_report): ...
def test_whisper(whisper_processor, whisper_model, report, is_report): ...

if __name__ == '__main__':
    ...
```
Please use the values below to run the test cases (to run a case, set its flag to True):
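A minimal sketch of what those switches might look like (the flag names below are assumptions for illustration, not the names used in the attached script):

```python
# Hypothetical boolean switches selecting which test cases to run
# (names are assumptions, not the flags from the original test script).
RUN_CHATGLM = True
RUN_SD = False
RUN_MINICPM = False
RUN_WHISPER = True

def main():
    # Each enabled branch would load its model and call the matching
    # test_* function from the script above.
    if RUN_CHATGLM:
        print("running chatglm test case")
    if RUN_SD:
        print("running stable-diffusion test case")
    if RUN_MINICPM:
        print("running minicpm test case")
    if RUN_WHISPER:
        print("running whisper test case")

if __name__ == '__main__':
    main()
```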
The code you provided has confusing indentation. Could you provide a properly formatted version?
Please rename the attached test.txt to test.py before running it.
Have you made any progress on this issue? Is there any other information you need from me?
Thank you, we have reproduced the results you provided; the root cause is still under analysis.
You need to comment out your warm-up code. After commenting out all of the warm-up code, the test results show the following: after loading multiple models, model performance does not decrease. Whisper shows abnormal performance when loaded together with other non-text large models, but performs normally when loaded together with ChatGLM, so the problem should not be related to the XPU.
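For reference, "commenting out the warm-up code" here means disabling the extra inference pass normally run before timing, roughly like this (a minimal sketch; the model, input names, and use of torch.xpu are assumptions based on the Arc GPU setup above, not the original script):

```python
import time
import torch

def timed_generate(model, inputs, max_new_tokens=32):
    """Time a single generate() call; assumes model and inputs are already on the XPU."""
    # Warm-up pass, commented out as suggested above:
    # model.generate(**inputs, max_new_tokens=max_new_tokens)

    start = time.perf_counter()
    output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    torch.xpu.synchronize()  # wait for XPU kernels to finish before stopping the timer
    return output, time.perf_counter() - start
```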
Thanks for debugging, that is a very useful discovery! I will follow your suggestion and retry. Thanks!
Hi @qing-xu-intel, you could refer to our optimized text-to-speech model examples on Intel GPU (e.g. SpeechT5 and Bark) for more information :)
Hi, I tried the workaround of removing all the warm-up code; however, inference latency is still very long. See the results below:
// with warm-up
// without warm-up
// whisper only
// chatglm only
// minicpm only
// sd only
Whisper latency without warm-up (1.82 s) is roughly 2.5× the whisper-only latency (0.71 s), i.e. 1.82 / 0.71 ≈ 2.6.
https://github.com/intel-analytics/ipex-llm/blob/main/docs/mddocs/Quickstart/install_linux_gpu.md
https://github.com/intel-analytics/ipex-llm/tree/main/python/llm/example/GPU/HuggingFace/LLM/chatglm3
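For context, the linked ChatGLM3 example loads the model through ipex-llm's transformers-style API roughly like this (a sketch based on the ipex-llm HuggingFace GPU examples; the exact model id and arguments are assumptions and may differ from the linked code):

```python
import torch
from transformers import AutoTokenizer
from ipex_llm.transformers import AutoModel  # ipex-llm drop-in for transformers' AutoModel

model_path = "THUDM/chatglm3-6b"  # model id assumed from the example's name

# Load with 4-bit weight-only quantization and move the model to the Arc GPU.
model = AutoModel.from_pretrained(model_path,
                                  load_in_4bit=True,
                                  trust_remote_code=True)
model = model.half().to("xpu")

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

with torch.inference_mode():
    inputs = tokenizer("What is AI?", return_tensors="pt").to("xpu")
    output = model.generate(**inputs, max_new_tokens=32)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
```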
If more than one model is loaded, the inference latency increases:
llm infer 1.22 s
wsp infer 1.01 s
llm infer 2.07 s
cpm infer 2.97 s
sd infer 0.74 s
wsp infer 1.93 s
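One way to sanity-check these numbers is to time each model with explicit device synchronization, so kernels queued by a previously used model do not bleed into the next measurement. This is a sketch, assuming the models are already loaded on the XPU and torch.xpu is available via intel-extension-for-pytorch:

```python
import time
import torch

def measure_latency(run_inference, label, iterations=5):
    """Average the latency of an inference callable, synchronizing the XPU around each run."""
    latencies = []
    for _ in range(iterations):
        torch.xpu.synchronize()   # make sure work from any previously used model is finished
        start = time.perf_counter()
        run_inference()
        torch.xpu.synchronize()   # wait for this model's kernels before stopping the timer
        latencies.append(time.perf_counter() - start)
    print(f"{label}: {sum(latencies) / len(latencies):.2f} s average over {iterations} runs")

# Example usage (the lambdas stand in for the llm / wsp / cpm / sd calls measured above):
# measure_latency(lambda: llm_model.generate(**llm_inputs, max_new_tokens=32), "llm infer")
# measure_latency(lambda: whisper_model.generate(**wsp_inputs), "wsp infer")
```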