-
Notifications
You must be signed in to change notification settings - Fork 197
/
benchmark_genai.cpp
74 lines (62 loc) · 3.4 KB
/
benchmark_genai.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#include "openvino/genai/llm_pipeline.hpp"
#include <cxxopts.hpp>
int main(int argc, char* argv[]) try {
cxxopts::Options options("benchmark_vanilla_genai", "Help command");
options.add_options()
("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value("."))
("p,prompt", "Prompt", cxxopts::value<std::string>()->default_value("The Sky is blue because"))
("nw,num_warmup", "Number of warmup iterations", cxxopts::value<size_t>()->default_value(std::to_string(1)))
("n,num_iter", "Number of iterations", cxxopts::value<size_t>()->default_value(std::to_string(3)))
("mt,max_new_tokens", "Maximal number of new tokens", cxxopts::value<size_t>()->default_value(std::to_string(20)))
("d,device", "device", cxxopts::value<std::string>()->default_value("CPU"))
("h,help", "Print usage");
cxxopts::ParseResult result;
try {
result = options.parse(argc, argv);
} catch (const cxxopts::exceptions::exception& e) {
std::cout << e.what() << "\n\n";
std::cout << options.help() << std::endl;
return EXIT_FAILURE;
}
if (result.count("help")) {
std::cout << options.help() << std::endl;
return EXIT_SUCCESS;
}
std::string prompt = result["prompt"].as<std::string>();
const std::string models_path = result["model"].as<std::string>();
std::string device = result["device"].as<std::string>();
size_t num_warmup = result["num_warmup"].as<size_t>();
size_t num_iter = result["num_iter"].as<size_t>();
ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();
ov::genai::LLMPipeline pipe(models_path, device);
for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, config);
ov::genai::DecodedResults res = pipe.generate(prompt, config);
ov::genai::PerfMetrics metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
res = pipe.generate(prompt, config);
metrics = metrics + res.perf_metrics;
}
std::cout << std::fixed << std::setprecision(2);
std::cout << "Load time: " << metrics.get_load_time() << " ms" << std::endl;
std::cout << "Generate time: " << metrics.get_generate_duration().mean << " ± " << metrics.get_generate_duration().std << " ms" << std::endl;
std::cout << "Tokenization time: " << metrics.get_tokenization_duration().mean << " ± " << metrics.get_tokenization_duration().std << " ms" << std::endl;
std::cout << "Detokenization time: " << metrics.get_detokenization_duration().mean << " ± " << metrics.get_detokenization_duration().std << " ms" << std::endl;
std::cout << "TTFT: " << metrics.get_ttft().mean << " ± " << metrics.get_ttft().std << " ms" << std::endl;
std::cout << "TPOT: " << metrics.get_tpot().mean << " ± " << metrics.get_tpot().std << " ms/token " << std::endl;
std::cout << "Throughput: " << metrics.get_throughput().mean << " ± " << metrics.get_throughput().std << " tokens/s" << std::endl;
return 0;
} catch (const std::exception& error) {
try {
std::cerr << error.what() << '\n';
} catch (const std::ios_base::failure&) {}
return EXIT_FAILURE;
} catch (...) {
try {
std::cerr << "Non-exception object thrown\n";
} catch (const std::ios_base::failure&) {}
return EXIT_FAILURE;
}