diff --git a/core/_core.py b/core/_core.py
index a6fefa448..f9306db4b 100644
--- a/core/_core.py
+++ b/core/_core.py
@@ -27,7 +27,7 @@
 lib = cdll.LoadLibrary(str(core_dll_path))
 
 # Function type definitions
-lib.initialize.argtypes = (c_bool, c_int)
+lib.initialize.argtypes = (c_bool, c_int, c_bool)
 lib.initialize.restype = c_bool
 
 lib.finalize.argtypes = ()
@@ -52,8 +52,8 @@
 
 
 # Wrapper functions
-def initialize(use_gpu: bool, cpu_num_threads=0):
-    success = lib.initialize(use_gpu, cpu_num_threads)
+def initialize(use_gpu: bool, cpu_num_threads=0, load_all_models=True):
+    success = lib.initialize(use_gpu, cpu_num_threads, load_all_models)
     if not success:
         raise Exception(lib.last_error_message().decode())
 
diff --git a/core/src/core.cpp b/core/src/core.cpp
index ee0315714..e7b7cc4d8 100644
--- a/core/src/core.cpp
+++ b/core/src/core.cpp
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include <optional>
 #include 
 #include 
 
@@ -19,9 +20,7 @@
 #include "core.h"
 
 #define NOT_INITIALIZED_ERR "Call initialize() first."
-#define NOT_FOUND_ERR "No such file or directory: "
-#define FAILED_TO_OPEN_MODEL_ERR "Unable to open model files."
-#define FAILED_TO_OPEN_METAS_ERR "Unable to open metas.json."
+#define NOT_LOADED_ERR "Model is not loaded."
 #define ONNX_ERR "ONNX raise exception: "
 #define JSON_ERR "JSON parser raise exception: "
 #define GPU_NOT_SUPPORTED_ERR "This library is CPU version. GPU is not supported."
@@ -43,13 +42,19 @@
 EMBED_DECL(YUKARIN_S);
 EMBED_DECL(YUKARIN_SA);
 EMBED_DECL(DECODE);
 
-const struct {
+/**
+ * Bundle of the three kinds of models
+ */
+struct VVMODEL {
   embed::EMBED_RES (*YUKARIN_S)();
   embed::EMBED_RES (*YUKARIN_SA)();
   embed::EMBED_RES (*DECODE)();
-} MODELS_LIST[] = {{YUKARIN_S, YUKARIN_SA, DECODE}};
+};
+const VVMODEL VVMODEL_LIST[] = {
+    {YUKARIN_S, YUKARIN_SA, DECODE},
+};
 }  // namespace EMBED_DECL_NAMESPACE
-using EMBED_DECL_NAMESPACE::MODELS_LIST;
+using EMBED_DECL_NAMESPACE::VVMODEL_LIST;
 
 // speaker_id mapping for when there are multiple models
 // {original speaker_id: {model number, new speaker_id}}
@@ -76,8 +81,23 @@ SupportedDevices get_supported_devices() {
 }
 
 struct Status {
-  Status(bool use_gpu_)
-      : use_gpu(use_gpu_), memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU)) {}
+  Status(int model_count, bool use_gpu, int cpu_num_threads)
+      : memory_info(Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU)) {
+    yukarin_s_list = std::vector<std::optional<Ort::Session>>(model_count);
+    yukarin_sa_list = std::vector<std::optional<Ort::Session>>(model_count);
+    decode_list = std::vector<std::optional<Ort::Session>>(model_count);
+
+    session_options.SetInterOpNumThreads(cpu_num_threads).SetIntraOpNumThreads(cpu_num_threads);
+    if (use_gpu) {
+#ifdef DIRECTML
+      session_options.DisableMemPattern().SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(session_options, 0));
+#else
+      const OrtCUDAProviderOptions cuda_options;
+      session_options.AppendExecutionProvider_CUDA(cuda_options);
+#endif
+    }
+  }
   /**
    * Loads the metas.json.
    *
@@ -89,7 +109,7 @@ struct Status {
    * version: string
    * }]
    */
-  bool load(int cpu_num_threads) {
+  bool load_metas() {
     embed::Resource metas_file = METAS();
 
     metas = nlohmann::json::parse(metas_file.data, metas_file.data + metas_file.size);
@@ -100,36 +120,32 @@
         supported_styles.insert(style["id"].get<int64_t>());
       }
     }
+    return true;
+  }
 
-    for (const auto MODELS : MODELS_LIST) {
-      embed::Resource yukarin_s_model = MODELS.YUKARIN_S();
-      embed::Resource yukarin_sa_model = MODELS.YUKARIN_SA();
-      embed::Resource decode_model = MODELS.DECODE();
-
-      Ort::SessionOptions session_options;
-      session_options.SetInterOpNumThreads(cpu_num_threads).SetIntraOpNumThreads(cpu_num_threads);
-      yukarin_s_list.push_back(Ort::Session(env, yukarin_s_model.data, yukarin_s_model.size, session_options));
-      yukarin_sa_list.push_back(Ort::Session(env, yukarin_sa_model.data, yukarin_sa_model.size, session_options));
-      if (use_gpu) {
-#ifdef DIRECTML
-        session_options.DisableMemPattern().SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);
-        Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_DML(session_options, 0));
-#else
-        const OrtCUDAProviderOptions cuda_options;
-        session_options.AppendExecutionProvider_CUDA(cuda_options);
-#endif
-      }
-      decode_list.push_back(Ort::Session(env, decode_model.data, decode_model.size, session_options));
-    }
+  /**
+   * Load a model
+   */
+  bool load_model(int model_index) {
+    const auto VVMODEL = VVMODEL_LIST[model_index];
+    embed::Resource yukarin_s_model = VVMODEL.YUKARIN_S();
+    embed::Resource yukarin_sa_model = VVMODEL.YUKARIN_SA();
+    embed::Resource decode_model = VVMODEL.DECODE();
+
+    yukarin_s_list[model_index] =
+        std::move(Ort::Session(env, yukarin_s_model.data, yukarin_s_model.size, session_options));
+    yukarin_sa_list[model_index] =
+        std::move(Ort::Session(env, yukarin_sa_model.data, yukarin_sa_model.size, session_options));
+    decode_list[model_index] = std::move(Ort::Session(env, decode_model.data, decode_model.size, session_options));
     return true;
   }
 
   std::string root_dir_path;
-  bool use_gpu;
+  Ort::SessionOptions session_options;
   Ort::MemoryInfo memory_info;
   Ort::Env env{ORT_LOGGING_LEVEL_ERROR};
-  std::vector<Ort::Session> yukarin_s_list, yukarin_sa_list, decode_list;
+  std::vector<std::optional<Ort::Session>> yukarin_s_list, yukarin_sa_list, decode_list;
 
   nlohmann::json metas;
   std::string metas_str;
 
@@ -166,7 +182,7 @@ std::pair<int64_t, int64_t> get_model_index_and_speaker_id(int64_t speaker_id) {
   return found->second;
 }
 
-bool initialize(bool use_gpu, int cpu_num_threads) {
+bool initialize(bool use_gpu, int cpu_num_threads, bool load_all_models) {
   initialized = false;
 
 #ifdef DIRECTML
@@ -178,18 +194,29 @@
     return false;
   }
   try {
-    status = std::make_unique<Status>(use_gpu);
-    if (!status->load(cpu_num_threads)) {
+    const int model_count = std::size(VVMODEL_LIST);
+    status = std::make_unique<Status>(model_count, use_gpu, cpu_num_threads);
+    if (!status->load_metas()) {
       return false;
     }
-    if (use_gpu) {
-      // Run the model once to make the GPU allocate enough memory
-      int length = 500;
-      int phoneme_size = 45;
-      std::vector<float> phoneme(length * phoneme_size), f0(length);
-      int64_t speaker_id = 0;
-      std::vector<float> output(length * 256);
-      decode_forward(length, phoneme_size, f0.data(), phoneme.data(), &speaker_id, output.data());
+
+    if (load_all_models) {
+      for (int model_index = 0; model_index < model_count; model_index++) {
+        if (!status->load_model(model_index)) {
+          return false;
+        }
+      }
+
+      if (use_gpu) {
+        // Run the model once to make the GPU allocate enough memory
+        // TODO: do this for every model
+        int length = 500;
+        int phoneme_size = 45;
+        std::vector<float> phoneme(length * phoneme_size), f0(length);
+        int64_t speaker_id = 0;
+        std::vector<float> output(length * 256);
+        decode_forward(length, phoneme_size, f0.data(), phoneme.data(), &speaker_id, output.data());
+      }
     }
   } catch (const Ort::Exception &e) {
     error_message = ONNX_ERR;
@@ -208,6 +235,17 @@
   return true;
 }
 
+bool load_model(int64_t speaker_id) {
+  auto [model_index, _] = get_model_index_and_speaker_id(speaker_id);
+  return status->load_model(model_index);
+}
+
+bool is_model_loaded(int64_t speaker_id) {
+  auto [model_index, _] = get_model_index_and_speaker_id(speaker_id);
+  return (status->yukarin_s_list[model_index].has_value() && status->yukarin_sa_list[model_index].has_value() &&
+          status->decode_list[model_index].has_value());
+}
+
 void finalize() {
   initialized = false;
   status.reset();
@@ -231,6 +269,11 @@ bool yukarin_s_forward(int64_t length, int64_t *phoneme_list, int64_t *speaker_i
     return false;
   }
   auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id);
+  auto &model = status->yukarin_s_list[model_index];
+  if (!model) {
+    error_message = NOT_LOADED_ERR;
+    return false;
+  }
   try {
     const char *inputs[] = {"phoneme_list", "speaker_id"};
     const char *outputs[] = {"phoneme_length"};
@@ -240,8 +283,8 @@
                                            to_tensor(&model_speaker_id, speaker_shape)};
     Ort::Value output_tensor = to_tensor(output, phoneme_shape);
 
-    status->yukarin_s_list[model_index].Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(),
-                                            input_tensors.size(), outputs, &output_tensor, 1);
+    model.value().Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(), input_tensors.size(), outputs,
+                      &output_tensor, 1);
 
     for (int64_t i = 0; i < length; i++) {
       if (output[i] < PHONEME_LENGTH_MINIMAL) output[i] = PHONEME_LENGTH_MINIMAL;
@@ -266,6 +309,11 @@ bool yukarin_sa_forward(int64_t length, int64_t *vowel_phoneme_list, int64_t *co
     return false;
   }
   auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id);
+  auto &model = status->yukarin_sa_list[model_index];
+  if (!model) {
+    error_message = NOT_LOADED_ERR;
+    return false;
+  }
   try {
     const char *inputs[] = {
         "length", "vowel_phoneme_list", "consonant_phoneme_list", "start_accent_list",
@@ -283,8 +331,8 @@
                                            to_tensor(&model_speaker_id, speaker_shape)};
     Ort::Value output_tensor = to_tensor(output, phoneme_shape);
 
-    status->yukarin_sa_list[model_index].Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(),
-                                             input_tensors.size(), outputs, &output_tensor, 1);
+    model.value().Run(Ort::RunOptions{nullptr}, inputs, input_tensors.data(), input_tensors.size(), outputs,
+                      &output_tensor, 1);
   } catch (const Ort::Exception &e) {
     error_message = ONNX_ERR;
     error_message += e.what();
@@ -346,6 +394,11 @@ bool decode_forward(int64_t length, int64_t phoneme_size, float *f0, float *phon
     return false;
   }
   auto [model_index, model_speaker_id] = get_model_index_and_speaker_id(*speaker_id);
+  auto &model = status->decode_list[model_index];
+  if (!model) {
+    error_message = NOT_LOADED_ERR;
+    return false;
+  }
   try {
     // Includes a workaround to prevent the audio from being cut off
     // TODO: remove this padding logic once it is improved
@@ -381,8 +434,8 @@
     const char *inputs[] = {"f0", "phoneme", "speaker_id"};
     const char *outputs[] = {"wave"};
 
-    status->decode_list[model_index].Run(Ort::RunOptions{nullptr}, inputs, input_tensor.data(), input_tensor.size(),
-                                         outputs, &output_tensor, 1);
+    model.value().Run(Ort::RunOptions{nullptr}, inputs, input_tensor.data(), input_tensor.size(), outputs,
+                      &output_tensor, 1);
 
     // TODO: remove this copy logic once it is improved
     copy_output_with_padding_to_output(output_with_padding, output, padding_f0_size);
diff --git a/core/src/core.h b/core/src/core.h
index 15608dd1f..5c13e90fc 100644
--- a/core/src/core.h
+++ b/core/src/core.h
@@ -38,17 +38,41 @@ typedef enum {
  * @brief Performs initialization for speech synthesis. Initialization must happen first for the other functions to run correctly
  * @param use_gpu If true, initialize for GPU; if false, initialize for CPU
  * @param cpu_num_threads Sets the number of threads used for inference. If 0, half the number of logical cores, or the number of physical cores, is used
+ * @param load_all_models If true, load all models
  * @return true on success, false on failure
 * @detail
 * Can be run any number of times. It can also be re-run with a different use_gpu.
 * The other functions run according to the use_gpu of the last call.
 */
-VOICEVOX_CORE_API bool initialize(bool use_gpu, int cpu_num_threads
+VOICEVOX_CORE_API bool initialize(bool use_gpu,
+                                  int cpu_num_threads
 #ifdef __cplusplus
-                                  = 0
+                                      = 0
+#endif
+                                  ,
+                                  bool load_all_models
+#ifdef __cplusplus
+                                      = true
 #endif
 );
 
+/**
+ * Load a model
+ * @param speaker_id Speaker number
+ * @return true on success, false on failure
+ * @detail
+ * Speakers and models do not necessarily correspond one-to-one.
+ */
+VOICEVOX_CORE_API bool load_model(int64_t speaker_id);
+
+/**
+ * @fn
+ * Whether the model is already loaded
+ * @param speaker_id Speaker number
+ * @return true if loaded, false otherwise
+ */
+VOICEVOX_CORE_API bool is_model_loaded(int64_t speaker_id);
+
 /**
  * @fn
  * Perform finalization
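
For reviewers, a minimal usage sketch of the lazy-loading flow this diff introduces, from the Python side. It assumes `core/_core.py` also gains `load_model(speaker_id)` and `is_model_loaded(speaker_id)` wrappers mirroring the new C API in core.h (this diff only shows the updated `initialize` wrapper) and that the module is importable as `core._core`; `speaker_id = 0` is illustrative.

```python
# Hypothetical usage sketch; the load_model()/is_model_loaded() wrappers are
# assumed to mirror the C API added in core.h and are not shown in this diff.
from core import _core as core

# Initialize without loading any model up front.
core.initialize(use_gpu=False, cpu_num_threads=0, load_all_models=False)

speaker_id = 0  # illustrative speaker number
if not core.is_model_loaded(speaker_id):
    # Speakers and models are not necessarily 1:1; this loads the whole
    # VVMODEL (yukarin_s, yukarin_sa, decode) that serves this speaker.
    core.load_model(speaker_id)

# yukarin_s_forward / yukarin_sa_forward / decode_forward can now run for
# this speaker; without the load they fail with "Model is not loaded."
core.finalize()
```

Keeping `load_all_models=True` as the default preserves the previous behavior for existing callers, while memory-constrained callers can opt in to per-speaker loading.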