From a54479c99e8f75f38062154ae6acfe0742462873 Mon Sep 17 00:00:00 2001 From: Marco Randazzo Date: Tue, 23 Mar 2021 15:16:26 +0100 Subject: [PATCH 1/2] googleSynthesis can now stream audio to an external player --- .../modules/googleSynthesis/main.cpp | 220 +++++++++++------- 1 file changed, 134 insertions(+), 86 deletions(-) diff --git a/speechInteraction/modules/googleSynthesis/main.cpp b/speechInteraction/modules/googleSynthesis/main.cpp index d4babe9..37d1d79 100644 --- a/speechInteraction/modules/googleSynthesis/main.cpp +++ b/speechInteraction/modules/googleSynthesis/main.cpp @@ -45,7 +45,8 @@ using namespace google::cloud::language::v1; using namespace google::cloud::texttospeech::v1; -bool is_changed; +bool is_changed; +using namespace std; static const std::map status_code_to_string { @@ -79,17 +80,27 @@ class Processing : public yarp::os::BufferedPort double pitch; yarp::os::RpcServer handlerPort; yarp::os::Port syncPort; - + yarp::os::Port soundOutputPort; + enum playbackmode_t + { + playFromDisk=0, + sendToPort_compressed=1, + sendToPort_uncompressed=2 + } playbackmode; + public: /********************************************************/ - Processing( const std::string &moduleName, const std::string &language, const std::string &voice, const double &speed, const double &pitch, std::string &state ): state(state) + Processing( const std::string &moduleName, const std::string &language, const std::string &voice, const double &speed, const double &pitch, std::string &state, string playmode ): state(state) { this->moduleName = moduleName; this->language = language; this->voice = voice; this->speed = speed; this->pitch = pitch; + if (playmode=="playFromDisk") {playbackmode=playbackmode_t::playFromDisk;} + else if (playmode=="sendToPort_compressed") {playbackmode=playbackmode_t::sendToPort_compressed;} + else if (playmode=="sendToPort_uncompressed") {playbackmode=playbackmode_t::sendToPort_uncompressed;} } 
/********************************************************/ @@ -104,7 +115,13 @@ class Processing : public yarp::os::BufferedPort this->useCallback(); yarp::os::BufferedPort::open( "/" + moduleName + "/text:i" ); syncPort.open( "/" + moduleName + "/sync:o" ); - + + if (playbackmode==playbackmode_t::sendToPort_compressed || + playbackmode==playbackmode_t::sendToPort_uncompressed) + { + soundOutputPort.open("/"+moduleName+"/sound:o"); + } + return true; } @@ -113,6 +130,11 @@ class Processing : public yarp::os::BufferedPort { yarp::os::BufferedPort::close(); syncPort.close(); + if (playbackmode==playbackmode_t::sendToPort_compressed || + playbackmode==playbackmode_t::sendToPort_uncompressed) + { + soundOutputPort.close(); + } } /********************************************************/ @@ -130,81 +152,105 @@ class Processing : public yarp::os::BufferedPort sendDone(); } - /********************************************************/ - void queryGoogleSynthesis(yarp::os::Bottle& text) - { - yDebug() << "in queryGoogleSynthesis"; - - yDebug() << "Phrase is " << text.toString().c_str(); - - std::string tmp = text.toString(); - - tmp.erase(std::remove(tmp.begin(),tmp.end(),'\"'),tmp.end()); - - yDebug() << "Phrase is now " << tmp.c_str(); - - std::string content = tmp; - - if (content.size()>0){ - SynthesizeSpeechRequest request; - SynthesizeSpeechResponse response; - - grpc::Status status; - grpc::ClientContext context; - - auto creds = grpc::GoogleDefaultCredentials(); - auto channel = grpc::CreateChannel("texttospeech.googleapis.com", creds); - std::unique_ptr tts(TextToSpeech::NewStub(channel)); - - AudioConfig audio_config; - VoiceSelectionParams params; - - SynthesisInput input; - input.set_text(content); - - audio_config.set_audio_encoding(MP3); - params.set_language_code(language); - params.set_ssml_gender(NEUTRAL); - params.set_name(voice); - audio_config.set_speaking_rate(speed); - audio_config.set_pitch(pitch); - - request.set_allocated_input(&input); - 
request.set_allocated_voice(&params); - request.set_allocated_audio_config(&audio_config); - - checkState("Busy"); - yarp::os::Time::delay(0.2); - grpc::Status tts_status = tts->SynthesizeSpeech(&context, request, &response); - std::string status_string = status_code_to_string.at(tts_status.error_code()); - yInfo() << "Status string:" << status_string; - checkState("Done"); - if ( tts_status.ok() ) { - yInfo() << "Status returned OK"; - yInfo() << "\n------Response------\n"; - - std::string file = "test.mp3"; - std::ofstream mp3File(file, std::ios::out | std::ios::binary); - - mp3File.write( response.audio_content().data(), response.audio_content().size()); - - std::string command = "play test.mp3";// + file; - - system(command.c_str()); - - } - else { - yError() << "Status Returned Cancelled"; - checkState("Failure_" + status_string); - yInfo() << tts_status.error_message(); - } - request.release_input(); - request.release_voice(); - request.release_audio_config(); - - yInfo() << "\n------finished google query------\n"; + /********************************************************/ + void queryGoogleSynthesis(yarp::os::Bottle& text) + { + yDebug() << "in queryGoogleSynthesis"; + + yDebug() << "Phrase is " << text.toString().c_str(); + + std::string tmp = text.toString(); + + tmp.erase(std::remove(tmp.begin(),tmp.end(),'\"'),tmp.end()); + + yDebug() << "Phrase is now " << tmp.c_str(); + + std::string content = tmp; + + if (content.size()>0) + { + SynthesizeSpeechRequest request; + SynthesizeSpeechResponse response; + + grpc::Status status; + grpc::ClientContext context; + + auto creds = grpc::GoogleDefaultCredentials(); + auto channel = grpc::CreateChannel("texttospeech.googleapis.com", creds); + std::unique_ptr tts(TextToSpeech::NewStub(channel)); + + AudioConfig audio_config; + VoiceSelectionParams params; + + SynthesisInput input; + input.set_text(content); + + audio_config.set_audio_encoding(MP3); + params.set_language_code(language); + 
params.set_ssml_gender(NEUTRAL); + params.set_name(voice); + audio_config.set_speaking_rate(speed); + audio_config.set_pitch(pitch); + + request.set_allocated_input(&input); + request.set_allocated_voice(&params); + request.set_allocated_audio_config(&audio_config); + + checkState("Busy"); + yarp::os::Time::delay(0.2); + grpc::Status tts_status = tts->SynthesizeSpeech(&context, request, &response); + std::string status_string = status_code_to_string.at(tts_status.error_code()); + yInfo() << "Status string:" << status_string; + checkState("Done"); + + if ( tts_status.ok() ) + { + yInfo() << "Status returned OK"; + yInfo() << "\n------Response------\n"; + + if (playbackmode==playbackmode_t::playFromDisk) + { + std::string file = "test.mp3"; + std::ofstream mp3File(file, std::ios::out | std::ios::binary); + + mp3File.write( response.audio_content().data(), response.audio_content().size()); + + std::string command = "play test.mp3";// + file; + + system(command.c_str()); + } + else if (playbackmode==playbackmode_t::sendToPort_compressed) + { + yarp::os::Value v (response.audio_content().data(), response.audio_content().size()); + yarp::os::Bottle bot; bot.add(v); + soundOutputPort.write(bot); + } + else if (playbackmode==playbackmode_t::sendToPort_uncompressed) + { + yarp::sig::Sound snd; + yarp::sig::file::read_bytestream(snd, response.audio_content().data(), response.audio_content().size(), ".mp3"); + soundOutputPort.write(snd); + } + else + { + yError() << "Invalid playbackmode"; + } + } + else + { + yError() << "Status Returned Cancelled"; + checkState("Failure_" + status_string); + yInfo() << tts_status.error_message(); + } + + request.release_input(); + request.release_voice(); + request.release_audio_config(); + + yInfo() << "\n------finished google query------\n"; } - else if (content.size()==0) { + else if (content.size()==0) + { checkState("Empty_input"); } } @@ -274,7 +320,7 @@ class Processing : public yarp::os::BufferedPort 
/********************************************************/ bool checkState(std::string new_state) - { + { if(new_state!=state){ is_changed=true; state=new_state; @@ -293,7 +339,7 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL yarp::os::RpcServer rpcPort; yarp::os::BufferedPort statePort; std::string state; - + Processing *processing; friend class processing; @@ -319,6 +365,8 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL double speed = rf.check("speed", yarp::os::Value(1.0), "speed to use (double)").asFloat64(); double pitch = rf.check("pitch", yarp::os::Value(0.0), "pitch to use (double)").asFloat64(); + string playmode_string = rf.check("playbackmode", yarp::os::Value("playFromDisk"), "can be one of the following: `playFromDisk`(default), `sendToPort_compressed`, `sendToPort_uncompressed`").asString(); + if (rf.check("languageCodes", "Getting language codes")) { yarp::os::Bottle &grp=rf.findGroup("languageCodes"); @@ -344,7 +392,7 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL closing = false; - processing = new Processing( moduleName, language, voice, speed, pitch, state ); + processing = new Processing( moduleName, language, voice, speed, pitch, state, playmode_string); /* now start the thread to do the work */ processing->open(); @@ -359,9 +407,9 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL /************************************************************************/ bool attach(yarp::os::RpcServer &source) - { + { return this->yarp().attachAsServer(source); - } + } /**********************************************************/ bool close() @@ -471,12 +519,12 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL { if(is_changed){ is_changed=false; - yarp::os::Bottle &outTargets = statePort.prepare(); - outTargets.clear(); + yarp::os::Bottle &outTargets = statePort.prepare(); + outTargets.clear(); outTargets.addString(state); yDebug() << 
"outTarget:" << outTargets.toString().c_str(); statePort.write(); - } + } return !closing; } }; From 6d20fb92494dc827d4665640772ec3a3f2ee6739 Mon Sep 17 00:00:00 2001 From: DatSpace Date: Wed, 2 Feb 2022 13:42:39 +0000 Subject: [PATCH 2/2] Added metadata to google speech2text --- speechInteraction/modules/googleSpeech/main.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/speechInteraction/modules/googleSpeech/main.cpp b/speechInteraction/modules/googleSpeech/main.cpp index ff6fe8b..f02ebd1 100644 --- a/speechInteraction/modules/googleSpeech/main.cpp +++ b/speechInteraction/modules/googleSpeech/main.cpp @@ -325,6 +325,14 @@ class Processing : public yarp::os::TypedReaderCallback config->set_language_code(language.c_str()); config->set_sample_rate_hertz(sample_rate); config->set_encoding(RecognitionConfig::LINEAR16); + + config->set_use_enhanced(true); // Can be used with the correct model. If true but no model specified, it does nothing. + auto metadata = config->mutable_metadata(); + + metadata->set_microphone_distance(google::cloud::speech::v1::RecognitionMetadata_MicrophoneDistance_MIDFIELD); + metadata->set_recording_device_type(google::cloud::speech::v1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE); + metadata->set_interaction_type(google::cloud::speech::v1::RecognitionMetadata_InteractionType_VOICE_COMMAND); + metadata->set_original_media_type(google::cloud::speech::v1::RecognitionMetadata_OriginalMediaType_AUDIO); } /********************************************************/