From a54479c99e8f75f38062154ae6acfe0742462873 Mon Sep 17 00:00:00 2001
From: Marco Randazzo <marco.randazzo@iit.it>
Date: Tue, 23 Mar 2021 15:16:26 +0100
Subject: [PATCH 1/2] googleSynthesis can now stream audio to an external
 player

---
 .../modules/googleSynthesis/main.cpp          | 220 +++++++++++-------
 1 file changed, 134 insertions(+), 86 deletions(-)

diff --git a/speechInteraction/modules/googleSynthesis/main.cpp b/speechInteraction/modules/googleSynthesis/main.cpp
index d4babe9..37d1d79 100644
--- a/speechInteraction/modules/googleSynthesis/main.cpp
+++ b/speechInteraction/modules/googleSynthesis/main.cpp
@@ -45,7 +45,8 @@
 
 using namespace google::cloud::language::v1;
 using namespace google::cloud::texttospeech::v1;
-bool is_changed;
+bool  is_changed;
+using namespace std;
 
 
 static const std::map<grpc::StatusCode, std::string> status_code_to_string {
@@ -79,17 +80,27 @@ class Processing : public yarp::os::BufferedPort<yarp::os::Bottle>
     double pitch;
     yarp::os::RpcServer handlerPort;
     yarp::os::Port syncPort;
-
+    yarp::os::Port soundOutputPort;
+    enum playbackmode_t  // selects how queryGoogleSynthesis() delivers the synthesized audio
+    {
+        playFromDisk=0,             // write the MP3 to "test.mp3" and run the external `play` command on it
+        sendToPort_compressed=1,    // wrap the MP3 bytes in a yarp::os::Bottle and write it on soundOutputPort
+        sendToPort_uncompressed=2   // decode the MP3 into a yarp::sig::Sound and write it on soundOutputPort
+    } playbackmode;
+    
 public:
     /********************************************************/
 
-    Processing( const std::string &moduleName, const std::string &language, const std::string &voice, const double &speed, const double &pitch, std::string &state ): state(state)
+    Processing( const std::string &moduleName, const std::string &language, const std::string &voice, const double &speed, const double &pitch, std::string &state, const std::string &playmode ): state(state)
     {
         this->moduleName = moduleName;
         this->language = language;
         this->voice = voice;
         this->speed = speed;
         this->pitch = pitch;
+        if      (playmode=="sendToPort_compressed")    {playbackmode=playbackmode_t::sendToPort_compressed;}
+        else if (playmode=="sendToPort_uncompressed")  {playbackmode=playbackmode_t::sendToPort_uncompressed;}
+        else    {playbackmode=playbackmode_t::playFromDisk; if (playmode!="playFromDisk") {yWarning() << "Unknown playbackmode" << playmode << "- falling back to playFromDisk";}} // default branch: playbackmode must never stay uninitialized
     }
 
     /********************************************************/
@@ -104,7 +115,13 @@ class Processing : public yarp::os::BufferedPort<yarp::os::Bottle>
         this->useCallback();
         yarp::os::BufferedPort<yarp::os::Bottle >::open( "/" + moduleName + "/text:i" );
         syncPort.open( "/" + moduleName + "/sync:o" );
-
+        
+        if (playbackmode==playbackmode_t::sendToPort_compressed ||
+            playbackmode==playbackmode_t::sendToPort_uncompressed)
+        {
+            soundOutputPort.open("/"+moduleName+"/sound:o");
+        }
+        
         return true;
     }
 
@@ -113,6 +130,11 @@ class Processing : public yarp::os::BufferedPort<yarp::os::Bottle>
     {
         yarp::os::BufferedPort<yarp::os::Bottle >::close();
         syncPort.close();
+        if (playbackmode==playbackmode_t::sendToPort_compressed ||
+            playbackmode==playbackmode_t::sendToPort_uncompressed)
+        {
+            soundOutputPort.close();
+        }
     }
 
     /********************************************************/
@@ -130,81 +152,105 @@ class Processing : public yarp::os::BufferedPort<yarp::os::Bottle>
         sendDone();
     }
 
-    /********************************************************/
-    void queryGoogleSynthesis(yarp::os::Bottle& text)
-    {
-        yDebug() << "in queryGoogleSynthesis";
-
-        yDebug() << "Phrase is " << text.toString().c_str();
-
-        std::string tmp = text.toString();
-
-        tmp.erase(std::remove(tmp.begin(),tmp.end(),'\"'),tmp.end());
-
-        yDebug() << "Phrase is now " << tmp.c_str();
-
-        std::string content = tmp;
-
-        if (content.size()>0){
-            SynthesizeSpeechRequest request;
-            SynthesizeSpeechResponse response;
-
-            grpc::Status status;
-            grpc::ClientContext context;
-
-            auto creds = grpc::GoogleDefaultCredentials();
-            auto channel = grpc::CreateChannel("texttospeech.googleapis.com", creds);
-            std::unique_ptr<TextToSpeech::Stub> tts(TextToSpeech::NewStub(channel));
-
-            AudioConfig audio_config;
-            VoiceSelectionParams params;
-
-            SynthesisInput input;
-            input.set_text(content);
-
-            audio_config.set_audio_encoding(MP3);
-            params.set_language_code(language);
-            params.set_ssml_gender(NEUTRAL);
-            params.set_name(voice);
-            audio_config.set_speaking_rate(speed);
-            audio_config.set_pitch(pitch);
-
-            request.set_allocated_input(&input);
-            request.set_allocated_voice(&params);
-            request.set_allocated_audio_config(&audio_config);
-
-            checkState("Busy");
-            yarp::os::Time::delay(0.2);
-            grpc::Status tts_status = tts->SynthesizeSpeech(&context, request, &response);
-            std::string status_string = status_code_to_string.at(tts_status.error_code());
-            yInfo() << "Status string:" << status_string;
-            checkState("Done");
-            if ( tts_status.ok() ) {
-                yInfo() << "Status returned OK";
-                yInfo() << "\n------Response------\n";
-
-                std::string file = "test.mp3";
-                std::ofstream mp3File(file, std::ios::out | std::ios::binary);
-
-                mp3File.write( response.audio_content().data(), response.audio_content().size());
-
-                std::string command = "play test.mp3";// + file;
-
-                system(command.c_str());    
-
-            } 
-            else {
-                yError() << "Status Returned Cancelled";
-                checkState("Failure_" + status_string); 
-                yInfo() << tts_status.error_message();
-            }
-            request.release_input();
-            request.release_voice();
-            request.release_audio_config();
-
-            yInfo() << "\n------finished google query------\n";
+    /********************************************************/
+    // Sends the phrase in `text` to Google Cloud Text-to-Speech and, on success,
+    // hands the returned MP3 to the backend selected by playbackmode.
+    void queryGoogleSynthesis(yarp::os::Bottle& text)
+    {
+        yDebug() << "in queryGoogleSynthesis";
+        yDebug() << "Phrase is " << text.toString().c_str();
+        // Drop every double-quote character from the serialized bottle.
+        std::string content = text.toString();
+        content.erase(std::remove(content.begin(),content.end(),'\"'),content.end());
+        yDebug() << "Phrase is now " << content.c_str();
+
+        if (content.size()>0)
+        {
+            SynthesizeSpeechRequest request;
+            SynthesizeSpeechResponse response;
+            grpc::ClientContext context;
+
+            // Channel and stub authenticated with the Google default credentials.
+            auto creds = grpc::GoogleDefaultCredentials();
+            auto channel = grpc::CreateChannel("texttospeech.googleapis.com", creds);
+            std::unique_ptr<TextToSpeech::Stub> tts(TextToSpeech::NewStub(channel));
+
+            AudioConfig audio_config;
+            VoiceSelectionParams params;
+            SynthesisInput input;
+            input.set_text(content);
+            audio_config.set_audio_encoding(MP3);
+            params.set_language_code(language);
+            params.set_ssml_gender(NEUTRAL);
+            params.set_name(voice);
+            audio_config.set_speaking_rate(speed);
+            audio_config.set_pitch(pitch);
+
+            // The sub-messages live on the stack: they are lent to the request here
+            // and taken back with release_*() below so the request never owns them.
+            request.set_allocated_input(&input);
+            request.set_allocated_voice(&params);
+            request.set_allocated_audio_config(&audio_config);
+
+            checkState("Busy");
+            yarp::os::Time::delay(0.2);
+            grpc::Status tts_status = tts->SynthesizeSpeech(&context, request, &response);
+            std::string status_string = status_code_to_string.at(tts_status.error_code());
+            yInfo() << "Status string:" << status_string;
+            checkState("Done");
+
+            if ( tts_status.ok() )
+            {
+                yInfo() << "Status returned OK";
+                yInfo() << "\n------Response------\n";
+                if (playbackmode==playbackmode_t::playFromDisk)
+                {
+                    std::string file = "test.mp3";
+                    std::ofstream mp3File(file, std::ios::out | std::ios::binary);
+                    mp3File.write( response.audio_content().data(), response.audio_content().size());
+                    // Flush to disk before the external player opens the file.
+                    mp3File.close();
+                    std::string command = "play test.mp3";// + file;
+                    system(command.c_str());
+                }
+                else if (playbackmode==playbackmode_t::sendToPort_compressed)
+                {
+                    // NOTE(review): a const char* here would select Value(std::string, bool) and cut the
+                    // MP3 at its first zero byte; force the (void*, int) blob constructor - confirm overloads.
+                    yarp::os::Value v (const_cast<char*>(response.audio_content().data()), static_cast<int>(response.audio_content().size()));
+                    yarp::os::Bottle bot; bot.add(v);
+                    soundOutputPort.write(bot);
+                }
+                else if (playbackmode==playbackmode_t::sendToPort_uncompressed)
+                {
+                    yarp::sig::Sound snd;
+                    if (yarp::sig::file::read_bytestream(snd, response.audio_content().data(), response.audio_content().size(), ".mp3"))
+                    {
+                        soundOutputPort.write(snd);
+                    }
+                    else
+                    {
+                        yError() << "Unable to decode the synthesized mp3 stream";
+                    }
+                }
+                else
+                {
+                    yError() << "Invalid playbackmode";
+                }
+            }
+            else
+            {
+                yError() << "Status Returned Cancelled";
+                checkState("Failure_" + status_string);
+                yInfo() << tts_status.error_message();
+            }
+            request.release_input();
+            request.release_voice();
+            request.release_audio_config();
+            yInfo() << "\n------finished google query------\n";
         }
-        else if (content.size()==0) {
+        else
+        {
             checkState("Empty_input");
         }
     }
@@ -274,7 +320,7 @@ class Processing : public yarp::os::BufferedPort<yarp::os::Bottle>
 
     /********************************************************/
     bool checkState(std::string new_state)
-    {
+    {   
         if(new_state!=state){
             is_changed=true;
             state=new_state;
@@ -293,7 +339,7 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL
     yarp::os::RpcServer         rpcPort;
     yarp::os::BufferedPort<yarp::os::Bottle> statePort;
     std::string state;
-     
+
     Processing                  *processing;
     friend class                processing;
 
@@ -319,6 +365,8 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL
         double speed = rf.check("speed", yarp::os::Value(1.0), "speed to use (double)").asFloat64();
         double pitch = rf.check("pitch", yarp::os::Value(0.0), "pitch to use (double)").asFloat64();
 
+        string playmode_string = rf.check("playbackmode", yarp::os::Value("playFromDisk"), "can be one of the following: `playFromDisk`(default), `sendToPort_compressed`, `sendToPort_uncompressed`").asString();
+
         if (rf.check("languageCodes", "Getting language codes"))
         {
             yarp::os::Bottle &grp=rf.findGroup("languageCodes");
@@ -344,7 +392,7 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL
 
         closing = false;
 
-        processing = new Processing( moduleName, language, voice, speed, pitch, state );
+        processing = new Processing( moduleName, language, voice, speed, pitch, state, playmode_string);
 
         /* now start the thread to do the work */
         processing->open();
@@ -359,9 +407,9 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL
 
     /************************************************************************/
     bool attach(yarp::os::RpcServer &source)
-    {
+    {   // serve the requests arriving on `source` through this module's yarp() wire
         return this->yarp().attachAsServer(source);
-    }
+    }
 
     /**********************************************************/
     bool close()
@@ -471,12 +519,12 @@ class Module : public yarp::os::RFModule, public googleSynthesis_IDL
     {
         if(is_changed){
             is_changed=false;
-            yarp::os::Bottle &outTargets = statePort.prepare();
-            outTargets.clear();
+            yarp::os::Bottle &outTargets = statePort.prepare();   
+            outTargets.clear();  
             outTargets.addString(state);
             yDebug() << "outTarget:" << outTargets.toString().c_str();
             statePort.write();
-        }
+        }  
         return !closing;
     }
 };

From 6d20fb92494dc827d4665640772ec3a3f2ee6739 Mon Sep 17 00:00:00 2001
From: DatSpace <konkarapas@outlook.com>
Date: Wed, 2 Feb 2022 13:42:39 +0000
Subject: [PATCH 2/2] Added metadata to google speech2text

---
 speechInteraction/modules/googleSpeech/main.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/speechInteraction/modules/googleSpeech/main.cpp b/speechInteraction/modules/googleSpeech/main.cpp
index ff6fe8b..f02ebd1 100644
--- a/speechInteraction/modules/googleSpeech/main.cpp
+++ b/speechInteraction/modules/googleSpeech/main.cpp
@@ -325,6 +325,14 @@ class Processing : public yarp::os::TypedReaderCallback<yarp::sig::Sound>
         config->set_language_code(language.c_str());
         config->set_sample_rate_hertz(sample_rate);
         config->set_encoding(RecognitionConfig::LINEAR16);
+
+        config->set_use_enhanced(true); // Can be used with the correct model. If true but no model specified, it does nothing.
+        auto metadata = config->mutable_metadata();
+
+        metadata->set_microphone_distance(google::cloud::speech::v1::RecognitionMetadata_MicrophoneDistance_MIDFIELD);
+        metadata->set_recording_device_type(google::cloud::speech::v1::RecognitionMetadata_RecordingDeviceType_OTHER_INDOOR_DEVICE);
+        metadata->set_interaction_type(google::cloud::speech::v1::RecognitionMetadata_InteractionType_VOICE_COMMAND);
+        metadata->set_original_media_type(google::cloud::speech::v1::RecognitionMetadata_OriginalMediaType_AUDIO);
     }
 
     /********************************************************/