Merge 0a37e81 into 13539d3

MycroftAI · Sep 6, 2021 · d1e916c · d1e916c
2 parents 13539d3 + 0a37e81
commit d1e916c
Show file tree

Hide file tree

Showing 3 changed files with 113 additions and 16 deletions.
diff --git a/mycroft/res/text/en-us/noise_words.list b/mycroft/res/text/en-us/noise_words.list
@@ -0,0 +1,30 @@
+where
+what's
+which
+them
+they
+when
+what
+that
+will
+from
+that
+also
+who
+how
+did
+and
+but
+the
+too
+why
+for
+is
+it
+do
+or
+to
+of
+a
+
+
diff --git a/mycroft/skills/common_query_skill.py b/mycroft/skills/common_query_skill.py
@@ -11,11 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import time
 
 from enum import IntEnum
 from abc import ABC, abstractmethod
 from .mycroft_skill import MycroftSkill
 
+from mycroft.configuration import Configuration
+from mycroft.util.file_utils import resolve_resource_file
+
 
 class CQSMatchLevel(IntEnum):
     EXACT = 1  # Skill could find a specific answer for the question
@@ -34,6 +38,20 @@ def is_CQSVisualMatchLevel(match_level):
 
 VISUAL_DEVICES = ['mycroft_mark_2']
 
+# Tuning constants for the confidence calculation.
+# how much each topic word is worth
+# when found in the answer
+TOPIC_MATCH_RELEVANCE = 5
+
+# elevate relevance above all else
+RELEVANCE_MULTIPLIER = 2
+
+# we like longer answers but only so much (cap on the counted words)
+MAX_ANSWER_LEN_FOR_CONFIDENCE = 50
+
+# higher number - smaller bonus for the answer's word count
+WORD_COUNT_DIVISOR = 100
+
 
 def handles_visuals(platform):
     return platform in VISUAL_DEVICES
@@ -49,8 +67,25 @@ class CommonQuerySkill(MycroftSkill, ABC):
     This class works in conjunction with skill-query which collects
     answers from several skills presenting the best one available.
     """
+
     def __init__(self, name=None, bus=None):
+        """Load the noise word list and the base confidence per level."""
         super().__init__(name, bus)
+        noise_words_filepath = "text/%s/noise_words.list" % (self.lang,)
+        noise_words_filename = resolve_resource_file(noise_words_filepath)
+        self.translated_noise_words = []
+        try:
+            # resolve_resource_file() gives None for an unresolved resource;
+            # open(None) raises TypeError, so treat that as missing too.
+            with open(noise_words_filename) as f:
+                self.translated_noise_words = f.read().split()
+        except (FileNotFoundError, TypeError):
+            self.log.warning("Missing noise_words.list file in res/text/lang")
+
+        # base confidence per match level (should probably be configurable)
+        self.level_confidence = {CQSMatchLevel.EXACT: 0.9,
+                                 CQSMatchLevel.CATEGORY: 0.6,
+                                 CQSMatchLevel.GENERAL: 0.5}
 
     def bind(self, bus):
         """Overrides the default bind method of MycroftSkill.
@@ -80,7 +115,8 @@ def __handle_question_query(self, message):
             level = result[1]
             answer = result[2]
             callback = result[3] if len(result) > 3 else None
-            confidence = self.__calc_confidence(match, search_phrase, level)
+            confidence = self.__calc_confidence(
+                match, search_phrase, level, answer)
             self.bus.emit(message.response({"phrase": search_phrase,
                                             "skill_id": self.skill_id,
                                             "answer": answer,
@@ -92,27 +128,58 @@ def __handle_question_query(self, message):
                                             "skill_id": self.skill_id,
                                             "searching": False}))
 
-    def __calc_confidence(self, match, phrase, level):
+    def remove_noise(self, phrase):
+        """Remove noise words to produce the essence of a question.
+
+        Words are matched whole and case-sensitively against
+        self.translated_noise_words; filtering token by token also drops
+        back-to-back repeats of a noise word (e.g. "the the").
+        """
+        noise_words = set(self.translated_noise_words)
+        return ' '.join(w for w in phrase.split() if w not in noise_words)
+
+    def __calc_confidence(self, match, phrase, level, answer):
+        """Return the confidence score for an answer (may exceed 1.0).
+
+        The score is the base confidence for ``level`` plus bonuses for
+        the share of ``phrase`` consumed by ``match``, visual support,
+        the number of sentences and words in ``answer``, and how many
+        topic words of ``match`` occur in ``answer``.
+
+        Raises KeyError when ``level`` is not in self.level_confidence.
+        """
         # Assume the more of the words that get consumed, the better the match
         consumed_pct = len(match.split()) / len(phrase.split())
         if consumed_pct > 1.0:
             consumed_pct = 1.0
+        consumed_pct /= 10
+
+        # bonus for more sentences
+        num_sentences = len(answer.split(".")) / 10
 
         # Add bonus if match has visuals and the device supports them.
-        platform = self.config_core.get('enclosure', {}).get('platform')
+        platform = self.config_core.get("enclosure", {}).get("platform")
+        bonus = 0.0
         if is_CQSVisualMatchLevel(level) and handles_visuals(platform):
             bonus = 0.1
-        else:
-            bonus = 0
-
-        if int(level) == int(CQSMatchLevel.EXACT):
-            return 0.9 + (consumed_pct / 10) + bonus
-        elif int(level) == int(CQSMatchLevel.CATEGORY):
-            return 0.6 + (consumed_pct / 10) + bonus
-        elif int(level) == int(CQSMatchLevel.GENERAL):
-            return 0.5 + (consumed_pct / 10) + bonus
-        else:
-            return 0.0  # should never happen
+
+        # Topic words are what remains of the match without noise words.
+        # split() yields no words for an empty topic, whereas split(' ')
+        # would yield [''] and '' is "found" in every answer.
+        answer = answer.lower()
+        topic_words = self.remove_noise(match).split()
+        matches = TOPIC_MATCH_RELEVANCE * sum(
+            1 for word in topic_words if word in answer)
+
+        # we like longer answers but only so much
+        answer_size = min(MAX_ANSWER_LEN_FOR_CONFIDENCE,
+                          len(answer.split(" ")))
+        # str.split(" ") never returns an empty list, so answer_size >= 1
+        relevance = matches / answer_size * RELEVANCE_MULTIPLIER
+        wc_mod = answer_size / WORD_COUNT_DIVISOR * 2
+
+        return (self.level_confidence[level] + consumed_pct + bonus +
+                num_sentences + relevance + wc_mod)
 
     def __handle_query_action(self, message):
         """Message handler for question:action.

diff --git a/test/unittests/skills/test_common_query_skill.py b/test/unittests/skills/test_common_query_skill.py
@@ -96,7 +96,7 @@ def test_successful_match_query_phrase(self):
                          'What\'s the meaning of life')
         self.assertEqual(response.data['skill_id'], self.skill.skill_id)
         self.assertEqual(response.data['answer'], '42')
-        self.assertEqual(response.data['conf'], 1.0)
+        self.assertEqual(response.data['conf'], 1.12)
 
     def test_successful_visual_match_query_phrase(self):
         self.skill.config_core['enclosure']['platform'] = 'mycroft_mark_2'
@@ -120,7 +120,7 @@ def test_successful_visual_match_query_phrase(self):
                          'What\'s the meaning of life')
         self.assertEqual(response.data['skill_id'], self.skill.skill_id)
         self.assertEqual(response.data['answer'], '42')
-        self.assertEqual(response.data['conf'], 1.1)
+        self.assertEqual(response.data['conf'], 1.2200000000000002)
 
 
 class CQSTest(CommonQuerySkill):