From 1cfc4c1014a80f2d8cd89a3aa166180c11a6804b Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji <orjiuchechukwu52@yahoo.com>
Date: Thu, 24 Oct 2024 14:31:02 +0100
Subject: [PATCH 1/2] use language threshold to compute zim language metadata

---
 CHANGELOG.md              |  1 +
 src/ted2zim/entrypoint.py | 10 ++++++++++
 src/ted2zim/scraper.py    | 18 +++++++++++++++++-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef2cfbc..aa6ad77 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Change log level of "Video at {url} has not yet been translated into {requested_lang_code}" messages from warning to debug (way too verbose)
 - Disable preloading of subtitles in video.js
+- Add `--language-threshold` CLI argument so that `compute_zim_languages` only considers languages that appear in at least the specified fraction of videos (a value greater than 0 and at most 1) (#212)
 
 ### Fixed
 
diff --git a/src/ted2zim/entrypoint.py b/src/ted2zim/entrypoint.py
index b1fd0ce..efb3ba8 100755
--- a/src/ted2zim/entrypoint.py
+++ b/src/ted2zim/entrypoint.py
@@ -175,6 +175,13 @@ def main():
         default=False,
     )
 
+    parser.add_argument(
+        "--language-threshold",
+        help="Consider languages present in at least percentage of videos",
+        default=0.5,
+        type=float,
+    )
+
     args = parser.parse_args()
     set_debug(args.debug)
     logger = get_logger()
@@ -201,6 +208,9 @@ def main():
         if not args.threads >= 1:
             parser.error("--threads must be provided a positive integer")
 
+        if not 0 < args.language_threshold <= 1:
+            parser.error("--language-threshold must be between 0 and 1.")
+
         scraper = Ted2Zim(**dict(args._get_kwargs()))
         scraper.run()
     except Exception as exc:
diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py
index 794d9ff..d601bc3 100644
--- a/src/ted2zim/scraper.py
+++ b/src/ted2zim/scraper.py
@@ -78,6 +78,7 @@ def __init__(
         tmp_dir,
         threads,
         disable_metadata_checks,
+        language_threshold,
     ):
         # video-encoding info
         self.video_format = video_format
@@ -98,6 +99,7 @@ def __init__(
         self.publisher = publisher
         self.name = name
         self.disable_metadata_checks = disable_metadata_checks
+        self.language_threshold = language_threshold
 
         if not self.disable_metadata_checks:
             # Validate ZIM metadata early so that we do not waste time doing operations
@@ -364,10 +366,15 @@ def compute_zim_languages(self):
         }
 
         # Attribute 10 "points" score to language in video audio and 1 "point" score
-        # to language in video subtitle
+        # to language in video subtitle if language is present in at least
+        # "threshold" percentage of videos.
         scored_languages = {
             k: 10 * audio_lang_counts.get(k, 0) + subtitle_lang_counts.get(k, 0)
             for k in list(audio_lang_counts.keys()) + list(subtitle_lang_counts.keys())
+            if self.is_language_above_threshold(
+                max(audio_lang_counts.get(k, 0), subtitle_lang_counts.get(k, 0)),
+                len(self.videos),
+            )
         }
 
         sorted_ted_languages = [
@@ -396,6 +403,15 @@ def compute_zim_languages(self):
             # Validate ZIM languages
             validate_language("Language", self.zim_languages)
 
+    def is_language_above_threshold(self, language_count: int, nb_videos: int) -> bool:
+        """Return True if language_count / nb_videos reaches self.language_threshold.
+
+        A 1e-5 tolerance absorbs float rounding; with no videos nothing qualifies.
+        """
+        if nb_videos <= 0:  # avoid ZeroDivisionError on an empty video list
+            return False
+        return language_count / nb_videos >= self.language_threshold - 1e-5
+
     def get_subtitle_dict(self, lang):
         """dict of language name and code from a larger dict lang
 

From 66c6fe4a96eeb550d2ebfb671b3470e02d3f878c Mon Sep 17 00:00:00 2001
From: Uchechukwu Orji <orjiuchechukwu52@yahoo.com>
Date: Thu, 24 Oct 2024 19:46:43 +0100
Subject: [PATCH 2/2] sort before grouping in language counts

---
 src/ted2zim/scraper.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py
index d601bc3..4ab80f3 100644
--- a/src/ted2zim/scraper.py
+++ b/src/ted2zim/scraper.py
@@ -349,7 +349,7 @@ def compute_zim_languages(self):
         audio_lang_counts = {
             lang: len(list(group))
             for lang, group in groupby(
-                [video["native_talk_language"] for video in self.videos]
+                sorted(video["native_talk_language"] for video in self.videos)
             )
         }
 
@@ -357,11 +357,11 @@ def compute_zim_languages(self):
         subtitle_lang_counts = {
             lang: len(list(group))
             for lang, group in groupby(
-                [
+                sorted(
                     subtitle["languageCode"]
                     for video in self.videos
                     for subtitle in video["subtitles"]
-                ]
+                )
             )
         }