From 1cfc4c1014a80f2d8cd89a3aa166180c11a6804b Mon Sep 17 00:00:00 2001 From: Uchechukwu Orji Date: Thu, 24 Oct 2024 14:31:02 +0100 Subject: [PATCH 1/2] use language threshold to compute zim language metadata --- CHANGELOG.md | 1 + src/ted2zim/entrypoint.py | 10 ++++++++++ src/ted2zim/scraper.py | 18 +++++++++++++++++- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ef2cfbc..aa6ad77 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Change log level of "Video at {url} has not yet been translated into {requested_lang_code}" messages from warning to debug (way too verbose) - Disable preloading of subtitles in video.js +- Add `--language-threshold` CLI argument for considering languages that appear in at least specified percentage of videos in `compute_zim_languages` (#212) ### Fixed diff --git a/src/ted2zim/entrypoint.py b/src/ted2zim/entrypoint.py index b1fd0ce..efb3ba8 100755 --- a/src/ted2zim/entrypoint.py +++ b/src/ted2zim/entrypoint.py @@ -175,6 +175,13 @@ def main(): default=False, ) + parser.add_argument( + "--language-threshold", + help="Consider languages present in at least percentage of videos", + default=0.5, + type=float, + ) + args = parser.parse_args() set_debug(args.debug) logger = get_logger() @@ -201,6 +208,9 @@ def main(): if not args.threads >= 1: parser.error("--threads must be provided a positive integer") + if not 0 < args.language_threshold <= 1: + parser.error("--language-threshold must be between 0 and 1.") + scraper = Ted2Zim(**dict(args._get_kwargs())) scraper.run() except Exception as exc: diff --git a/src/ted2zim/scraper.py b/src/ted2zim/scraper.py index 794d9ff..d601bc3 100644 --- a/src/ted2zim/scraper.py +++ b/src/ted2zim/scraper.py @@ -78,6 +78,7 @@ def __init__( tmp_dir, threads, disable_metadata_checks, + language_threshold, ): # video-encoding info self.video_format = video_format @@ -98,6 
def is_language_above_threshold(self, language_count: int, nb_videos: int) -> bool:
    """Tell whether a language appears in a large enough share of the videos.

    The share is ``language_count / nb_videos`` and is compared against
    ``self.language_threshold``, which is a fraction (the CLI entrypoint
    restricts ``--language-threshold`` to ``0 < value <= 1``), not a
    percentage in the 0-100 sense.

    Parameters:
        language_count: number of videos in which the language appears
        nb_videos: total number of videos considered

    Returns:
        True when the share reaches the threshold (within a small tolerance
        absorbing floating point rounding), False otherwise. With no video at
        all, no language can be represented, hence False.
    """
    if nb_videos <= 0:
        # Nothing to measure against; also prevents a ZeroDivisionError below.
        return False

    # Tolerance so that a share which is mathematically equal to the threshold
    # is not rejected because of binary floating point representation
    # (e.g. 3 / 10 compared with 0.1 * 3).
    epsilon = 1e-5
    appearance_fraction = language_count / nb_videos
    return (
        appearance_fraction >= self.language_threshold
        or abs(appearance_fraction - self.language_threshold) <= epsilon
    )
lang, group in groupby( - [video["native_talk_language"] for video in self.videos] + sorted(video["native_talk_language"] for video in self.videos) ) } @@ -357,11 +357,11 @@ def compute_zim_languages(self): subtitle_lang_counts = { lang: len(list(group)) for lang, group in groupby( - [ + sorted( subtitle["languageCode"] for video in self.videos for subtitle in video["subtitles"] - ] + ) ) }