diff --git a/recipes/joint_training_vox_populi/prepare_data/common_voice_to_wav2letter.py b/recipes/joint_training_vox_populi/prepare_data/common_voice_to_wav2letter.py index 3b6cc34b..6de823a4 100644 --- a/recipes/joint_training_vox_populi/prepare_data/common_voice_to_wav2letter.py +++ b/recipes/joint_training_vox_populi/prepare_data/common_voice_to_wav2letter.py @@ -46,7 +46,6 @@ def to_wav2letterFormat(data: torch.tensor, sr: int) -> torch.tensor: def get_base_data_from_csv(pathTSV) -> List[Dict[str, str]]: - out = [] with open(pathTSV, "r", encoding="utf-8") as tsvfile: reader = csv.DictReader(tsvfile, dialect="excel-tab") @@ -64,7 +63,6 @@ def norm_text( replace_set: Optional[Dict[str, str]] = None, del_set: Optional[Set[str]] = None, ) -> Tuple[bool, str]: - text = text.lower() if replace_set is not None: for char_, val in replace_set.items(): @@ -98,7 +96,6 @@ def get_full_audio_data( del_set: Optional[Set[str]] = None, file_extension: str = None, ) -> List[FileInfo]: - output = [] for audio_data in tqdm(base_data, total=len(base_data)): path_audio = path_dir_audio / audio_data["local_path"] @@ -130,7 +127,6 @@ def get_full_audio_data( def convert_audio_data( input_list: List[FileInfo], out_dir_audio: Path ) -> List[FileInfo]: - out_dir_audio.mkdir(exist_ok=True) output = [] for file_info in tqdm(input_list, total=len(input_list)): @@ -153,13 +149,11 @@ def convert_audio_data( def load_filter(path_filter: Path) -> List[str]: - with open(path_filter, "r") as f: return [x.strip() for x in f.readlines()] def filter_data_by_id(input_lst: List[FileInfo], to_filter: List[str]): - input_lst.sort(key=lambda x: x.id_) to_filter.sort() @@ -183,7 +177,6 @@ def filter_data_by_id(input_lst: List[FileInfo], to_filter: List[str]): def main(args): - letters = load_letters(Path(args.path_tokens)) data = get_base_data_from_csv(Path(args.path_tsv)) audio_data = get_full_audio_data( @@ -207,7 +200,6 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( description="Build the lst input files for common voices datasets" ) diff --git a/recipes/joint_training_vox_populi/prepare_data/get_tokens.py b/recipes/joint_training_vox_populi/prepare_data/get_tokens.py index efcbfdcd..5ba8cdb8 100644 --- a/recipes/joint_training_vox_populi/prepare_data/get_tokens.py +++ b/recipes/joint_training_vox_populi/prepare_data/get_tokens.py @@ -18,7 +18,6 @@ def get_tokens_from_str(str_in) -> Set[str]: def get_tokens_from_str_list(list_str: List[str]) -> Set[str]: - out = set() for str_in in list_str: out = out.union(get_tokens_from_str(str_in)) @@ -27,7 +26,6 @@ def get_tokens_from_str_list(list_str: List[str]) -> Set[str]: def save_tokens(tokens, path_out, eow_token="|") -> None: - with open(path_out, "w") as f: for x in tokens: f.write(x + "\n") @@ -35,7 +33,6 @@ def save_tokens(tokens, path_out, eow_token="|") -> None: def main(args): - data = get_base_data_from_csv(args.input_csv) all_tokens = get_tokens_from_str_list([x["text"] for x in data]) @@ -48,7 +45,6 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser("Token builder") parser.add_argument("input_csv") parser.add_argument("output") diff --git a/recipes/joint_training_vox_populi/prepare_data/lst_utils.py b/recipes/joint_training_vox_populi/prepare_data/lst_utils.py index 48817210..a7df5b5b 100644 --- a/recipes/joint_training_vox_populi/prepare_data/lst_utils.py +++ b/recipes/joint_training_vox_populi/prepare_data/lst_utils.py @@ -19,14 +19,12 @@ class FileInfo: def save_lst(lst_data: List[FileInfo], path_out: Path) -> None: - with open(path_out, "w") as file: for data in lst_data: file.write(f"{data.id_} {data.path_} {data.size*3600 * 1000} {data.text}\n") def load_lst(path_file: Path) -> List[FileInfo]: - with open(path_file, "r") as file: data = [x.strip() for x in file.readlines()] diff --git a/recipes/joint_training_vox_populi/prepare_data/make_lexicon.py b/recipes/joint_training_vox_populi/prepare_data/make_lexicon.py index d8924dbe..d7950f06 100644 --- a/recipes/joint_training_vox_populi/prepare_data/make_lexicon.py +++ b/recipes/joint_training_vox_populi/prepare_data/make_lexicon.py @@ -19,7 +19,6 @@ def has_valid_tokens(word: str, tokens: Set[str]) -> bool: def read_token_file(path_token_file: Path, eow_char: str) -> Set[str]: - with path_token_file.open("r") as file: data = [x.strip() for x in file.readlines()] @@ -29,7 +28,6 @@ def read_token_file(path_token_file: Path, eow_char: str) -> Set[str]: def save_lexicon( lexicon: Set[str], path_out: Path, eow_char: str, tokens: Set[str] ) -> None: - list_lexicon = list(lexicon) list_lexicon.sort() @@ -98,7 +96,6 @@ def lexicon_from_lst( min_occ: int = 10, is_raw_text: bool = False, ) -> None: - out_lexicon = set() tokens = read_token_file(path_tokens, eow_char) log.info("Token file loaded") diff --git a/recipes/lexicon_free/utilities/compute_upper_ppl_convlm.py b/recipes/lexicon_free/utilities/compute_upper_ppl_convlm.py index 70cf292d..3fc789f5 100644 --- a/recipes/lexicon_free/utilities/compute_upper_ppl_convlm.py +++ b/recipes/lexicon_free/utilities/compute_upper_ppl_convlm.py @@ -102,8 +102,9 @@ def compute_ppl_upper_limit_char_convlm( print("Upper word perplexity for all words: {}".format(ppl_word)) print("Upper word perplexity for unknown words: {}".format(ppl_word_unk)) print( - "(Reported in the paper) " - "Upper word perplexity for known words: {}".format(ppl_word_no_unk) + "(Reported in the paper) " "Upper word perplexity for known words: {}".format( + ppl_word_no_unk + ) ) @@ -142,8 +143,9 @@ def compute_ppl_upper_limit_word_convlm(model, input_wordlm): print("Word perplexity for all words: {}".format(ppl_word)) print("Word perplexity for unknown words: {}".format(ppl_word_unk)) print( - "(Reported in the paper) " - "Word perplexity for known words: {}".format(ppl_word_no_unk) + "(Reported in the paper) " "Word perplexity for known words: {}".format( + ppl_word_no_unk + ) )