Convert directory fbcode/deeplearning to use the Ruff Formatter
Summary:
X-link: flashlight/flashlight#1176

X-link: pytorch/FBGEMM#3242

Converts the specified directory to use the Ruff formatter in pyfmt.


If this diff causes merge conflicts when rebasing, please run
`hg status -n -0 --change . -I '**/*.{py,pyi}' | xargs -0 arc pyfmt`
on your diff, and amend any changes before rebasing onto latest.
That should help reduce or eliminate any merge conflicts.
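
For checkouts without Meta's internal `arc` tooling, a rough public equivalent (a sketch, assuming Ruff is installed via `pip install ruff` and the checkout uses git) is `git diff --name-only HEAD -- '*.py' '*.pyi' | xargs -r ruff format`, which reformats only the Python files changed relative to HEAD.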

allow-large-files
bypass-github-export-checks

Differential Revision: D63766623
Thomas Polasek authored and facebook-github-bot committed Oct 11, 2024
1 parent f197d16 commit f8a766c
Showing 5 changed files with 6 additions and 21 deletions.
@@ -46,7 +46,6 @@ def to_wav2letterFormat(data: torch.tensor, sr: int) -> torch.tensor:


def get_base_data_from_csv(pathTSV) -> List[Dict[str, str]]:
-
    out = []
    with open(pathTSV, "r", encoding="utf-8") as tsvfile:
        reader = csv.DictReader(tsvfile, dialect="excel-tab")
@@ -64,7 +63,6 @@ def norm_text(
    replace_set: Optional[Dict[str, str]] = None,
    del_set: Optional[Set[str]] = None,
) -> Tuple[bool, str]:
-
    text = text.lower()
    if replace_set is not None:
        for char_, val in replace_set.items():
@@ -98,7 +96,6 @@ def get_full_audio_data(
    del_set: Optional[Set[str]] = None,
    file_extension: str = None,
) -> List[FileInfo]:
-
    output = []
    for audio_data in tqdm(base_data, total=len(base_data)):
        path_audio = path_dir_audio / audio_data["local_path"]
@@ -130,7 +127,6 @@ def get_full_audio_data(
def convert_audio_data(
    input_list: List[FileInfo], out_dir_audio: Path
) -> List[FileInfo]:
-
    out_dir_audio.mkdir(exist_ok=True)
    output = []
    for file_info in tqdm(input_list, total=len(input_list)):
@@ -153,13 +149,11 @@ def convert_audio_data(


def load_filter(path_filter: Path) -> List[str]:
-
    with open(path_filter, "r") as f:
        return [x.strip() for x in f.readlines()]


def filter_data_by_id(input_lst: List[FileInfo], to_filter: List[str]):
-
    input_lst.sort(key=lambda x: x.id_)
    to_filter.sort()

@@ -183,7 +177,6 @@ def filter_data_by_id(input_lst: List[FileInfo], to_filter: List[str]):


def main(args):
-
    letters = load_letters(Path(args.path_tokens))
    data = get_base_data_from_csv(Path(args.path_tsv))
    audio_data = get_full_audio_data(
@@ -207,7 +200,6 @@


if __name__ == "__main__":
-
    parser = argparse.ArgumentParser(
        description="Build the lst input files for common voices datasets"
    )
4 changes: 0 additions & 4 deletions recipes/joint_training_vox_populi/prepare_data/get_tokens.py
@@ -18,7 +18,6 @@ def get_tokens_from_str(str_in) -> Set[str]:


def get_tokens_from_str_list(list_str: List[str]) -> Set[str]:
-
    out = set()
    for str_in in list_str:
        out = out.union(get_tokens_from_str(str_in))
@@ -27,15 +26,13 @@ def get_tokens_from_str_list(list_str: List[str]) -> Set[str]:


def save_tokens(tokens, path_out, eow_token="|") -> None:
-
    with open(path_out, "w") as f:
        for x in tokens:
            f.write(x + "\n")
        f.write(eow_token)


def main(args):
-
    data = get_base_data_from_csv(args.input_csv)
    all_tokens = get_tokens_from_str_list([x["text"] for x in data])

@@ -48,7 +45,6 @@


if __name__ == "__main__":
-
    parser = argparse.ArgumentParser("Token builder")
    parser.add_argument("input_csv")
    parser.add_argument("output")
2 changes: 0 additions & 2 deletions recipes/joint_training_vox_populi/prepare_data/lst_utils.py
@@ -19,14 +19,12 @@ class FileInfo:


def save_lst(lst_data: List[FileInfo], path_out: Path) -> None:
-
    with open(path_out, "w") as file:
        for data in lst_data:
            file.write(f"{data.id_} {data.path_} {data.size*3600 * 1000} {data.text}\n")


def load_lst(path_file: Path) -> List[FileInfo]:
-
    with open(path_file, "r") as file:
        data = [x.strip() for x in file.readlines()]

@@ -19,7 +19,6 @@ def has_valid_tokens(word: str, tokens: Set[str]) -> bool:


def read_token_file(path_token_file: Path, eow_char: str) -> Set[str]:
-
    with path_token_file.open("r") as file:
        data = [x.strip() for x in file.readlines()]

@@ -29,7 +28,6 @@ def read_token_file(path_token_file: Path, eow_char: str) -> Set[str]:
def save_lexicon(
    lexicon: Set[str], path_out: Path, eow_char: str, tokens: Set[str]
) -> None:
-
    list_lexicon = list(lexicon)
    list_lexicon.sort()

@@ -98,7 +96,6 @@ def lexicon_from_lst(
    min_occ: int = 10,
    is_raw_text: bool = False,
) -> None:
-
    out_lexicon = set()
    tokens = read_token_file(path_tokens, eow_char)
    log.info("Token file loaded")
10 changes: 6 additions & 4 deletions recipes/lexicon_free/utilities/compute_upper_ppl_convlm.py
@@ -102,8 +102,9 @@ def compute_ppl_upper_limit_char_convlm(
    print("Upper word perplexity for all words: {}".format(ppl_word))
    print("Upper word perplexity for unknown words: {}".format(ppl_word_unk))
    print(
-        "(Reported in the paper) "
-        "Upper word perplexity for known words: {}".format(ppl_word_no_unk)
+        "(Reported in the paper) " "Upper word perplexity for known words: {}".format(
+            ppl_word_no_unk
+        )
    )


@@ -142,8 +143,9 @@ def compute_ppl_upper_limit_word_convlm(model, input_wordlm):
    print("Word perplexity for all words: {}".format(ppl_word))
    print("Word perplexity for unknown words: {}".format(ppl_word_unk))
    print(
-        "(Reported in the paper) "
-        "Word perplexity for known words: {}".format(ppl_word_no_unk)
+        "(Reported in the paper) " "Word perplexity for known words: {}".format(
+            ppl_word_no_unk
+        )
    )
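
Note that the reflowed `print` calls in this file are behavior-preserving: Python merges adjacent string literals into a single string at compile time, and `.format()` then applies to the merged string, so both layouts print the same text. A minimal sketch, with a made-up value standing in for `ppl_word_no_unk`:

```python
# Adjacent string literals merge into one string before .format() runs,
# so the pre- and post-Ruff layouts print exactly the same text.
msg = "(Reported in the paper) " "Word perplexity for known words: {}".format(42.0)
print(msg)  # -> (Reported in the paper) Word perplexity for known words: 42.0
```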


