apt-get update
apt-get install openjdk-8-jdk
pip install evaluate sacrebleu==1.5.1
pip install pycocoevalcap
pip install edit_distance editdistance
mkdir -p eval_audio/caption_evaluation_tools
git clone https://github.com/audio-captioning/caption-evaluation-tools.git eval_audio/caption_evaluation_tools
cd eval_audio/caption_evaluation_tools/coco_caption/
./get_stanford_models.sh
cd ../../..
pip install "sacrebleu[ja]"  # adds MeCab support for Japanese tokenization
pip install sed_eval
pip install dcase_util
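With the dependencies in place, a one-time import check confirms the Python side of the environment; a minimal sketch (run from a Python shell):

# verify that every evaluation dependency installed above imports cleanly
import evaluate
import sacrebleu
import pycocoevalcap
import editdistance
import sed_eval
import dcase_util
print("sacrebleu", sacrebleu.__version__)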
- Data
LibriSpeech: https://www.openslr.org/12
Aishell1: https://www.aishelltech.com/kysjcp
Aishell2: https://www.aishelltech.com/aishell_2
mkdir -p data/asr && cd data/asr
# download the audio data from the links above
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/librispeech_eval.jsonl
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/aishell1_eval.jsonl
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/aishell2_eval.jsonl
cd ../..
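The converted *_eval.jsonl files are in JSON Lines format, one JSON object per example (the SRWT section below shows a concrete record). A minimal Python sketch for peeking at the schema; the exact field names are whatever the downloaded file contains:

import json

# print the first record of a converted evaluation file to see its fields
with open("data/asr/librispeech_eval.jsonl") as f:
    record = json.loads(f.readline())
print(sorted(record))  # field names
print(record)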
- Evaluate
checkpoint=/path/to/checkpoint  # placeholder: point this at your downloaded model checkpoint
for ds in "librispeech" "aishell1" "aishell2"
do
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_asr.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 20 \
--num-workers 2
done
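ASR is scored with word error rate, which the editdistance package installed above computes directly; a minimal sketch over toy strings (for the Chinese Aishell sets the same computation is applied per character, i.e. CER):

import editdistance

# WER = word-level edit distance divided by the number of reference words
ref = "the cat sat on a mat".split()
hyp = "the cat sat on the mat".split()
print(editdistance.eval(hyp, ref) / len(ref))  # 0.1667: one substitution over six words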
- Data
CoVoST 2: https://github.com/facebookresearch/covost
mkdir -p data/st && cd data/st
# download the audio data from https://commonvoice.mozilla.org/en/datasets
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/covost2_eval.jsonl
cd ../..
- Evaluate
ds="covost2"
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_st.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
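Translation quality on CoVoST 2 is reported as BLEU via sacrebleu; the [ja] extra installed earlier provides the MeCab tokenizer required for Japanese targets. A minimal sketch over toy sentence pairs:

import sacrebleu

# corpus-level BLEU; the second argument is a list of reference streams
print(sacrebleu.corpus_bleu(["Das ist ein Test."], [["Das ist ein Test."]]).score)  # 100.0

# Japanese targets need the ja-mecab tokenizer from sacrebleu[ja]
print(sacrebleu.corpus_bleu(["これはテストです。"], [["これはテストです。"]], tokenize="ja-mecab").score)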
- Data
Industrial SRWT data: a closed-source dataset described in https://arxiv.org/pdf/2305.11013.pdf
Please organize the evaluation data in JSON Lines format, where each line is a JSON object of the following form:
{"audio": "001001.wav",
"gt": "<|0.00|><|sil|><|0.54|>面<|0.86|>条<|1.17|>机<|1.46|><|sil|><|2.01|>",
"source": "aishell2"}
- Evaluate
ds="covost2"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_st.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
- Data
Clotho: https://github.com/audio-captioning/clotho-dataset
mkdir -p data/caption && cd data/caption
# download the audio data from https://zenodo.org/record/3490684
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/clotho_eval.jsonl
cd ../..
- Evaluate
ds="clotho"
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_caption.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
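Captions are scored with the COCO caption metrics from pycocoevalcap and the cloned caption-evaluation-tools (the Java JDK and Stanford models downloaded above are needed for tokenization and SPICE). A minimal CIDEr sketch over toy data; note the official pipeline first runs the PTB tokenizer over both sides:

from pycocoevalcap.cider.cider import Cider

# gts maps an example id to its reference captions; res maps the id to one hypothesis
gts = {
    "ex1": ["a dog barks in the distance", "a dog is barking far away"],
    "ex2": ["rain falls on a metal roof", "heavy rain hits a roof"],
}
res = {
    "ex1": ["a dog barking in the distance"],
    "ex2": ["rain falling on a roof"],
}
score, per_example = Cider().compute_score(gts, res)
print("CIDEr:", score)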
- Data
TUT2017: https://dcase.community/challenge2017/task-acoustic-scene-classification
CochlScene: https://zenodo.org/records/7080122
mkdir -p data/asc && cd data/asc
# download TUT2017 datasets from https://zenodo.org/records/1040168
# download CochlScene datasets from https://zenodo.org/records/7080122
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/tut2017_eval.jsonl
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/cochlscene_eval.jsonl
cd ../..
- Evaluate
for ds in "cochlscene" "tut2017"
do
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_scene.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 20 \
--num-workers 2
done
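Scene classification, like several of the tasks below, is scored as plain label accuracy; a minimal sketch with illustrative labels:

# accuracy = fraction of examples whose predicted label matches the reference
preds = ["bus", "park", "home", "office"]
refs = ["bus", "park", "office", "office"]
print(sum(p == r for p, r in zip(preds, refs)) / len(refs))  # 0.75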
- Data
MELD: https://affective-meld.github.io
mkdir -p data/ser && cd data/ser
# download the MELD dataset from the link above
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/meld_eval.jsonl
cd ../..
- Evaluate
ds="meld"
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_emotion.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2 \
--few-shot 0
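Besides accuracy, MELD results are commonly reported as weighted F1 over the emotion labels; a minimal sketch that computes it without extra dependencies (labels are illustrative):

from collections import Counter

preds = ["joy", "anger", "neutral", "neutral", "sadness"]
refs = ["joy", "neutral", "neutral", "anger", "sadness"]

# weighted F1: per-label F1 averaged with weights proportional to label support
weighted_f1 = 0.0
for label, support in Counter(refs).items():
    tp = sum(p == r == label for p, r in zip(preds, refs))
    fp = sum(p == label and r != label for p, r in zip(preds, refs))
    fn = sum(r == label and p != label for p, r in zip(preds, refs))
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    weighted_f1 += support / len(refs) * f1
print(f"weighted F1: {weighted_f1:.3f}")  # 0.600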
- Data
Clotho-AQA: https://zenodo.org/records/6473207
mkdir -p data/aqa && cd data/aqa
# download Clotho-AQA dataset from https://zenodo.org/records/6473207
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/clothoaqa_eval.jsonl
cd ../..
- Evaluate
ds="clothoaqa"
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_aqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
- Data
VocalSound: https://github.com/YuanGongND/vocalsound
mkdir -p data/vsc && cd data/vsc
# download the dataset from the link above
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/vocalsound_eval.jsonl
cd ../..
- Evaluate
ds="clothoaqa"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_aqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
- Data
NSynth: https://magenta.tensorflow.org/datasets/nsynth#files
mkdir -p data/mna && cd data/mna
# download the NSynth dataset from the link above
# download converted files
wget https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/evaluation/nsynth_eval.jsonl
cd ../..
- Evaluate
ds="nsynth"
python -m torch.distributed.launch --use_env \
--nproc_per_node ${NPROC_PER_NODE:-8} --nnodes 1 \
evaluate_note_analysis.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
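For NSynth, each note's filename already encodes its instrument source, MIDI pitch, and velocity (e.g. bass_synthetic_033-022-050), which is handy for checking predictions; a minimal parsing sketch:

# NSynth note ids follow <instrument>-<pitch>-<velocity>
name = "bass_synthetic_033-022-050"
instrument, pitch, velocity = name.rsplit("-", 2)
print(instrument, int(pitch), int(velocity))  # bass_synthetic_033 22 50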
Parts of this code are borrowed from the HEAR Benchmark, OFA-Sys, and the audio-captioning caption-evaluation-tools; thanks for their wonderful work.