Here are some resources about Text-to-Audio and Audio-to-Text modeling, understanding, and generation in Multi-Modal LLMs
tag: Moonshine | Useful Sensors
paper link: here
blog link: here
github link: here
citation:
@misc{jeffries2024moonshinespeechrecognitionlive,
  author        = {Jeffries, Nat and King, Evan and Kudlur, Manjunath and Nicholson, Guy and Wang, James and Warden, Pete},
  title         = {Moonshine: Speech Recognition for Live Transcription and Voice Commands},
  year          = {2024},
  eprint        = {2410.15608},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SD},
  url           = {https://arxiv.org/abs/2410.15608},
}
tag: Qwen2-Audio
paper link: here
github link: here
model links:
model name | link |
---|---|
Qwen2-Audio-7B-Instruct | here |
Qwen2-Audio-7B | here |
citation:
@misc{chu2024qwen2audiotechnicalreport,
  author        = {Chu, Yunfei and Xu, Jin and Yang, Qian and Wei, Haojie and Wei, Xipin and Guo, Zhifang and Leng, Yichong and Lv, Yuanjun and He, Jinzheng and Lin, Junyang and Zhou, Chang and Zhou, Jingren},
  title         = {{Qwen2-Audio} Technical Report},
  year          = {2024},
  eprint        = {2407.10759},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2407.10759},
}
tag: Spirit LM
paper link: here
github link: here
citation:
@misc{nguyen2024spiritlminterleavedspoken,
  author        = {Nguyen, Tu Anh and Muller, Benjamin and Yu, Bokai and Costa-juss{\`a}, Marta R. and Elbayad, Maha and Popuri, Sravya and Ropers, Christophe and Duquenne, Paul-Ambroise and Algayres, Robin and Mavlyutov, Ruslan and Gat, Itai and Williamson, Mary and Synnaeve, Gabriel and Pino, Juan and Sagot, Beno{\^\i}t and Dupoux, Emmanuel},
  title         = {Spirit {LM}: Interleaved Spoken and Written Language Model},
  year          = {2024},
  eprint        = {2402.05755},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2402.05755},
}
tag: AIR-Bench
paper link: here
github link: here
citation:
@misc{yang2024airbenchbenchmarkinglargeaudiolanguage,
  author        = {Yang, Qian and Xu, Jin and Liu, Wenrui and Chu, Yunfei and Jiang, Ziyue and Zhou, Xiaohuan and Leng, Yichong and Lv, Yuanjun and Zhao, Zhou and Zhou, Chang and Zhou, Jingren},
  title         = {{AIR-Bench}: Benchmarking Large Audio-Language Models via Generative Comprehension},
  year          = {2024},
  eprint        = {2402.07729},
  archivePrefix = {arXiv},
  primaryClass  = {eess.AS},
  url           = {https://arxiv.org/abs/2402.07729},
}