{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:01:55Z","timestamp":1775199715698,"version":"3.50.1"},"reference-count":31,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434639","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["SLM-S2ST: A multimodal language model for direct speech-to-speech translation"],"prefix":"10.1109","author":[{"given":"Yuxuan","family":"Hu","sequence":"first","affiliation":[{"name":"Microsoft,USA"}]},{"given":"Haibin","family":"Wu","sequence":"additional","affiliation":[{"name":"Microsoft,USA"}]},{"given":"Ruchao","family":"Fan","sequence":"additional","affiliation":[{"name":"Microsoft,USA"}]},{"given":"Xiaofei","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft,USA"}]},{"given":"Heng","family":"Lu","sequence":"additional","affiliation":[{"name":"Microsoft,USA"}]},{"given":"Yao","family":"Qian","sequence":"additional","affiliation":[{"name":"Microsoft,USA"}]},{"given":"Jinyu","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"On the landscape of spoken language models: A comprehensive survey","author":"Arora","year":"2025","journal-title":"arXiv preprint arXiv:2504.08528"},{"key":"ref2","article-title":"Moshi: a speechtext foundation model for real-time dialogue","author":"D\u00e9fossez","year":"2024","journal-title":"arXiv preprint arXiv:2410.00037"},{"key":"ref3","article-title":"Wavchat: A survey of spoken dialogue models","author":"Ji","year":"2024","journal-title":"arXiv preprint arXiv:2411.13577"},{"key":"ref4","article-title":"Towards audio language modeling-an overview","author":"Wu","year":"2024","journal-title":"arXiv preprint arXiv:2402.13236"},{"key":"ref5","article-title":"A survey on speech large language models","author":"Peng","year":"2024","journal-title":"arXiv preprint arXiv:2410.18908"},{"key":"ref6","article-title":"Hello gpt-4o","journal-title":"OpenAI"},{"key":"ref7","article-title":"SALMONN: Towards generic hearing abilities for large language models","author":"Tang","year":"2024"},{"key":"ref8","article-title":"Listen, think, and understand","author":"Gong","year":"2023","journal-title":"arXiv preprint arXiv:2305.10790"},{"key":"ref9","article-title":"Qwen-Audio: Advancing universal audio understanding via unified large-scale audiolanguage models","author":"Chu","year":"2023","journal-title":"arXiv preprint arXiv:2311.07919"},{"key":"ref10","article-title":"Audio flamingo 2: An audio-language model with long-audio understanding and expert reasoning abilities","author":"Ghosh","year":"2025","journal-title":"arXiv preprint arXiv:2503.03983"},{"key":"ref11","article-title":"Phi-4-mini technical report: Compact yet powerful multimodal language models via 
mixture-of-loras","volume-title":"Microsoft","year":"2025"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref13","article-title":"Kimiaudio technical report","volume-title":"arXiv preprint arXiv:2504.18425","author":"Ding","year":"2025"},{"key":"ref14","article-title":"Cvss corpus and massively multilingual speech-to- speech translation","author":"Jia","year":"2022","journal-title":"arXiv preprint arXiv:2201.03713"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.593"},{"key":"ref16","article-title":"Finite scalar quantization: VQ-VAE made simple","volume-title":"The Twelfth International Conference on Learning Representations","author":"Mentzer"},{"key":"ref17","first-page":"28492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International conference on machine learning.","author":"Radford"},{"key":"ref18","article-title":"Cosyvoice 2: Scalable streaming speech synthesis with large language models","volume-title":"arXiv preprint arXiv:2412.10117","author":"Du","year":"2024"},{"key":"ref19","article-title":"Flow matching for generative modeling","volume-title":"The Eleventh International Conference on Learning Representations","author":"Lipman"},{"key":"ref20","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Kong"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-2027"},{"issue":"2","key":"ref22","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref24","article-title":"Seamless: Multilingual expressive and streaming speech translation","author":"Barrault","year":"2023","journal-title":"arXiv preprint arXiv:2312.05187"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.235"},{"key":"ref26","first-page":"10120","article-title":"Translatotron 2: High-quality direct speech-to-speech translation with voice preservation","volume-title":"International Conference on Machine Learning.","author":"Jia"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3173"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.872"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.485"},{"key":"ref30","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref31","article-title":"Qwen2. 
5-omni technical report","volume-title":"arXiv preprint arXiv:2503.20215","author":"Xu","year":"2025"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434639.pdf?arnumber=11434639","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:57:36Z","timestamp":1775192256000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434639\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434639","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}
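
The record above is a Crossref work object wrapped in the standard REST API envelope ("status", "message-type", "message"), which is what the works route at https://api.crossref.org/works/<DOI> returns. Below is a minimal sketch, assuming network access and only Python's standard library (no Crossref-specific client), of how such a record can be fetched and its key fields read; the DOI is taken from the record itself.

# Sketch: fetch this record from the public Crossref REST API and read a
# few fields out of the "message" payload shown above. Assumes network
# access; the envelope fields match the record above.
import json
import urllib.request

DOI = "10.1109/asru65441.2025.11434639"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    envelope = json.load(resp)

# Single-work responses always arrive as {"status": "ok",
# "message-type": "work", "message": {...}}.
assert envelope["status"] == "ok" and envelope["message-type"] == "work"
work = envelope["message"]

# "title" and "container-title" are lists; take the first entry of each.
print(work["title"][0])
print(work["container-title"][0])
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
print(f'references: {work["references-count"]}, pages: {work["page"]}')

Running this prints the paper title, the proceedings name, the author list, and the reference and page counts, all drawn from the same fields visible in the record above.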