{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,21]],"date-time":"2025-01-21T05:21:08Z","timestamp":1737436868499,"version":"3.33.0"},"reference-count":21,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,2]]},"DOI":"10.1109\/slt61566.2024.10832300","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:27Z","timestamp":1737052287000},"page":"1137-1143","source":"Crossref","is-referenced-by-count":0,"title":["Just ASR + LLM? A Study on Speech Large Language Models\u2019 Ability to Identify And Understand Speaker in Spoken Dialogue"],"prefix":"10.1109","author":[{"given":"Junkai","family":"Wu","sequence":"first","affiliation":[{"name":"University of Washington"}]},{"given":"Xulin","family":"Fan","sequence":"additional","affiliation":[{"name":"University of Illinois-Urbana Champaign"}]},{"given":"Bo-Ru","family":"Lu","sequence":"additional","affiliation":[{"name":"University of Washington"}]},{"given":"Xilin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Columbia University"}]},{"given":"Nima","family":"Mesgarani","sequence":"additional","affiliation":[{"name":"Columbia University"}]},{"given":"Mark","family":"Hasegawa-Johnson","sequence":"additional","affiliation":[{"name":"University of Illinois-Urbana Champaign"}]},{"given":"Mari","family":"Ostendorf","sequence":"additional","affiliation":[{"name":"University of Washington"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Qwen-Audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023","journal-title":"arXiv preprint arXiv:2311.07919"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.263"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389742"},{"key":"ref4","article-title":"Listen, think, and understand","author":"Gong","year":"2023","journal-title":"arXiv preprint arXiv:2305.10790"},{"key":"ref5","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2022","journal-title":"arXiv preprint arXiv:2212.04356"},{"key":"ref6","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2302.13971"},{"key":"ref7","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv preprint arXiv:2106.09685"},{"key":"ref8","article-title":"Qwen technical report","volume-title":"arXiv preprint arXiv:2309.16609","author":"Bai","year":"2023"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref10","article-title":"SD-Eval: A benchmark dataset for spoken dialogue understanding beyond words","author":"Ao","year":"2024","journal-title":"arXiv preprint arXiv:2406.13340"},{"key":"ref11","article-title":"AudioBench: A universal benchmark for audio large language models","author":"Wang","year":"2024","journal-title":"arXiv preprint arXiv:2406.16020"},{"article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Tang","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s42001-024-00345-9"},{"key":"ref14","article-title":"ChatGPT-4 outperforms experts and crowd workers in annotating political twitter messages with zero-shot learning","author":"T\u00f6rnberg","year":"2023","journal-title":"arXiv preprint arXiv:2304.06588"},{"key":"ref15","article-title":"GPT-4 technical report","volume-title":"arXiv preprint arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461471"},{"key":"ref17","first-page":"19594","article-title":"StyleTTS 2: Towards human-level text-to-speech through style diffusion and adversarial training with large speech language models","volume":"36","author":"Li","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"CSTR VCTK Corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)","year":"2019","author":"Yamagishi","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref20","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint arXiv:2307.09288"},{"year":"2024","key":"ref21","article-title":"Llama 3 model card"}],"event":{"name":"2024 IEEE Spoken Language Technology Workshop (SLT)","start":{"date-parts":[[2024,12,2]]},"location":"Macao","end":{"date-parts":[[2024,12,5]]}},"container-title":["2024 IEEE Spoken Language Technology Workshop (SLT)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10830790\/10830793\/10832300.pdf?arnumber=10832300","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T18:39:21Z","timestamp":1737398361000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10832300\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"references-count":21,"URL":"https:\/\/doi.org\/10.1109\/slt61566.2024.10832300","relation":{},"subject":[],"published":{"date-parts":[[2024,12,2]]}}}