{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T16:47:18Z","timestamp":1781110038723,"version":"3.54.1"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10888900","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T13:52:43Z","timestamp":1741787563000},"page":"1-5","source":"Crossref","is-referenced-by-count":5,"title":["Speech Retrieval-Augmented Generation without Automatic Speech Recognition"],"prefix":"10.1109","author":[{"given":"Do June","family":"Min","sequence":"first","affiliation":[{"name":"University of Michigan"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Karel","family":"Mundnich","sequence":"additional","affiliation":[{"name":"AWS AI Labs"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andy","family":"Lapastora","sequence":"additional","affiliation":[{"name":"AWS AI Labs"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Erfan","family":"Soltanmohammadi","sequence":"additional","affiliation":[{"name":"AWS AI Labs"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Srikanth","family":"Ronanki","sequence":"additional","affiliation":[{"name":"AWS AI Labs"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kyu","family":"Han","sequence":"additional","affiliation":[{"name":"AWS AI Labs"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","first-page":"9459","article-title":"Retrievalaugmented generation for knowledge-intensive nlp tasks","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Lewis","year":"2020"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.375"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICAAIC60222.2024.10574972"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2015.2438543"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2008.917992"},{"key":"ref6","author":"Radford","year":"2022","journal-title":"Robust speech recognition via large-scale weak supervision"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.eacl-main.148"},{"key":"ref8","first-page":"95","article-title":"OLISIA: a cascade system for spoken dialogue state tracking","volume-title":"Proceedings of The Eleventh Dialog System Technology Challenge","author":"Jacqmin"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461785"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.98"},{"key":"ref11","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/icassp49357.2023.10095969"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref14","article-title":"Contrastive learning with hard negative samples","volume-title":"International Conference on Learning Representations","author":"Robinson"},{"key":"ref15","article-title":"Why do we need large batchsizes in contrastive learning? a gradient-bias perspective","author":"Chen","year":"2022","journal-title":"Neural Information Processing Systems"},{"key":"ref16","author":"Duquenne","year":"2023","journal-title":"SONAR: sentence-level multimodal and language-agnostic representations"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-2227"},{"key":"ref18","article-title":"Speechbert: Cross-modal pre-trained language model for end-to-end spoken question answering","volume":"abs\/1910.11559","author":"Chuang","year":"2019","journal-title":"ArXiv"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448030"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447448"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448210"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447448"},{"key":"ref23","author":"Das","year":"2024","journal-title":"Speechverse: A large-scale generalizable audio language model"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2213"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.642"},{"key":"ref28","article-title":"Qwen-audio: Advancing universal audio understanding via unified large-scale audio-language models","author":"Chu","year":"2023"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1714"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d16-1264"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.3115\/1119176.1119195"},{"key":"ref33","author":"Roychowdhury","year":"2024","journal-title":"Evaluation of rag metrics for question answering in the telecom domain"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10888900.pdf?arnumber=10888900","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:21:35Z","timestamp":1774416095000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10888900\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10888900","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}