{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T07:10:54Z","timestamp":1775200254118,"version":"3.50.1"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434765","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["Transcribe, Translate, or Transliterate: An Investigation of Intermediate Representations in Spoken Language Models"],"prefix":"10.1109","author":[{"given":"Tol\u00fal\u1ecdp\u00e9","family":"\u00d2g\u00fanrem\u00ed","sequence":"first","affiliation":[{"name":"Stanford University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christopher D.","family":"Manning","sequence":"additional","affiliation":[{"name":"Stanford University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dan","family":"Jurafsky","sequence":"additional","affiliation":[{"name":"Stanford University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karen","family":"Livescu","sequence":"additional","affiliation":[{"name":"Toyota Technological Institute at,Chicago"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"On the landscape of spoken language models: A comprehensive survey","author":"Arora","year":"2025","journal-title":"arXiv preprint arXiv:2504.08528"},{"key":"ref2","article-title":"Qwen2-audio technical report","volume-title":"arXiv preprint arXiv:2407.10759","author":"Chu","year":"2024"},{"key":"ref3","article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"The Twelfth International Conference on Learning Representations","author":"Tang"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389742"},{"key":"ref5","article-title":"Phi-4-mini technical report: Compact yet powerful multimodal language models via mixture-of-LoRAs","volume-title":"arXiv preprint arXiv:2503.01743","author":"Abouelenin","year":"2025"},{"key":"ref6","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022","journal-title":"ICML"},{"key":"ref7","article-title":"Common voice: A massively-multilingual speech corpus","volume-title":"Proceedings of the Twelfth Language Resources and Evaluation Conference","author":"Ardila"},{"key":"ref8","article-title":"Fleurs: Few-shot learning evaluation of universal representations of speech","author":"Conneau","year":"2022","journal-title":"arXiv preprint arXiv:2205.12446"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00656"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414321"},{"key":"ref12","article-title":"Eliciting latent predictions from transformers with the tuned lens","author":"Belrose","year":"2023","journal-title":"arXiv preprint arXiv:2303.08112"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.820"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.309"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.blackboxnlp-1.22"},{"key":"ref16","volume-title":"Interpreting GPT: The logit lens"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-2131"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10019"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1280"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1182"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.naacl-main.273"},{"key":"ref22","volume-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2022"},{"key":"ref23","article-title":"BEATs: Audio pre-training with acoustic tokenizers","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Chen"},{"key":"ref24","article-title":"Qwen2 technical report","volume-title":"arXiv preprint arXiv:2407.10671","author":"Yang","year":"2024"},{"key":"ref25","article-title":"VoxCommunis: A corpus for cross-linguistic phonetic analysis","volume-title":"Proceedings of the Thirteenth Language Resources and Evaluation Conference","author":"Ahn"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00474"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.181"},{"key":"ref29","article-title":"Unsupervised machine translation using monolingual corpora only","author":"Lample","year":"2017","journal-title":"arXiv preprint arXiv:1711.00043"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1162\/COLI_a_00237"},{"key":"ref31","article-title":"Epitran: Precision G2P for many languages","volume-title":"Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)","author":"Mortensen"},{"key":"ref32","volume-title":"SpokenSTS","author":"Merkx","year":"2021"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2020"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434765.pdf?arnumber=11434765","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:59:31Z","timestamp":1775192371000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434765\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434765","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}