{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T05:21:52Z","timestamp":1775452912810,"version":"3.50.1"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434725","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-4","source":"Crossref","is-referenced-by-count":1,"title":["AURA: Agent for Understanding, Reasoning, and Automated Tool Use in Voice-Driven Tasks"],"prefix":"10.1109","author":[{"given":"Leander Melroy","family":"Maben","sequence":"first","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Gayathri Ganesh","family":"Lakshmy","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Srijith","family":"Radhakrishnan","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Siddhant","family":"Arora","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Espnet-sds: Unified toolkit and demo for spoken dialogue systems","author":"Arora","year":"2025"},{"key":"ref2","volume-title":"Speechgpt: Empowering large language models with intrinsic cross-modal conversational abilities","author":"Zhang","year":"2023"},{"key":"ref3","volume-title":"Baichuanaudio: A unified framework for end-to-end speech interaction","author":"Li","year":"2025"},{"key":"ref4","volume-title":"Moshi: a speech-text foundation model for real-time dialogue","author":"D\u00e9fossez","year":"2024"},{"key":"ref5","volume-title":"Mini-omni: Language models can hear, talk while thinking in streaming","author":"Xie","year":"2024"},{"key":"ref6","volume-title":"Openomni: Advancing open-source omnimodal large language models with progressive multimodal alignment and real-time self-aware emotional speech synthesis","author":"Luo","year":"2025"},{"key":"ref7","volume-title":"Voicebench: Benchmarking llm-based voice assistants","author":"Chen","year":"2024"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1547"},{"key":"ref9","volume-title":"Spokenwoz: A large-scale speech-text benchmark for spoken task-oriented dialogue agents","author":"Si","year":"2024"},{"key":"ref10","volume-title":"Training language models to follow instructions with human feedback","author":"Ouyang","year":"2022"},{"key":"ref11","volume-title":"Hugginggpt: Solving ai tasks with chatgpt and its friends in hugging face","author":"Shen","year":"2023"},{"key":"ref12","volume-title":"Api-bank: A comprehensive benchmark for tool-augmented llms","author":"Li","year":"2023"},{"key":"ref13","volume-title":"Tooldial: Multi-turn dialogue generation method for tool-augmented language models","author":"Shim","year":"2025"},{"key":"ref14","first-page":"2748","article-title":"Rethinking task-oriented dialogue systems: From complex modularity to zero-shot autonomous agent","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Xu"},{"key":"ref15","article-title":"React: Synergizing reasoning and acting in language models","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yao"},{"key":"ref16","volume-title":"Audio-cot: Exploring chain-of-thought reasoning in large audio language model","author":"Ma","year":"2025"},{"key":"ref17","volume-title":"Audio-reasoner: Improving reasoning capability in large audio language models","author":"Xie","year":"2025"},{"key":"ref18","volume-title":"Respact: Harmonizing reasoning, speaking, and acting towards building large language model-based conversational ai agents","author":"Dongre","year":"2025"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1260"},{"key":"ref20","volume-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2022"},{"key":"ref21","volume-title":"Gpt-4o: Openai\u2019s new multimodal flagship model","year":"2024"},{"key":"ref22","volume-title":"Owsm v3.1: Better and faster open whisper-style speech models based on e-branchformer","author":"Peng","year":"2024"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053512"},{"key":"ref24","volume-title":"The llama 3 herd of models","author":"A. G","year":"2024"},{"key":"ref25","doi-asserted-by":"crossref","DOI":"10.1145\/3600006.3613165","volume-title":"Efficient memory management for large language model serving with pagedattention","author":"Kwon","year":"2023"},{"key":"ref26","volume-title":"Alpacaeval: An automatic evaluator of instruction-following models","author":"Li","year":"2023"},{"key":"ref27","volume-title":"Gpt-4o: Openai\u2019s new multimodal flagship model","year":"2024"},{"key":"ref28","volume-title":"Gpt-4o: Openai\u2019s new multimodal flagship model","year":"2024"},{"key":"ref29","volume-title":"Moshi: a speech-text foundation model for real-time dialogue","author":"D\u00e9fossez","year":"2024"},{"key":"ref30","volume-title":"Mini-omni2: Towards open-source gpt-4o with vision, speech and duplex capabilities","author":"Xie","year":"2024"},{"key":"ref31","volume-title":"Kimi-audio technical report","author":"KimiTeam","year":"2025"},{"key":"ref32","volume-title":"Qwen3 technical report","year":"2025"},{"key":"ref33","volume-title":"Parakeet-tdt-0.6b-v2","year":"2024"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434725.pdf?arnumber=11434725","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:55Z","timestamp":1775192335000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434725\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434725","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}