{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T23:29:39Z","timestamp":1780961379829,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T00:00:00Z","timestamp":1726444800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,9,16]]},"DOI":"10.1145\/3652988.3696198","type":"proceedings-article","created":{"date-parts":[[2024,12,26]],"date-time":"2024-12-26T12:22:57Z","timestamp":1735215777000},"page":"1-3","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Estuary: A Framework For Building Multimodal Low-Latency Real-Time Socially Interactive Agents"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0974-3395","authenticated-orcid":false,"given":"Spencer","family":"Lin","sequence":"first","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7345-2516","authenticated-orcid":false,"given":"Basem","family":"Rizk","sequence":"additional","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1406-6873","authenticated-orcid":false,"given":"Miru","family":"Jun","sequence":"additional","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5003-6639","authenticated-orcid":false,"given":"Andy","family":"Artze","sequence":"additional","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8248-3948","authenticated-orcid":false,"given":"Caitl\u00edn","family":"Sullivan","sequence":"additional","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3308-2474","authenticated-orcid":false,"given":"Sharon","family":"Mozgai","sequence":"additional","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8772-7433","authenticated-orcid":false,"given":"Scott","family":"Fisher","sequence":"additional","affiliation":[{"name":"University of Southern California, United States of America"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,12,26]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"[n. d.]. Faster Whisper. https:\/\/github.com\/SYSTRAN\/faster-whisper?tab=readme-ov-file."},{"key":"e_1_3_2_2_2_1","unstructured":"[n. d.]. NVIDIA Ace. https:\/\/developer.nvidia.com\/ace."},{"key":"e_1_3_2_2_3_1","unstructured":"[n. d.]. Socket.IO. https:\/\/socket.io\/"},{"key":"e_1_3_2_2_4_1","first-page":"i","volume":"202","author":"Abdin Marah","unstructured":"Marah Abdin, Sam\u00a0Ade Jacobs, Ammar\u00a0Ahmad Awan, Jyoti Aneja, Ahmed Awadallah, Hany Awadalla, Nguyen Bach, Amit Bahree, Arash Bakhtiari, Harkirat Behl, 2024. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 (2024).","journal-title":"Harkirat Behl"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2018.00019"},{"key":"e_1_3_2_2_6_1","volume-title":"Language Models are Few-Shot Learners. CoRR abs\/2005.14165","author":"Brown B.","year":"2020","unstructured":"Tom\u00a0B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel\u00a0M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. CoRR abs\/2005.14165 (2020). arXiv:2005.14165https:\/\/arxiv.org\/abs\/2005.14165"},{"key":"e_1_3_2_2_7_1","unstructured":"Zhe Cao Gines Hidalgo Tomas Simon Shih-En Wei and Yaser Sheikh. 2019. OpenPose: Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields. arxiv:1812.08008"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Edresson Casanova Kelly Davis Eren G\u00f6lge G\u00f6rkem G\u00f6knar Iulian Gulea Logan Hart Aya Aljafari Joshua Meyer Reuben Morais Samuel Olayemi and Julian Weber. 2024. XTTS: a Massively Multilingual Zero-Shot Text-to-Speech Model. arxiv:2406.04904\u00a0[eess.AS]","DOI":"10.21437\/Interspeech.2024-2016"},{"key":"e_1_3_2_2_9_1","volume-title":"Estimating the reproducibility of psychological science. Science 349, 6251","author":"Collaboration Open\u00a0Science","year":"2015","unstructured":"Open\u00a0Science Collaboration. 2015. Estimating the reproducibility of psychological science. Science 349, 6251 (2015), aac4716."},{"key":"e_1_3_2_2_10_1","unstructured":"Pierre\u00a0Nicolas Durette. 2024. gTTS. https:\/\/github.com\/pndurette\/gTTS."},{"key":"e_1_3_2_2_11_1","volume-title":"Jon Taylor, Thomas B., Liza, James Hush, and Rahul Nair.","author":"Flaqu\u00e9 Aleix\u00a0Conchillo","year":"2024","unstructured":"Aleix\u00a0Conchillo Flaqu\u00e9, Moishe Lettvin, Kwindla\u00a0Hultman Kramer, chadbailey59, Jon Taylor, Thomas B., Liza, James Hush, and Rahul Nair. 2024. pipecat-ai\/pipecat. https:\/\/github.com\/pipecat-ai\/pipecat"},{"key":"e_1_3_2_2_12_1","volume-title":"S3: Social-network Simulation System with Large Language Model-Empowered Agents. arXiv preprint arXiv:2307.14984","author":"Gao Chen","year":"2023","unstructured":"Chen Gao, Xiaochong Lan, Zhihong Lu, Jinzhu Mao, Jinghua Piao, Huandong Wang, Depeng Jin, and Yong Li. 2023. S3: Social-network Simulation System with Large Language Model-Empowered Agents. arXiv preprint arXiv:2307.14984 (2023)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-40415-3_33"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACIIW59127.2023.10388188"},{"key":"e_1_3_2_2_15_1","unstructured":"Nian Li Chen Gao Mingyu Li Yong Li and Qingmin Liao. [n. d.]. EconAgent: Large Language Model-Empowered Agents for Simulating Macroeconomic Activities. ([n. d.])."},{"key":"e_1_3_2_2_16_1","volume-title":"Encouraging divergent thinking in large language models through multi-agent debate. arXiv preprint arXiv:2305.19118","author":"Liang Tian","year":"2023","unstructured":"Tian Liang, Zhiwei He, Wenxiang Jiao, Xing Wang, Yan Wang, Rui Wang, Yujiu Yang, Zhaopeng Tu, and Shuming Shi. 2023. Encouraging divergent thinking in large language models through multi-agent debate. arXiv preprint arXiv:2305.19118 (2023)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"crossref","unstructured":"Birgit Lugrin Catherine Pelachaud and David Traum. 2022. The Handbook on Socially Interactive Agents: 20 Years of Research on Embodied Conversational Agents Intelligent Virtual Agents and Social Robotics Volume 2: Interactivity Platforms Application. ACM.","DOI":"10.1145\/3563659"},{"key":"e_1_3_2_2_18_1","unstructured":"OpenAI. 2024. Hello GPT-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606763"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACIT49673.2020.9208978"},{"key":"e_1_3_2_2_21_1","volume-title":"Multimodal intelligent information presentation","author":"Poggi Isabella","unstructured":"Isabella Poggi, Catherine Pelachaud, Fiorella de Rosis, Valeria Carofiglio, and Berardina De\u00a0Carolis. 2005. Greta. a believable embodied conversational agent. In Multimodal intelligent information presentation. Springer, 3\u201325."},{"key":"e_1_3_2_2_22_1","volume-title":"International Conference on Machine Learning. PMLR, 28492\u201328518","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. In International Conference on Machine Learning. PMLR, 28492\u201328518."},{"key":"e_1_3_2_2_23_1","unstructured":"Basem Rizk. 2019. Evaluation of state of art open-source ASR engines with local inferencing. In Evaluation of State Of Art Open-source ASR Engines with Local Inferencing. Vol.\u00a08."},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642159"},{"key":"e_1_3_2_2_25_1","unstructured":"Unity. 2024. UnityARKitDocumentation. https:\/\/docs.unity3d.com\/Packages\/com.unity.xr.arkit@5.1\/manual\/"},{"key":"e_1_3_2_2_26_1","volume-title":"Humanoid agents: Platform for simulating human-like generative agents. arXiv preprint arXiv:2310.05418","author":"Wang Zhilin","year":"2023","unstructured":"Zhilin Wang, Yu\u00a0Ying Chiu, and Yu\u00a0Cheung Chiu. 2023. Humanoid agents: Platform for simulating human-like generative agents. arXiv preprint arXiv:2310.05418 (2023)."}],"event":{"name":"IVA '24: ACM International Conference on Intelligent Virtual Agents","location":"GLASGOW United Kingdom","acronym":"IVA '24","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Proceedings of the ACM International Conference on Intelligent Virtual Agents"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652988.3696198","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652988.3696198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T17:10:38Z","timestamp":1755882638000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652988.3696198"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,16]]},"references-count":26,"alternative-id":["10.1145\/3652988.3696198","10.1145\/3652988"],"URL":"https:\/\/doi.org\/10.1145\/3652988.3696198","relation":{},"subject":[],"published":{"date-parts":[[2024,9,16]]},"assertion":[{"value":"2024-12-26","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}