{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T21:05:12Z","timestamp":1778274312393,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T00:00:00Z","timestamp":1745539200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,26]]},"DOI":"10.1145\/3706598.3714317","type":"proceedings-article","created":{"date-parts":[[2025,4,24]],"date-time":"2025-04-24T04:45:58Z","timestamp":1745469958000},"page":"1-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["Persistent Assistant: Seamless Everyday AI Interactions via Intent Grounding and Multimodal Feedback"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4521-2766","authenticated-orcid":false,"given":"Hyunsung","family":"Cho","sequence":"first","affiliation":[{"name":"Meta Reality Labs, Redmond, Washington, USA and Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8083-7692","authenticated-orcid":false,"given":"Jacqui","family":"Fashimpaur","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3534-890X","authenticated-orcid":false,"given":"Naveen","family":"Sendhilnathan","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1724-6915","authenticated-orcid":false,"given":"Jonathan","family":"Browder","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0809-9696","authenticated-orcid":false,"given":"David","family":"Lindlbauer","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, Pennsylvania, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8646-5076","authenticated-orcid":false,"given":"Tanya R.","family":"Jonker","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6174-2089","authenticated-orcid":false,"given":"Kashyap","family":"Todi","sequence":"additional","affiliation":[{"name":"Meta Reality Labs, Redmond, Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,25]]},"reference":[{"key":"e_1_3_3_3_2_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332165.3347884"},{"key":"e_1_3_3_3_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3501940"},{"key":"e_1_3_3_3_4_2","first-page":"262","volume-title":"Proceedings of the 7th annual conference on Computer graphics and interactive techniques","author":"Bolt Richard\u00a0A","year":"1980","unstructured":"Richard\u00a0A Bolt. 1980. \u201cPut-that-there\u201d Voice and gesture at the graphics interface. In Proceedings of the 7th annual conference on Computer graphics and interactive techniques. 262\u2013270."},{"key":"e_1_3_3_3_5_2","first-page":"201","volume-title":"Social and cognitive approaches to interpersonal communication","author":"Brennan Susan\u00a0E","year":"2014","unstructured":"Susan\u00a0E Brennan. 2014. The grounding problem in conversations with and through computers. In Social and cognitive approaches to interpersonal communication. Psychology Press, 201\u2013225."},{"key":"e_1_3_3_3_6_2","series-title":"(AUIC \u201904)","first-page":"15","volume-title":"Proceedings of the Fifth Conference on Australasian User Interface - Volume 28","author":"Brewster Stephen","year":"2004","unstructured":"Stephen Brewster and Lorna\u00a0M. Brown. 2004. Tactons: structured tactile messages for non-visual information display. In Proceedings of the Fifth Conference on Australasian User Interface - Volume 28 (Dunedin, New Zealand) (AUIC \u201904). Australian Computer Society, Inc., AUS, 15\u201323."},{"key":"e_1_3_3_3_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/2858036.2858046"},{"key":"e_1_3_3_3_8_2","doi-asserted-by":"crossref","unstructured":"Kaifei Chen Jonathan F\u00fcrst John Kolb Hyung-Sin Kim Xin Jin David\u00a0E Culler and Randy\u00a0H Katz. 2018. Snaplink: Fast and accurate vision-based appliance control in large commercial buildings. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 1 4 (2018) 1\u201327.","DOI":"10.1145\/3161173"},{"key":"e_1_3_3_3_9_2","doi-asserted-by":"publisher","unstructured":"Hyunsung Cho Naveen Sendhilnathan Michael Nebeling Tianyi Wang Purnima Padmanabhan Jonathan Browder David Lindlbauer Tanya\u00a0R. Jonker and Kashyap Todi. 2024. SonoHaptics: An Audio-Haptic Cursor for Gaze-Based Object Selection in XR(UIST \u201924). Association for Computing Machinery New York NY USA. 10.1145\/3654777.3676384","DOI":"10.1145\/3654777.3676384"},{"key":"e_1_3_3_3_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/2858036.2858177"},{"key":"e_1_3_3_3_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462806"},{"key":"e_1_3_3_3_12_2","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3481001"},{"key":"e_1_3_3_3_13_2","unstructured":"Mustafa\u00a0Doga Dogan Eric\u00a0J Gonzalez Andrea Colaco Karan Ahuja Ruofei Du Johnny Lee Mar Gonzalez-Franco and David Kim. 2024. Augmented Object Intelligence: Making the Analog World Interactable with XR-Objects. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.13274 (2024)."},{"key":"e_1_3_3_3_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580870"},{"key":"e_1_3_3_3_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/2858036.2858308"},{"key":"e_1_3_3_3_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136815"},{"key":"e_1_3_3_3_17_2","doi-asserted-by":"crossref","unstructured":"Taesik Gong Hyunsung Cho Bowon Lee and Sung-Ju Lee. 2019. Knocker: Vibroacoustic-based object recognition with smartphones. Proceedings of the ACM on interactive mobile wearable and ubiquitous technologies 3 3 (2019) 1\u201321.","DOI":"10.1145\/3351240"},{"key":"e_1_3_3_3_18_2","doi-asserted-by":"publisher","DOI":"10.1145\/3441000.3441012"},{"key":"e_1_3_3_3_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3517684"},{"key":"e_1_3_3_3_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-07788-8_11"},{"key":"e_1_3_3_3_21_2","doi-asserted-by":"crossref","unstructured":"Violet\u00a0Yinuo Han Hyunsung Cho Kiyosu Maeda Alexandra Ion and David Lindlbauer. 2023. BlendMR: A Computational Method to Create Ambient Mixed Reality Interfaces. Proceedings of the ACM on Human-Computer Interaction 7 ISS (2023) 217\u2013241.","DOI":"10.1145\/3626472"},{"key":"e_1_3_3_3_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR52148.2021.00060"},{"key":"e_1_3_3_3_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/2494091.2494185"},{"key":"e_1_3_3_3_24_2","doi-asserted-by":"publisher","DOI":"10.1145\/1322192.1322222"},{"key":"e_1_3_3_3_25_2","doi-asserted-by":"publisher","DOI":"10.1007\/11941354_28"},{"key":"e_1_3_3_3_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3174052"},{"key":"e_1_3_3_3_27_2","unstructured":"Robert Konrad Nitish Padmanaban J\u00a0Gabriel Buckmaster Kevin\u00a0C Boyle and Gordon Wetzstein. 2024. Gazegpt: Augmenting human capabilities using gaze-contingent contextual ai for smart eyewear. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.17217 (2024)."},{"key":"e_1_3_3_3_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173655"},{"key":"e_1_3_3_3_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/2984511.2984582"},{"key":"e_1_3_3_3_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3462244.3479902"},{"key":"e_1_3_3_3_31_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642230"},{"key":"e_1_3_3_3_32_2","unstructured":"Mina Lee Megha Srivastava Amelia Hardy John Thickstun Esin Durmus Ashwin Paranjape Ines Gerard-Ursin Xiang\u00a0Lisa Li Faisal Ladhak Frieda Rong et\u00a0al. 2022. Evaluating human-language model interaction. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.09746 (2022)."},{"key":"e_1_3_3_3_33_2","doi-asserted-by":"publisher","DOI":"10.1109\/RFID.2019.8719279"},{"key":"e_1_3_3_3_34_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642068"},{"key":"e_1_3_3_3_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642065"},{"key":"e_1_3_3_3_36_2","doi-asserted-by":"crossref","unstructured":"Yang Liu Chengdong Lin and Zhenjiang Li. 2021. WR-Hand: Wearable armband can track user\u2019s hand. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 5 3 (2021) 1\u201327.","DOI":"10.1145\/3478112"},{"key":"e_1_3_3_3_37_2","doi-asserted-by":"publisher","DOI":"10.1145\/2702123.2702128"},{"key":"e_1_3_3_3_38_2","doi-asserted-by":"crossref","unstructured":"Mathias\u00a0N Lystb\u00e6k Ken Pfeuffer Jens Emil\u00a0Sloth Gr\u00f8nb\u00e6k and Hans Gellersen. 2022. Exploring gaze for assisting freehand selection-based text entry in ar. Proceedings of the ACM on Human-Computer Interaction 6 ETRA (2022) 1\u201316.","DOI":"10.1145\/3530882"},{"key":"e_1_3_3_3_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3015783.3015788"},{"key":"e_1_3_3_3_40_2","doi-asserted-by":"crossref","unstructured":"Richard\u00a0E Mayer and Roxana Moreno. 2003. Nine ways to reduce cognitive load in multimedia learning. Educational psychologist 38 1 (2003) 43\u201352.","DOI":"10.1207\/S15326985EP3801_6"},{"key":"e_1_3_3_3_41_2","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376479"},{"key":"e_1_3_3_3_42_2","unstructured":"Meta. 2024. https:\/\/www.meta.com\/blog\/quest\/surface-emg-wristband-electromyography-human-computer-interaction-hci\/?srsltid=AfmBOopWgP7BHvzY22cES0E15K1IROuOVLxk1zA6ztCjII2hdv4SdiyI"},{"key":"e_1_3_3_3_43_2","doi-asserted-by":"publisher","DOI":"10.1109\/IE.2016.42"},{"key":"e_1_3_3_3_44_2","doi-asserted-by":"publisher","DOI":"10.1145\/3152832.3152863"},{"key":"e_1_3_3_3_45_2","doi-asserted-by":"publisher","DOI":"10.1145\/3526114.3558658"},{"key":"e_1_3_3_3_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/WHC.2019.8816098"},{"key":"e_1_3_3_3_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/2371574.2371624"},{"key":"e_1_3_3_3_48_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR.2014.6948411"},{"key":"e_1_3_3_3_49_2","unstructured":"Nikhila Ravi Valentin Gabeur Yuan-Ting Hu Ronghang Hu Chaitanya Ryali Tengyu Ma Haitham Khedr Roman R\u00e4dle Chloe Rolland Laura Gustafson Eric Mintun Junting Pan Kalyan\u00a0Vasudev Alwala Nicolas Carion Chao-Yuan Wu Ross Girshick Piotr Doll\u00e1r and Christoph Feichtenhofer. 2024. SAM 2: Segment Anything in Images and Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.00714 (2024). https:\/\/arxiv.org\/abs\/2408.00714"},{"key":"e_1_3_3_3_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3239092.3265968"},{"key":"e_1_3_3_3_51_2","doi-asserted-by":"publisher","DOI":"10.1145\/2470654.2481352"},{"key":"e_1_3_3_3_52_2","doi-asserted-by":"publisher","DOI":"10.1109\/WHC.2015.7177722"},{"key":"e_1_3_3_3_53_2","doi-asserted-by":"crossref","unstructured":"Jannis Strecker Khakim Akhunov Federico Carbone Kimberly Garc\u00eda Kenan Bekta\u015f Andres Gomez Simon Mayer and Kasim\u00a0Sinan Yildirim. 2023. MR Object Identification and Interaction: Fusing Object Situation Information from Heterogeneous Sources. Proceedings of the ACM on Interactive Mobile Wearable and Ubiquitous Technologies 7 3 (2023) 1\u201326.","DOI":"10.1145\/3610879"},{"key":"e_1_3_3_3_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676373"},{"key":"e_1_3_3_3_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISMAR62088.2024.00111"},{"key":"e_1_3_3_3_56_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR.2019.8798036"},{"key":"e_1_3_3_3_57_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642376"},{"key":"e_1_3_3_3_58_2","doi-asserted-by":"publisher","DOI":"10.1145\/3654777.3676416"},{"key":"e_1_3_3_3_59_2","doi-asserted-by":"crossref","unstructured":"Adam\u00a0S Williams Jason Garcia and Francisco Ortega. 2020. Understanding multimodal user gesture and speech behavior for object manipulation in augmented reality using elicitation. IEEE Transactions on Visualization and Computer Graphics 26 12 (2020) 3479\u20133489.","DOI":"10.1109\/TVCG.2020.3023566"},{"key":"e_1_3_3_3_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025828"},{"key":"e_1_3_3_3_61_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642735"},{"key":"e_1_3_3_3_62_2","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581500"},{"key":"e_1_3_3_3_63_2","unstructured":"Cheng-Yen Yang Hsiang-Wei Huang Wenhao Chai Zhongyu Jiang and Jenq-Neng Hwang. 2024. SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory. arxiv:https:\/\/arXiv.org\/abs\/2411.11922\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2411.11922"},{"key":"e_1_3_3_3_64_2","doi-asserted-by":"publisher","DOI":"10.1145\/3332165.3347954"},{"key":"e_1_3_3_3_65_2","doi-asserted-by":"publisher","unstructured":"Eunhye Youn Taejun Kim and Geehyuk Lee. 2024. WristMenu with Tactons: An Eyes- and Ears-Free Menu with Tactons Describing Menu Items in the Wrist Rotation Space. International Journal of Human\u2013Computer Interaction 40 9 (2024) 2314\u20132325. 10.1080\/10447318.2022.2159780 arXiv:10.1080\/10447318.2022.2159780","DOI":"10.1080\/10447318.2022.2159780"},{"key":"e_1_3_3_3_66_2","doi-asserted-by":"publisher","unstructured":"Eunhye Youn Sangyoon Lee Sunbum Kim Youngbo\u00a0Aram Shim Liwei Chan and Geehyuk Lee. 2021. WristDial: An Eyes-Free Integer-Value Input Method by Quantizing the Wrist Rotation. International Journal of Human\u2013Computer Interaction 37 17 (2021) 1607\u20131624. 10.1080\/10447318.2021.1898848 arXiv:10.1080\/10447318.2021.1898848","DOI":"10.1080\/10447318.2021.1898848"},{"key":"e_1_3_3_3_67_2","doi-asserted-by":"publisher","DOI":"10.1109\/WHC.2019.8816075"},{"key":"e_1_3_3_3_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/302979.303053"},{"key":"e_1_3_3_3_69_2","doi-asserted-by":"publisher","unstructured":"Kaixing Zhao Marcos Serrano Bernard Oriola and Christophe Jouffrais. 2020. VibHand: On-Hand Vibrotactile Interface Enhancing Non-Visual Exploration of Digital Graphics. Proc. ACM Hum.-Comput. Interact. 4 ISS Article 207 (nov 2020) 19\u00a0pages. 10.1145\/3427335","DOI":"10.1145\/3427335"},{"key":"e_1_3_3_3_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/3613904.3642450"}],"event":{"name":"CHI 2025: CHI Conference on Human Factors in Computing Systems","location":"Yokohama Japan","acronym":"CHI '25","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714317","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3706598.3714317","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T05:46:39Z","timestamp":1751607999000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3706598.3714317"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,25]]},"references-count":69,"alternative-id":["10.1145\/3706598.3714317","10.1145\/3706598"],"URL":"https:\/\/doi.org\/10.1145\/3706598.3714317","relation":{},"subject":[],"published":{"date-parts":[[2025,4,25]]},"assertion":[{"value":"2025-04-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}