{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,16]],"date-time":"2025-10-16T00:55:17Z","timestamp":1760576117576,"version":"build-2065373602"},"publisher-location":"New York, NY, USA","reference-count":22,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3686215.3690152","type":"proceedings-article","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T12:17:01Z","timestamp":1730290621000},"page":"41-45","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["\"Is This It?\": Towards Ecologically Valid Benchmarks for Situated Collaboration"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6283-0590","authenticated-orcid":false,"given":"Dan","family":"Bohus","sequence":"first","affiliation":[{"name":"Microsoft Research, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4972-7027","authenticated-orcid":false,"given":"Sean","family":"Andrist","sequence":"additional","affiliation":[{"name":"Microsoft Research, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7500-5944","authenticated-orcid":false,"given":"Yuwei","family":"Bao","sequence":"additional","affiliation":[{"name":"University of Michigan, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8823-0614","authenticated-orcid":false,"given":"Eric","family":"Horvitz","sequence":"additional","affiliation":[{"name":"Microsoft, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5092-8603","authenticated-orcid":false,"given":"Ann","family":"Paradiso","sequence":"additional","affiliation":[{"name":"Microsoft Research, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,11,4]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Marah Abdin Sam\u00a0Ade Jacobs Ammar\u00a0Ahmad Awan Jyoti Aneja Ahmed Awadallah Hany Awadalla Nguyen Bach Amit Bahree Arash Bakhtiari Jianmin Bao Harkirat Behl Alon Benhaim Misha Bilenko Johan Bjorck S\u00e9bastien Bubeck Qin Cai Martin Cai Caio C\u00e9sar\u00a0Teodoro Mendes Weizhu Chen Vishrav Chaudhary Dong Chen Dongdong Chen Yen-Chun Chen Yi-Ling Chen Parul Chopra Xiyang Dai Allie\u00a0Del Giorno Gustavo de Rosa Matthew Dixon Ronen Eldan Victor Fragoso Dan Iter Mei Gao Min Gao Jianfeng Gao Amit Garg Abhishek Goswami Suriya Gunasekar Emman Haider Junheng Hao Russell\u00a0J. Hewett Jamie Huynh Mojan Javaheripi Xin Jin Piero Kauffmann Nikos Karampatziakis Dongwoo Kim Mahoud Khademi Lev Kurilenko James\u00a0R. Lee Yin\u00a0Tat Lee Yuanzhi Li Yunsheng Li Chen Liang Lars Liden Ce Liu Mengchen Liu Weishung Liu Eric Lin Zeqi Lin Chong Luo Piyush Madan Matt Mazzola Arindam Mitra Hardik Modi Anh Nguyen Brandon Norick Barun Patra Daniel Perez-Becker Thomas Portet Reid Pryzant Heyang Qin Marko Radmilac Corby Rosset Sambudha Roy Olatunji Ruwase Olli Saarikivi Amin Saied Adil Salim Michael Santacroce Shital Shah Ning Shang Hiteshi Sharma Swadheen Shukla Xia Song Masahiro Tanaka Andrea Tupini Xin Wang Lijuan Wang Chunyu Wang Yu Wang Rachel Ward Guanhua Wang Philipp Witte Haiping Wu Michael Wyatt Bin Xiao Can Xu Jiahang Xu Weijian Xu Sonali Yadav Fan Yang Jianwei Yang Ziyi Yang Yifan Yang Donghan Yu Lu Yuan Chengruidong Zhang Cyril Zhang Jianwen Zhang Li\u00a0Lyna Zhang Yi Zhang Yue Zhang Yunan Zhang and Xiren Zhou. 2024. Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone. arxiv:2404.14219\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2404.14219"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.824"},{"key":"e_1_3_2_1_3_1","volume-title":"SIGMA: An Open-Source Interactive System for Mixed-Reality Task Assistance Research. arxiv:2405.13035\u00a0[cs.HC] https:\/\/arxiv.org\/abs\/2405.13035","author":"Bohus Dan","year":"2024","unstructured":"Dan Bohus, Sean Andrist, Nick Saw, Ann Paradiso, Ishani Chakraborty, and Mahdi Rad. 2024. SIGMA: An Open-Source Interactive System for Mixed-Reality Task Assistance Research. arxiv:2405.13035\u00a0[cs.HC] https:\/\/arxiv.org\/abs\/2405.13035"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/VRW62533.2024.00241"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3327396"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3563703.3596625"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Huang Jiangyong","year":"2024","unstructured":"Jiangyong Huang, Silong Yong, Xiaojian Ma, Xiongkun Linghu, Puhao Li, Yan Wang, Qing Li, Song-Chun Zhu, Baoxiong Jia, and Siyuan Huang. 2024. An Embodied Generalist Agent in 3D World. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01765"},{"key":"e_1_3_2_1_9_1","volume-title":"Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","author":"Li Bohao","year":"2023","unstructured":"Bohao Li, Rui Wang, Guangzhi Wang, Yuying Ge, Yixiao Ge, and Ying Shan. 2023. Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125 (2023)."},{"key":"e_1_3_2_1_10_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual Instruction Tuning. In NeurIPS."},{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=IDJx97BC38","author":"Ma Xiaojian","year":"2023","unstructured":"Xiaojian Ma, Silong Yong, Zilong Zheng, Qing Li, Yitao Liang, Song-Chun Zhu, and Siyuan Huang. 2023. SQA3D: Situated Question Answering in 3D Scenes. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=IDJx97BC38"},{"key":"e_1_3_2_1_12_1","volume-title":"Dynamic Mixed Reality Assembly Guidance Using Optical Recognition Methods. Applied Sciences","author":"Maffei A.","year":"2023","unstructured":"A. Maffei, Michela\u00a0Dalle Mura, Fabio\u00a0Marco Monetti, and Eleonora Boffa. 2023. Dynamic Mixed Reality Assembly Guidance Using Optical Recognition Methods. Applied Sciences (2023). https:\/\/api.semanticscholar.org\/CorpusID:256451054"},{"key":"e_1_3_2_1_13_1","volume-title":"OpenEQA: Embodied Question Answering in the Era of Foundation Models. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Majumdar Arjun","year":"2024","unstructured":"Arjun Majumdar, Anurag Ajay, Xiaohan Zhang, Pranav Putta, Sriram Yenamandra, Mikael Henaff, Sneha Silwal, Paul Mcvay, Oleksandr Maksymets, Sergio Arnaud, Karmesh Yadav, Qiyang Li, Ben Newman, Mohit Sharma, Vincent Berges, Shiqi Zhang, Pulkit Agrawal, Yonatan Bisk, Dhruv Batra, Mrinal Kalakrishnan, Franziska Meier, Chris Paxton, Sasha Sax, and Aravind Rajeswaran. 2024. OpenEQA: Embodied Question Answering in the Era of Foundation Models. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_14_1","unstructured":"OpenAI. 2023. GPT-4V(ision) System Card. https:\/\/cdn.openai.com\/papers\/GPTV_System_Card.pdf"},{"key":"e_1_3_2_1_15_1","unstructured":"Rohith Peddi Shivvrat Arya Bharath Challa Likhitha Pallapothula Akshay Vyas Jikai Wang Qifan Zhang Vasundhara Komaragiri Eric Ragan Nicholas Ruozzi Yu Xiang and Vibhav Gogate. 2023. CaptainCook4D: A dataset for understanding errors in procedural activities. arxiv:2312.14556\u00a0[cs.CV]"},{"key":"e_1_3_2_1_16_1","unstructured":"Zhiliang Peng Wenhui Wang Li Dong Yaru Hao Shaohan Huang Shuming Ma and Furu Wei. 2023. Kosmos-2: Grounding Multimodal Large Language Models to the World. arxiv:2306.14824\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2306.14824"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3385378.3385379"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00431"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01271"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01854"},{"key":"e_1_3_2_1_21_1","volume-title":"STAR: A Benchmark for Situated Reasoning in Real-World Videos. In Thirty-fifth Conference on Neural Information Processing Systems (NeurIPS).","author":"Wu Bo","year":"2021","unstructured":"Bo Wu, Shoubin Yu, Tenenbaum Joshua\u00a0B Chen, Zhenfang, and Chuang Gan. 2021. STAR: A Benchmark for Situated Reasoning in Real-World Videos. In Thirty-fifth Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_22_1","volume-title":"MMBench: Is Your Multi-modal Model an All-around Player?arXiv:2307.06281","author":"Bo Li Songyang Zhang Yuanhan Zhang","year":"2023","unstructured":"Yuanhan Zhang Bo Li Songyang Zhang Wangbo Zhao Yike Yuan Jiaqi Wang Conghui He Ziwei Liu Kai Chen Dahua\u00a0Lin Yuan\u00a0Liu, Haodong\u00a0Duan. 2023. MMBench: Is Your Multi-modal Model an All-around Player?arXiv:2307.06281 (2023)."}],"event":{"name":"ICMI '24: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"San Jose Costa Rica","acronym":"ICMI '24"},"container-title":["Companion Proceedings of the 26th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686215.3690152","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3686215.3690152","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T16:21:36Z","timestamp":1760545296000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686215.3690152"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":22,"alternative-id":["10.1145\/3686215.3690152","10.1145\/3686215"],"URL":"https:\/\/doi.org\/10.1145\/3686215.3690152","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}