{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:57:28Z","timestamp":1781539048925,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810869","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"911-920","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["TAVEN: Task-driven Adaptive Viewpoint Exploration for Training-Free 3D Spatial Reasoning and Understanding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0953-4088","authenticated-orcid":false,"given":"Shuyi","family":"Jiang","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3100-1252","authenticated-orcid":false,"given":"Zhihao","family":"Yuan","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Shenzhen, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2329-7014","authenticated-orcid":false,"given":"Na","family":"Zhao","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design, Singapore, Singapore"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katherine Millican Malcolm Reynolds et\u00a0al. 2022. Flamingo: a visual language model for few-shot learning. Advances in neural information processing systems 35 (2022) 23716\u201323736.","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01854"},{"key":"e_1_3_3_1_4_2","unstructured":"Shuai Bai Keqin Chen Xuejing Liu Jialin Wang Wenbin Ge Sibo Song Kai Dang Peng Wang Shijie Wang Jun Tang et\u00a0al. 2025. Qwen2. 5-vl technical report. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2502.13923 (2025)."},{"key":"e_1_3_3_1_5_2","first-page":"65","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372."},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2016.XII.041"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Angel Chang Angela Dai Thomas Funkhouser Maciej Halber Matthias Niessner Manolis Savva Shuran Song Andy Zeng and Yinda Zhang. 2017. Matterport3d: Learning from rgb-d data in indoor environments. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1709.06158 (2017).","DOI":"10.1109\/3DV.2017.00081"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58565-5_13"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"crossref","unstructured":"Fei-Long Chen Du-Zhen Zhang Ming-Lun Han Xiu-Yi Chen Jing Shi Shuang Xu and Bo Xu. 2023. Vlp: A survey on vision-language pre-training. Machine Intelligence Research 20 1 (2023) 38\u201356.","DOI":"10.1007\/s11633-022-1369-5"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02496"},{"key":"e_1_3_3_1_11_2","unstructured":"Yilun Chen Shuai Yang Haifeng Huang Tai Wang Runsen Xu Ruiyuan Lyu Dahua Lin and Jiangmiao Pang. 2024. Grounded 3d-llm with referent tokens. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.10370 (2024)."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00321"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"e_1_3_3_1_14_2","unstructured":"Hyung\u00a0Won Chung Le Hou Shayne Longpre Barret Zoph Yi Tay William Fedus Yunxuan Li Xuezhi Wang Mostafa Dehghani Siddhartha Brahma et\u00a0al. 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research 25 70 (2024) 1\u201353."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.261"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01263"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"crossref","unstructured":"Zhiyang Guo Yingping Huang Xing Hu Hongjian Wei and Baigan Zhao. 2021. A survey on deep learning based approaches for scene understanding in autonomous driving. Electronics 10 4 (2021) 471.","DOI":"10.3390\/electronics10040471"},{"key":"e_1_3_3_1_18_2","unstructured":"Qingrong He Kejun Lin Shizhe Chen Anwen Hu and Qin Jin. 2024. Think-Program-reCtify: 3D Situated Reasoning with Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2404.14705 (2024)."},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"crossref","unstructured":"Yining Hong Haoyu Zhen Peihao Chen Shuhong Zheng Yilun Du Zhenfang Chen and Chuang Gan. 2023. 3d-llm: Injecting the 3d world into large language models. Advances in Neural Information Processing Systems 36 (2023) 20482\u201320494.","DOI":"10.52202\/075280-0900"},{"key":"e_1_3_3_1_20_2","unstructured":"Haifeng Huang Yilun Chen Zehan Wang Rongjie Huang Runsen Xu Tai Wang Luping Liu Xize Cheng Yang Zhao Jiangmiao Pang et\u00a0al. 2023. Chat-Scene: Bridging 3D Scene and Large Language Models with Object Identifiers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2312.08168 (2023)."},{"key":"e_1_3_3_1_21_2","unstructured":"Jiangyong Huang Silong Yong Xiaojian Ma Xiongkun Linghu Puhao Li Yan Wang Qing Li Song-Chun Zhu Baoxiong Jia and Siyuan Huang. 2023. An embodied generalist agent in 3d world. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.12871 (2023)."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Denis Kalkofen Erick Mendez and Dieter Schmalstieg. 2008. Comprehensible visualization for augmented reality. IEEE transactions on visualization and computer graphics 15 2 (2008) 193\u2013204.","DOI":"10.1109\/TVCG.2008.96"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"crossref","unstructured":"Bernhard Kerbl Georgios Kopanas Thomas Leimk\u00fchler and George Drettakis. 2023. 3d gaussian splatting for real-time radiance field rendering. ACM Trans. Graph. 42 4 (2023) 139\u20131.","DOI":"10.1145\/3592433"},{"key":"e_1_3_3_1_24_2","first-page":"19730","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730\u201319742."},{"key":"e_1_3_3_1_25_2","unstructured":"Rong Li Shijie Li Lingdong Kong Xulei Yang and Junwei Liang. 2024. Seeground: See and ground for zero-shot open-vocabulary 3d visual grounding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.04383 (2024)."},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01351"},{"key":"e_1_3_3_1_27_2","first-page":"74","volume-title":"Text summarization branches out","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74\u201381."},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"crossref","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong\u00a0Jae Lee. 2023. Visual instruction tuning. Advances in neural information processing systems 36 (2023) 34892\u201334916.","DOI":"10.52202\/075280-1516"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Pan Lu Swaroop Mishra Tanglin Xia Liang Qiu Kai-Wei Chang Song-Chun Zhu Oyvind Tafjord Peter Clark and Ashwin Kalyan. 2022. Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems 35 (2022) 2507\u20132521.","DOI":"10.52202\/068431-0182"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Muzammal Naseer Salman Khan and Fatih Porikli. 2018. Indoor scene understanding in 2.5\/3d for autonomous agents: A survey. IEEE access 7 (2018) 1859\u20131887.","DOI":"10.1109\/ACCESS.2018.2886133"},{"key":"e_1_3_3_1_32_2","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arxiv:https:\/\/arXiv.org\/abs\/2303.08774\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2303.08774"},{"key":"e_1_3_3_1_33_2","unstructured":"OpenAI. 2023. Introducing ChatGPT. https:\/\/openai.com\/blog\/chatgpt. Accessed: 2025-08-02."},{"key":"e_1_3_3_1_34_2","first-page":"311","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00937"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02495"},{"key":"e_1_3_3_1_37_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_38_2","unstructured":"Simranjit Singh Georgios Pavlakos and Dimitrios Stamoulis. 2024. Evaluating zero-shot gpt-4v performance on 3d visual question answering benchmarks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2405.18831 (2024)."},{"key":"e_1_3_3_1_39_2","unstructured":"Vikram Sundar. 2018. PyRender: A Python Renderer for 3D Models. https:\/\/github.com\/mmatl\/pyrender. Accessed: 2024-04-10."},{"key":"e_1_3_3_1_40_2","unstructured":"Ay\u00e7a Takmaz Elisabetta Fedele Robert\u00a0W Sumner Marc Pollefeys Federico Tombari and Francis Engelmann. 2023. OpenMask3D: Open-Vocabulary 3D Instance Segmentation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.13631 (2023)."},{"key":"e_1_3_3_1_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_3_1_42_2","unstructured":"Xinyi Wang Xun Yang Yanlong Xu Yuchen Wu Zhen Li and Na Zhao. 2025. AffordBot: 3D Fine-grained Embodied Reasoning via Multimodal Large Language Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.10017 (2025)."},{"key":"e_1_3_3_1_43_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32863"},{"key":"e_1_3_3_1_44_2","first-page":"131","volume-title":"European Conference on Computer Vision","author":"Xu Runsen","year":"2024","unstructured":"Runsen Xu, Xiaolong Wang, Tai Wang, Yilun Chen, Jiangmiao Pang, and Dahua Lin. 2024. Pointllm: Empowering large language models to understand point clouds. In European Conference on Computer Vision. Springer, 131\u2013147."},{"key":"e_1_3_3_1_45_2","doi-asserted-by":"crossref","unstructured":"Xu Yan Zhihao Yuan Yuhao Du Yinghong Liao Yao Guo Shuguang Cui and Zhen Li. 2023. Comprehensive visual question answering on point clouds through compositional scene manipulation. IEEE Transactions on Visualization and Computer Graphics (2023).","DOI":"10.1109\/TVCG.2023.3340679"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10610443"},{"key":"e_1_3_3_1_47_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00142"},{"key":"e_1_3_3_1_48_2","unstructured":"Zhihao Yuan Shuyi Jiang Chun-Mei Feng Yaolun Zhang Shuguang Cui Zhen Li and Na Zhao. 2025. Scene-r1: Video-grounded large language models for 3d scene reasoning without 3d annotations. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2506.17545 (2025)."},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"crossref","unstructured":"Zhihao Yuan Yibo Peng Jinke Ren Yinghong Liao Yatong Han Chun-Mei Feng Hengshuang Zhao Guanbin Li Shuguang Cui and Zhen Li. 2025. Empowering Large Language Models with 3D Situation Awareness. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2503.23024 (2025).","DOI":"10.1109\/CVPR52734.2025.01810"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01949"},{"key":"e_1_3_3_1_51_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"e_1_3_3_1_52_2","first-page":"186","volume-title":"European Conference on Computer Vision","author":"Zhang Sha","year":"2024","unstructured":"Sha Zhang, Di Huang, Jiajun Deng, Shixiang Tang, Wanli Ouyang, Tong He, and Yanyong Zhang. 2024. Agent3d-zero: An agent for zero-shot 3d understanding. In European Conference on Computer Vision. Springer, 186\u2013202."},{"key":"e_1_3_3_1_53_2","unstructured":"Yuheng Zhang Zhecan Li Xiaoxia Yang Xingdi Wang Yao Lu Siyu Tang Zhenguo Zhang and Jie Zhou. 2023. GPT-4V(ision) as a Generalist Evaluator for Vision-Language Tasks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.01361 (2023). https:\/\/arxiv.org\/abs\/2311.01361"},{"key":"e_1_3_3_1_54_2","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.10592 (2023)."},{"key":"e_1_3_3_1_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02748"}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:40:19Z","timestamp":1781538019000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810869"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":54,"alternative-id":["10.1145\/3805622.3810869","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810869","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}