{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:56:03Z","timestamp":1781535363934,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/legalcode"}],"funder":[{"name":"National High-Level Young Talent Program","award":["2025HY00260104"],"award-info":[{"award-number":["2025HY00260104"]}]},{"name":"Fundamental Research Funds for Higher Education Institutions allocated to Sun Yat-sen University","award":["25hytd007"],"award-info":[{"award-number":["25hytd007"]}]},{"name":"Guangdong Provincial High-Level Young Talent Program","award":["2025HYSPT0707"],"award-info":[{"award-number":["2025HYSPT0707"]}]},{"name":"Tuoyuna Grant","award":["HT-99982025-0564"],"award-info":[{"award-number":["HT-99982025-0564"]}]},{"name":"Faculty Start-up Research Fund","award":["67000-12255002"],"award-info":[{"award-number":["67000-12255002"]}]},{"name":"Huawei Strategic Research Institute Talent Fund"},{"name":"Key Development Project of the Artificial Intelligence Institute of Sun Yat-sen University","award":["2025RGZN009"],"award-info":[{"award-number":["2025RGZN009"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810797","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"558-566","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["OOWM: Structuring Embodied Reasoning and Planning via Object-Oriented Programmatic World Modeling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6762-4617","authenticated-orcid":false,"given":"Hongyu","family":"Chen","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2248-3755","authenticated-orcid":false,"given":"Liang","family":"Lin","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China; X-Era AI Lab, Guangzhou, China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7760-1339","authenticated-orcid":false,"given":"Guangrun","family":"Wang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China; X-Era AI Lab, Guangzhou, China and Guangdong Key Laboratory of Big Data Analysis and Processing, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Charles Ashbacher. 2004. \"The Unified Modeling Language Reference Manual Second Edition\" by James Rumbaugh. J. Object Technol. 3 10 (2004) 193\u2013195.","DOI":"10.5381\/jot.2004.3.10.r1"},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29720"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0399"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"crossref","unstructured":"Kevin Black Noah Brown Danny Driess Adnan Esmail Michael Equi Chelsea Finn Niccolo Fusai Lachy Groom Karol Hausman Brian Ichter et\u00a0al. 2024. \u03c00: A Vision-Language-Action Flow Model for General Robot Control. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.24164 (2024).","DOI":"10.15607\/RSS.2025.XXI.010"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"crossref","unstructured":"Anthony Brohan Noah Brown Justice Carbajal Yevgen Chebotar Joseph Dabis Chelsea Finn Keerthana Gopalakrishnan Karol Hausman Alex Herzog Jasmine Hsu et\u00a0al. 2022. Rt-1: Robotics transformer for real-world control at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2212.06817 (2022).","DOI":"10.15607\/RSS.2023.XIX.025"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Beiqi Chen Shuai Shao Haitang Feng Jianhuang Lai Jianlou Si and Guangcong Wang. 2025. Style4D-Bench: A Benchmark Suite for 4D Stylization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.19243 (2025).","DOI":"10.1609\/aaai.v40i4.37266"},{"key":"e_1_3_3_1_8_2","volume-title":"RADAR: Benchmarking Vision-Language-Action Generalization via Real-World Dynamics, Spatial-Physical Intelligence, and Autonomous Evaluation","author":"Chen Yuhao","year":"2026","unstructured":"Yuhao Chen, Zhihao Zhan, Xiaoxin Lin, Zijian Song, Hao Liu, Qinhan Lyu, Yubo Zu, Xiao Chen, Zhiyuan Liu, Tao Pu, Tianshui Chen, Keze Wang, Liang Lin, and Guangrun Wang. 2026. RADAR: Benchmarking Vision-Language-Action Generalization via Real-World Dynamics, Spatial-Physical Intelligence, and Autonomous Evaluation. Technical Report. Sun Yat-sen University. Technical Report."},{"key":"e_1_3_3_1_9_2","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et\u00a0al. 2024. Expanding Performance Boundaries of Open-Source Multimodal Models with Model Data and Test-Time Scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05271 (2024)."},{"key":"e_1_3_3_1_10_2","unstructured":"Kun Chu Xufeng Zhao Cornelius Weber and Stefan Wermter. 2025. LLM+MAP: Bimanual Robot Task Planning using Large Language Models and Planning Domain Definition Language. CoRR abs\/2503.17309 (2025)."},{"key":"e_1_3_3_1_11_2","volume-title":"ICLR","author":"Creswell Antonia","year":"2023","unstructured":"Antonia Creswell, Murray Shanahan, and Irina Higgins. 2023. Selection-Inference: Exploiting Large Language Models for Interpretable Logical Reasoning. In ICLR. OpenReview.net."},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.15607\/RSS.2024.XX.034"},{"key":"e_1_3_3_1_13_2","unstructured":"Physical Intelligence Kevin Black Noah Brown James Darpinian Karan Dhabalia Danny Driess Adnan Esmail Michael Equi Chelsea Finn Niccolo Fusai et\u00a0al. [n. d.]. \u03c0 0. 5: a vision-language-action model with open-world generalization 2025. URL https:\/\/arxiv. org\/abs\/2504.16054 1 2 ([n. d.]) 3."},{"key":"e_1_3_3_1_14_2","volume-title":"NeurIPS","author":"Kojima Takeshi","year":"2022","unstructured":"Takeshi Kojima, Shixiang\u00a0Shane Gu, Machel Reid, Yutaka Matsuo, and Yusuke Iwasawa. 2022. Large Language Models are Zero-Shot Reasoners. In NeurIPS."},{"key":"e_1_3_3_1_15_2","unstructured":"Weiqi Li Quande Zhang Ruifeng Zhai Liang Lin and Guangrun Wang. 2025. VLA Models Are More Generalizable Than You Think: Revisiting Physical and Spatial Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2512.02902 (2025)."},{"key":"e_1_3_3_1_16_2","unstructured":"Xiao Li Jiaqi Zhang Shuxiang Zhang Tianshui Chen Liang Lin and Guangrun Wang. 2025. In-Situ Tweedie Discrete Diffusion Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2510.01047 (2025)."},{"key":"e_1_3_3_1_17_2","volume-title":"NeurIPS","author":"Ouyang Long","year":"2022","unstructured":"Long Ouyang, Jeffrey Wu, Xu Jiang, Diogo Almeida, Carroll\u00a0L. Wainwright, Pamela Mishkin, Chong Zhang, Sandhini Agarwal, Katarina Slama, Alex Ray, John Schulman, Jacob Hilton, Fraser Kelton, Luke Miller, Maddie Simens, Amanda Askell, Peter Welinder, Paul\u00a0F. Christiano, Jan Leike, and Ryan Lowe. 2022. Training language models to follow instructions with human feedback. In NeurIPS."},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.248"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206537"},{"key":"e_1_3_3_1_20_2","unstructured":"Zhihong Shao Peiyi Wang Qihao Zhu Runxin Xu Junxiao Song Mingchuan Zhang Y.\u00a0K. Li Y. Wu and Daya Guo. 2024. DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models. CoRR abs\/2402.03300 (2024)."},{"key":"e_1_3_3_1_21_2","unstructured":"Haozhan Shen Peng Liu Jingcheng Li Chunxin Fang Yibo Ma Jiajia Liao Qiaoli Shen Zilun Zhang Kangjia Zhao Qianqian Zhang Ruochen Xu and Tiancheng Zhao. 2025. VLM-R1: A Stable and Generalizable R1-style Large Vision-Language Model. CoRR abs\/2504.07615 (2025)."},{"key":"e_1_3_3_1_22_2","unstructured":"Zijian Song Sihan Qin Tianshui Chen Liang Lin and Guangrun Wang. 2025. Physical autoregressive model for robotic manipulation without action pretraining. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.09822 (2025)."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.475"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.147"},{"key":"e_1_3_3_1_25_2","unstructured":"Weiyun Wang Zhe Chen Wenhai Wang Yue Cao Yangzhou Liu Zhangwei Gao Jinguo Zhu Xizhou Zhu Lewei Lu Yu Qiao and Jifeng Dai. 2024. Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.10442 (2024)."},{"key":"e_1_3_3_1_26_2","volume-title":"ICLR","author":"Wang Xuezhi","year":"2023","unstructured":"Xuezhi Wang, Jason Wei, Dale Schuurmans, Quoc\u00a0V. Le, Ed\u00a0H. Chi, Sharan Narang, Aakanksha Chowdhery, and Denny Zhou. 2023. Self-Consistency Improves Chain of Thought Reasoning in Language Models. In ICLR. OpenReview.net."},{"key":"e_1_3_3_1_27_2","volume-title":"NeurIPS","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed\u00a0H. Chi, Quoc\u00a0V. Le, and Denny Zhou. 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In NeurIPS."},{"key":"e_1_3_3_1_28_2","unstructured":"Lintao Xiang Xinkai Chen Jianhuang Lai and Guangcong Wang. 2025. Distilled-3DGS: distilled 3D Gaussian splatting. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.14037 (2025)."},{"key":"e_1_3_3_1_29_2","unstructured":"Siheng Xiong Jieyu Zhou Zhangding Liu and Yusen Su. 2025. SymPlanner: Deliberate Planning in Language Models with Symbolic Representation. CoRR abs\/2505.01479 (2025)."},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.720"},{"key":"e_1_3_3_1_31_2","unstructured":"Yuanfeng Xu Yuhao Chen Liang Lin and Guangrun Wang. 2026. Bridging the Discrete-Continuous Gap: Unified Multimodal Generation via Coupled Manifold Discrete Absorbing Diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2601.04056 (2026)."},{"key":"e_1_3_3_1_32_2","volume-title":"NeurIPS","author":"Yao Shunyu","year":"2023","unstructured":"Shunyu Yao, Dian Yu, Jeffrey Zhao, Izhak Shafran, Tom Griffiths, Yuan Cao, and Karthik Narasimhan. 2023. Tree of Thoughts: Deliberate Problem Solving with Large Language Models. In NeurIPS."},{"key":"e_1_3_3_1_33_2","unstructured":"Zheng Yuan Hongyi Yuan Chuanqi Tan Wei Wang Songfang Huang and Fei Huang. 2023. RRHF: Rank Responses to Align Language Models with Human Feedback without tears. CoRR abs\/2304.05302 (2023)."},{"key":"e_1_3_3_1_34_2","volume-title":"NeurIPS","author":"Zelikman Eric","year":"2022","unstructured":"Eric Zelikman, Yuhuai Wu, Jesse Mu, and Noah\u00a0D. Goodman. 2022. STaR: Bootstrapping Reasoning With Reasoning. In NeurIPS."},{"key":"e_1_3_3_1_35_2","unstructured":"Zhihao Zhan Yuhao Chen Jiaying Zhou Qinhan Lv Hao Liu Keze Wang Liang Lin and Guangrun Wang. 2026. Stable Language Guidance for Vision-Language-Action Models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2601.04052 (2026)."},{"key":"e_1_3_3_1_36_2","unstructured":"Zhihao Zhan Jiaying Zhou Likui Zhang Qinhan Lv Hao Liu Jusheng Zhang Weizheng Li Ziliang Chen Tianshui Chen Keze Wang et\u00a0al. 2025. \\(\\mathcal {E}_0\\): Enhancing Generalization and Fine-Grained Control in VLA Models via Continuized Discrete Diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2511.21542 (2025)."},{"key":"e_1_3_3_1_37_2","unstructured":"Li Zhang. 2024. Structured Event Reasoning with Large Language Models. CoRR abs\/2408.16098 (2024)."},{"key":"e_1_3_3_1_38_2","unstructured":"Zhuosheng Zhang Aston Zhang Mu Li Hai Zhao George Karypis and Alex Smola. 2024. Multimodal Chain-of-Thought Reasoning in Language Models. Trans. Mach. Learn. Res. 2024 (2024)."},{"key":"e_1_3_3_1_39_2","volume-title":"ICLR","author":"Zhou Denny","year":"2023","unstructured":"Denny Zhou, Nathanael Sch\u00e4rli, Le Hou, Jason Wei, Nathan Scales, Xuezhi Wang, Dale Schuurmans, Claire Cui, Olivier Bousquet, Quoc\u00a0V. Le, and Ed\u00a0H. Chi. 2023. Least-to-Most Prompting Enables Complex Reasoning in Large Language Models. In ICLR. OpenReview.net."},{"key":"e_1_3_3_1_40_2","first-page":"2165","volume-title":"Conference on Robot Learning","author":"Zitkovich Brianna","year":"2023","unstructured":"Brianna Zitkovich, Tianhe Yu, Sichun Xu, Peng Xu, Ted Xiao, Fei Xia, Jialin Wu, Paul Wohlhart, Stefan Welker, Ayzaan Wahid, et\u00a0al. 2023. Rt-2: Vision-language-action models transfer web knowledge to robotic control. In Conference on Robot Learning. PMLR, 2165\u20132183."},{"key":"e_1_3_3_1_41_2","unstructured":"Heqing Zou Tianze Luo Guiyang Xie Fengmao Lv Guangcong Wang Junyang Chen Zhuochen Wang Hansheng Zhang Huaijian Zhang et\u00a0al. 2024. From seconds to hours: Reviewing multimodal large language models on comprehensive long video understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.18938 (2024)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:46:42Z","timestamp":1781534802000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810797"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":40,"alternative-id":["10.1145\/3805622.3810797","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810797","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}