{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:22:58Z","timestamp":1777656178980,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Project of New Generation Artificial Intelligence of China","award":["2018AAA0102504"],"award-info":[{"award-number":["2018AAA0102504"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612351","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"2838-2846","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Generating Explanations for Embodied Action Decision from Visual Observation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2396-0824","authenticated-orcid":false,"given":"Xiaohan","family":"Wang","sequence":"first","affiliation":[{"name":"Institute of Artificial Intelligence and Robotics, Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1048-5115","authenticated-orcid":false,"given":"Yuehu","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence and Robotics, Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0895-1076","authenticated-orcid":false,"given":"Xinhang","family":"Song","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5537-5869","authenticated-orcid":false,"given":"Beibei","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence and Robotics, Xi'an Jiaotong University, Xi'an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1596-4326","authenticated-orcid":false,"given":"Shuqiang","family":"Jiang","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"e_1_3_2_1_2_1","volume-title":"Interpnet: Neural introspection for interpretable deep learning. arXiv preprint arXiv:1710.09511","author":"Barratt Shane","year":"2017","unstructured":"Shane Barratt. 2017. Interpnet: Neural introspection for interpretable deep learning. arXiv preprint arXiv:1710.09511 (2017)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the AutoML workshop at ICML","volume":"2014","author":"Biran Or","year":"2014","unstructured":"Or Biran and Kathleen McKeown. 2014. Justification narratives for individual classifications. In Proceedings of the AutoML workshop at ICML, Vol. 2014. 1--7."},{"key":"e_1_3_2_1_4_1","volume-title":"Matterport3d: Learning from rgb-d data in indoor environments. arXiv preprint arXiv:1709.06158","author":"Chang Angel","year":"2017","unstructured":"Angel Chang, Angela Dai, Thomas Funkhouser, Maciej Halber, Matthias Niessner, Manolis Savva, Shuran Song, Andy Zeng, and Yinda Zhang. 2017. Matterport3d: Learning from rgb-d data in indoor environments. arXiv preprint arXiv:1709.06158 (2017)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Matt Deitke Winson Han Alvaro Herrasti Aniruddha Kembhavi Eric Kolve Roozbeh Mottaghi Jordi Salvador Dustin Schwenk Eli VanderBilt Matthew Wallingford Luca Weihs Mark Yatskar and Ali Farhadi. 2020. RoboTHOR: An Open Simulation-to-Real Embodied AI Platform. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00323"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350943"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_1"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_17"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.68"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/2566972.2566993"},{"key":"e_1_3_2_1_14_1","volume-title":"Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907","author":"Kipf Thomas N","year":"2016","unstructured":"Thomas N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_1_15_1","volume-title":"AI2-THOR: An Interactive 3D Environment for Visual AI. arXiv","author":"Kolve Eric","year":"2017","unstructured":"Eric Kolve, Roozbeh Mottaghi, Winson Han, Eli VanderBilt, Luca Weihs, Alvaro Herrasti, Daniel Gordon, Yuke Zhu, Abhinav Gupta, and Ali Farhadi. 2017. AI2-THOR: An Interactive 3D Environment for Visual AI. arXiv (2017)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351017"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475575"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"e_1_3_2_1_23_1","volume-title":"MINOS: Multimodal Indoor Simulator for Navigation in Complex Environments. arXiv:1712.03931","author":"Savva Manolis","year":"2017","unstructured":"Manolis Savva, Angel X. Chang, Alexey Dosovitskiy, Thomas Funkhouser, and Vladlen Koltun. 2017. MINOS: Multimodal Indoor Simulator for Navigation in Complex Environments. arXiv:1712.03931 (2017)."},{"key":"e_1_3_2_1_24_1","unstructured":"Bokui Shen Fei Xia Chengshu Li Roberto Mart\u00edn-Mart\u00edn Linxi Fan Guanzhi Wang Shyamal Buch Claudia D'Arpino Sanjana Srivastava Lyne P Tchapmi et al. 2020. iGibson a Simulation Environment for Interactive Tasks in Large Realistic Scenes. arXiv preprint arXiv:2012.02924 (2020)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2964299"},{"key":"e_1_3_2_1_29_1","unstructured":"Peter Welinder Steve Branson Takeshi Mita Catherine Wah Florian Schroff Serge Belongie and Pietro Perona. 2010. Caltech-UCSD birds 200. (2010)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/LRA.2020.2965078"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00945"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00974"},{"key":"e_1_3_2_1_34_1","volume-title":"Tel Aviv, Israel","volume":"320","author":"Zhang Sixian","year":"2022","unstructured":"Sixian Zhang, Weijie Li, Xinhang Song, Yubing Bai, and Shuqiang Jiang. [n.,d.]. Generative Meta-Adversarial Network for Unseen Object Navigation. In Computer Vision - ECCV 2022 - 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXIX (Lecture Notes in Computer Science, Vol. 13699). 301--320."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01485"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01039"},{"key":"e_1_3_2_1_37_1","volume-title":"Places: A 10 million image database for scene recognition","author":"Zhou Bolei","year":"2017","unstructured":"Bolei Zhou, Agata Lapedriza, Aditya Khosla, Aude Oliva, and Antonio Torralba. 2017. Places: A 10 million image database for scene recognition. IEEE transactions on pattern analysis and machine intelligence, Vol. 40, 6 (2017), 1452--1464."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612351","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612351","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:02Z","timestamp":1755820862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612351"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":37,"alternative-id":["10.1145\/3581783.3612351","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612351","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}