{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:42:51Z","timestamp":1776886971944,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2020AAA0109703"],"award-info":[{"award-number":["2020AAA0109703"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076167"],"award-info":[{"award-number":["62076167"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the Yuxiu Innovation Project of NCUT","award":["2024NCUTYXCX102"],"award-info":[{"award-number":["2024NCUTYXCX102"]}]},{"name":"Joint Fund Key Program of the National Natural Science Foundation of China","award":["U23B2029"],"award-info":[{"award-number":["U23B2029"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681479","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"4814-4822","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["An Entailment Tree Generation Approach for Multimodal Multi-Hop Question Answering with Mixture-of-Experts and Iterative Feedback Mechanism"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5368-339X","authenticated-orcid":false,"given":"Qing","family":"Zhang","sequence":"first","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8261-3566","authenticated-orcid":false,"given":"Haocheng","family":"Lv","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5953-4566","authenticated-orcid":false,"given":"Jie","family":"Liu","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7035-7423","authenticated-orcid":false,"given":"Zhiyun","family":"Chen","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2244-3764","authenticated-orcid":false,"given":"Jianyong","family":"Duan","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0896-080X","authenticated-orcid":false,"given":"Hao","family":"Wang","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3068-735X","authenticated-orcid":false,"given":"Li","family":"He","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5018-0270","authenticated-orcid":false,"given":"Mingying","family":"Xu","sequence":"additional","affiliation":[{"name":"North China University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_2_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners 1877--1901 pages."},{"key":"e_1_3_2_1_3_1","volume-title":"Webqa: Multihop and multimodal qa, 16495--16504 pages.","author":"Chang Yingshan","year":"2022","unstructured":"Yingshan Chang, Mridu Narang, Hisami Suzuki, Guihong Cao, Jianfeng Gao, and Yonatan Bisk. 2022. Webqa: Multihop and multimodal qa, 16495--16504 pages."},{"key":"e_1_3_2_1_4_1","volume-title":"All you may need for vqa are image captions. arXiv preprint arXiv:2205.01883","author":"Changpinyo Soravit","year":"2022","unstructured":"Soravit Changpinyo, Doron Kukliansky, Idan Szpektor, Xi Chen, Nan Ding, and Radu Soricut. 2022. All you may need for vqa are image captions. arXiv preprint arXiv:2205.01883 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928","author":"Chen Wenhu","year":"2022","unstructured":"Wenhu Chen, Hexiang Hu, Xi Chen, Pat Verga, and William W Cohen. 2022. Murag: Multimodal retrieval-augmented generator for open question answering over images and text. arXiv preprint arXiv:2210.02928 (2022)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.585"},{"key":"e_1_3_2_1_7_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00501"},{"key":"e_1_3_2_1_9_1","volume-title":"Sparsely activated mixture-of-experts are robust multi-task learners. arXiv preprint arXiv:2204.07689","author":"Gupta Shashank","year":"2022","unstructured":"Shashank Gupta, Subhabrata Mukherjee, Krishan Subudhi, Eduardo Gonzalez, Damien Jose, Ahmed H Awadallah, and Jianfeng Gao. 2022. Sparsely activated mixture-of-experts are robust multi-task learners. arXiv preprint arXiv:2204.07689 (2022)."},{"key":"e_1_3_2_1_10_1","volume-title":"METGEN: A Module-Based Entailment Tree Generation Framework for Answer Explanation. In Findings of the Association for Computational Linguistics: NAACL 2022. 1887","author":"Hong Ruixin","year":"2022","unstructured":"Ruixin Hong, Hongming Zhang, Xintong Yu, and Changshui Zhang. 2022. METGEN: A Module-Based Entailment Tree Generation Framework for Answer Explanation. In Findings of the Association for Computational Linguistics: NAACL 2022. 1887--1905."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00277"},{"key":"e_1_3_2_1_12_1","volume-title":"The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=T5nUQDrM4u","author":"Komatsuzaki Aran","year":"2023","unstructured":"Aran Komatsuzaki, Joan Puigcerver, James Lee-Thorp, Carlos Riquelme Ruiz, Basil Mustafa, Joshua Ainslie, Yi Tay, Mostafa Dehghani, and Neil Houlsby. 2023. Sparse Upcycling: Training Mixture-of-Experts from Dense Checkpoints. In The Eleventh International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=T5nUQDrM4u"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28714"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.483"},{"key":"e_1_3_2_1_16_1","volume-title":"MMHQA-ICL: Multimodal In-context Learning for Hybrid Question Answering over Text, Tables and Images. arXiv preprint arXiv:2309.04790","author":"Liu Weihao","year":"2023","unstructured":"Weihao Liu, Fangyu Lei, Tongxu Luo, Jiahe Lei, Shizhu He, Jun Zhao, and Kang Liu. 2023. MMHQA-ICL: Multimodal In-context Learning for Hybrid Question Answering over Text, Tables and Images. arXiv preprint arXiv:2309.04790 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Decoupled Weight Decay Regularization. In International Conference on Learning Representations.","author":"Loshchilov Ilya","year":"2018","unstructured":"Ilya Loshchilov and Frank Hutter. 2018. Decoupled Weight Decay Regularization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.626"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220007"},{"key":"e_1_3_2_1_20_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.35"},{"key":"e_1_3_2_1_22_1","volume-title":"Multimodalqa: Complex question answering over text, tables and images. arXiv preprint arXiv:2104.06039","author":"Talmor Alon","year":"2021","unstructured":"Alon Talmor, Ori Yoran, Amnon Catav, Dan Lahav, Yizhong Wang, Akari Asai, Gabriel Ilharco, Hannaneh Hajishirzi, and Jonathan Berant. 2021. Multimodalqa: Complex question answering over text, tables and images. arXiv preprint arXiv:2104.06039 (2021)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383313.3412236"},{"key":"e_1_3_2_1_24_1","volume-title":"Openmoe: An early effort on open mixture-of-experts language models. arXiv preprint arXiv:2402.01739","author":"Xue Fuzhao","year":"2024","unstructured":"Fuzhao Xue, Zian Zheng, Yao Fu, Jinjie Ni, Zangwei Zheng, Wangchunshu Zhou, and Yang You. 2024. Openmoe: An early effort on open mixture-of-experts language models. arXiv preprint arXiv:2402.01739 (2024)."},{"key":"e_1_3_2_1_25_1","volume-title":"Enhancing Multi-modal and Multi-hop Question Answering via Structured Knowledge and Unified Retrieval-Generation. arXiv preprint arXiv:2212.08632","author":"Yang Qian","year":"2022","unstructured":"Qian Yang, Qian Chen, Wen Wang, Baotian Hu, and Min Zhang. 2022. Enhancing Multi-modal and Multi-hop Question Answering via Structured Knowledge and Unified Retrieval-Generation. arXiv preprint arXiv:2212.08632 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Progressive Evidence Refinement for Open-domain Multimodal Retrieval Question Answering. arXiv e-prints","author":"Yang Shuwen","year":"2023","unstructured":"Shuwen Yang, Anran Wu, Xingjiao Wu, Luwei Xiao, Tianlong Ma, Cheng Jin, and Liang He. 2023. Progressive Evidence Refinement for Open-domain Multimodal Retrieval Question Answering. arXiv e-prints (2023), arXiv--2310."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20215"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.292"},{"key":"e_1_3_2_1_29_1","volume-title":"St-moe: Designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906","author":"Zoph Barret","year":"2022","unstructured":"Barret Zoph, Irwan Bello, Sameer Kumar, Nan Du, Yanping Huang, Jeff Dean, Noam Shazeer, and William Fedus. 2022. St-moe: Designing stable and transferable sparse expert models. arXiv preprint arXiv:2202.08906 (2022). gr"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681479","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681479","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:47Z","timestamp":1750294667000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681479"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":29,"alternative-id":["10.1145\/3664647.3681479","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681479","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}