{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:41:23Z","timestamp":1755823283857,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Major Science and Technology Innovation 2030 ``New Generation Artificial Intelligence' project","award":["2021ZD0112904"],"award-info":[{"award-number":["2021ZD0112904"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612844","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"9456-9461","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["VTQAGen: BART-based Generative Model For Visual Text Question Answering"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0250-1426","authenticated-orcid":false,"given":"Haoru","family":"Chen","sequence":"first","affiliation":[{"name":"Chongqing University of Posts and Telecommunication, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2423-4982","authenticated-orcid":false,"given":"Tianjiao","family":"Wan","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9882-9215","authenticated-orcid":false,"given":"Zhimin","family":"Lin","sequence":"additional","affiliation":[{"name":"MindRank AI Ltd., Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5997-5169","authenticated-orcid":false,"given":"Kele","family":"Xu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5993-6063","authenticated-orcid":false,"given":"Jin","family":"Wang","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications, Chongqing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3245-1901","authenticated-orcid":false,"given":"Huaimin","family":"Wang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"VQA: Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 21--29","author":"Agrawal Aishwarya","year":"2017","unstructured":"Aishwarya Agrawal, Jiasen Lu, Stanislaw Antol, Margaret Mitchell, Dhruv Batra, C Lawrence Zitnick, and Devi Parikh. 2017. VQA: Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 21--29."},{"key":"e_1_3_2_2_2_1","volume-title":"Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6077--6086","author":"Anderson Peter","year":"2018","unstructured":"Peter Anderson, Xiaodong He, Chris Buehler, Damien Teney, Mark Johnson, Stephen Gould, and Lei Zhang. 2018. Bottom-Up and Top-Down Attention for Image Captioning and Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6077--6086."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0966-6"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01605"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2019.00251"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"e_1_3_2_2_7_1","volume-title":"VTQA: Visual Text Question Answering via Entity Alignment and Cross-Media Reasoning. arXiv preprint arXiv:2303.02635","author":"Chen Kang","year":"2023","unstructured":"Kang Chen and Xiangqian Wu. 2023. VTQA: Visual Text Question Answering via Entity Alignment and Cross-Media Reasoning. arXiv preprint arXiv:2303.02635 (2023)."},{"key":"e_1_3_2_2_8_1","volume-title":"Multimodal Word-Wise Attention for Visual Question Answering. In International Joint Conference on Artificial Intelligence (IJCAI). IJCAI","author":"Chen Xun","year":"2021","unstructured":"Xun Chen, Lin Ma, Wei Zhang, Hongxun Yang, and Liang Zheng. 2021b. Multimodal Word-Wise Attention for Visual Question Answering. In International Joint Conference on Artificial Intelligence (IJCAI). IJCAI, 1805--1812."},{"key":"e_1_3_2_2_9_1","volume-title":"UNITER: A Hierarchical Pretrained Language Model for Unified Image and Text Representation. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10536--10546","author":"Chen Yen-Chun","year":"2021","unstructured":"Yen-Chun Chen, Zhe Gan, Linjie Li, Licheng Yu, Jingjing Li, Yu Cheng, and Jingjing Liu. 2021a. UNITER: A Hierarchical Pretrained Language Model for Unified Image and Text Representation. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 10536--10546."},{"key":"e_1_3_2_2_10_1","volume-title":"UNITER: Universal Image-TexT Representation Learning. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 13194--13204","author":"Chen Yen-Chun","year":"2020","unstructured":"Yen-Chun Chen, Linjie Li, Licheng Yu, Ahmed El Kholy, Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2020. UNITER: Universal Image-TexT Representation Learning. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 13194--13204."},{"key":"e_1_3_2_2_11_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3080920"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_2_14_1","volume-title":"Dataset and Methods for Multilingual Image Question. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 2296--2304","author":"Gao Peng","year":"2015","unstructured":"Peng Gao, Qi Mao, Yufeng Zhou, Zhiheng Huang, Lei Wang, and Wei Xu. 2015. Are You Talking to a Machine? Dataset and Methods for Multilingual Image Question. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 2296--2304."},{"key":"e_1_3_2_2_15_1","volume-title":"LMVR: Learning Multimodal Vision-Language Representation via Cross-Modal Relation Modeling. arXiv preprint arXiv:2104.10821","author":"Guo Yudong","year":"2021","unstructured":"Yudong Guo, Ziping Huang, Peng Li, Xiaobo Zhou, Houjing Wang, and Liang Lin. 2021. LMVR: Learning Multimodal Vision-Language Representation via Cross-Modal Relation Modeling. arXiv preprint arXiv:2104.10821 (2021)."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6294"},{"key":"e_1_3_2_2_17_1","volume-title":"Compositional Attention Networks for Machine Reasoning. arXiv preprint arXiv:1803.03067","author":"Hudson Drew A","year":"2018","unstructured":"Drew A Hudson and Christopher D Manning. 2018. Compositional Attention Networks for Machine Reasoning. arXiv preprint arXiv:1803.03067 (2018)."},{"key":"e_1_3_2_2_18_1","volume-title":"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6700--6709","author":"Hudson Drew A","year":"2019","unstructured":"Drew A Hudson and Christopher D Manning. 2019. GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6700--6709."},{"key":"e_1_3_2_2_19_1","volume-title":"Ruart: A novel text-centered solution for text-based visual question answering","author":"Jin Zan-Xia","year":"2021","unstructured":"Zan-Xia Jin, Heran Wu, Chun Yang, Fang Zhou, Jingyan Qin, Lei Xiao, and Xu-Cheng Yin. 2021. Ruart: A novel text-centered solution for text-based visual question answering. IEEE Transactions on Multimedia (2021)."},{"volume-title":"CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 2901--2910","author":"Johnson Justin","key":"e_1_3_2_2_20_1","unstructured":"Justin Johnson, Bharath Hariharan, Laurens van der Maaten, Li Fei-Fei, C Lawrence Zitnick, and Ross Girshick. 2017. CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 2901--2910."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.217"},{"key":"e_1_3_2_2_22_1","volume-title":"Dense Relational Captioning: Triple-Stream Networks for Relationship-based Captioning. arXiv preprint arXiv:2004.12419","author":"Kim Jinsoo","year":"2020","unstructured":"Jinsoo Kim and Gunhee Lee. 2020. Dense Relational Captioning: Triple-Stream Networks for Relationship-based Captioning. arXiv preprint arXiv:2004.12419 (2020)."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Ranjay Krishna Yuke Zhu Oliver Groth Justin Johnson Kenji Hata Joshua Kravitz Stephanie Chen Yannis Kalantidis Li-Jia Li David A Shamma et al. 2017. Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision Vol. 123 (2017) 32--73.","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_2_24_1","volume-title":"Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 1217--1226","author":"Lei Jie","year":"2021","unstructured":"Jie Lei, Licheng Yu, Xiaojun Yang, Yangyang Ji, Zhe Gan, Yu Cheng, Jingjing Wang, and Jingjing Liu. 2021. Learning by Aligning Visual and Language Representations. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 1217--1226."},{"key":"e_1_3_2_2_25_1","volume-title":"Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461","author":"Lewis Mike","year":"2019","unstructured":"Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer. 2019. Bart: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. arXiv preprint arXiv:1910.13461 (2019)."},{"key":"e_1_3_2_2_26_1","volume-title":"Entangled Transformer for Image-and-Text Matching. In International Conference on Computer Vision (ICCV). IEEE, 6761--6770","author":"Li Yu","year":"2019","unstructured":"Yu Li, Licheng Li, Bo Li, Chunyan Liu, Fan Zhang, and Yu Cheng. 2019a. Entangled Transformer for Image-and-Text Matching. In International Conference on Computer Vision (ICCV). IEEE, 6761--6770."},{"key":"e_1_3_2_2_27_1","volume-title":"Relation-Aware Graph Attention Network for Visual Question Answering. In International Conference on Computer Vision (ICCV). IEEE, 5670--5679","author":"Li Zhen","year":"2019","unstructured":"Zhen Li, Feng Zhou, Xiaoli Li, Lin Ma, and Wei Zhang. 2019b. Relation-Aware Graph Attention Network for Visual Question Answering. In International Conference on Computer Vision (ICCV). IEEE, 5670--5679."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413924"},{"key":"e_1_3_2_2_30_1","volume-title":"Reasoning with Heterogeneous Graph Alignment for Video-and-Language Relationship Learning. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 9118--9127","author":"Liu Junyu","year":"2019","unstructured":"Junyu Liu, Jiawei Zhang, Liqiang Nie, Yan Yan, and Li Cheng. 2019. Reasoning with Heterogeneous Graph Alignment for Video-and-Language Relationship Learning. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 9118--9127."},{"key":"e_1_3_2_2_31_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00297"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.11"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21370"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"e_1_3_2_2_36_1","volume-title":"Multimodalqa: Complex question answering over text, tables and images. arXiv preprint arXiv:2104.06039","author":"Talmor Alon","year":"2021","unstructured":"Alon Talmor, Ori Yoran, Amnon Catav, Dan Lahav, Yizhong Wang, Akari Asai, Gabriel Ilharco, Hannaneh Hajishirzi, and Jonathan Berant. 2021. Multimodalqa: Complex question answering over text, tables and images. arXiv preprint arXiv:2104.06039 (2021)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00444"},{"key":"e_1_3_2_2_38_1","volume-title":"Mo Zhang, Jing Wei, Jorge Goncalves, Vassilis Kostakos, Zhanna Sarsenbayeva, Christina Schneegass, Alessandro Bozzon, and Evangelos Niforatos.","author":"Wang Chaofan","year":"2023","unstructured":"Chaofan Wang, Samuel Kernan Freire, Mo Zhang, Jing Wei, Jorge Goncalves, Vassilis Kostakos, Zhanna Sarsenbayeva, Christina Schneegass, Alessandro Bozzon, and Evangelos Niforatos. 2023. Safeguarding Crowdsourcing Surveys from ChatGPT with Prompt Injection. arXiv preprint arXiv:2306.08833 (2023)."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475390"},{"key":"e_1_3_2_2_40_1","volume-title":"Hierarchical Dynamic Graph Convolutional Network for Video Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 7209--7218","author":"Yang Xuanyi","year":"2019","unstructured":"Xuanyi Yang, Hanwang Huang, and James T Kwok. 2019b. Hierarchical Dynamic Graph Convolutional Network for Video Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 7209--7218."},{"key":"e_1_3_2_2_41_1","volume-title":"Stacked Attention Networks for Image Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 21--29","author":"Yang Zichao","year":"2015","unstructured":"Zichao Yang, Xiaodong He, Jianfeng Gao, Li Deng, and Alexander Smola. 2015. Stacked Attention Networks for Image Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 21--29."},{"key":"e_1_3_2_2_42_1","volume-title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6328--6337","author":"Yang Zuxuan","year":"2019","unstructured":"Zuxuan Yang, Xiaodong He, Jianfeng Gao, Li Deng, and Alexander J Smola. 2019a. Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6328--6337."},{"key":"e_1_3_2_2_43_1","volume-title":"MATTN: A Modularized Attention Network for Referring Expression Comprehension. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 1327--1335","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jimei Yang, Youngjun Choi, Xiaohui Xiong, Alexander C Berg, and Tamara L Berg. 2018a. MATTN: A Modularized Attention Network for Referring Expression Comprehension. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 1327--1335."},{"key":"e_1_3_2_2_44_1","first-page":"73","article-title":"Beyond Narrative Description: Generating Visual Answers from Object Scenes","volume":"28","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Jianping Fan, and Dacheng Tao. 2018b. Beyond Narrative Description: Generating Visual Answers from Object Scenes. IEEE Transactions on Image Processing, Vol. 28, 1 (2018), 73--83.","journal-title":"IEEE Transactions on Image Processing"},{"key":"e_1_3_2_2_45_1","volume-title":"Rethinking Diversified and Discriminative Multi-Sentence Video Description Generation. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 4489--4497","author":"Yu Zhou","year":"2018","unstructured":"Zhou Yu, Jun Yu, Jianping Fan, and Dacheng Tao. 2018c. Rethinking Diversified and Discriminative Multi-Sentence Video Description Generation. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 4489--4497."},{"key":"e_1_3_2_2_46_1","volume-title":"Deep Modular Co-Attention Networks for Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6281--6290","author":"Yu Zhou","year":"2019","unstructured":"Zhou Yu, Jun Yu, Jianping Fan, and Dacheng Tao. 2019. Deep Modular Co-Attention Networks for Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, 6281--6290."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00134"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.542"},{"key":"e_1_3_2_2_49_1","first-page":"35549","article-title":"Towards video text visual question answering: benchmark and baseline","volume":"35","author":"Zhao Minyi","year":"2022","unstructured":"Minyi Zhao, Bingjia Li, Jie Wang, Wanqing Li, Wenjing Zhou, Lan Zhang, Shijie Xuyang, Zhihang Yu, Xinkun Yu, Guangze Li, et al. 2022. Towards video text visual question answering: benchmark and baseline. Advances in Neural Information Processing Systems, Vol. 35 (2022), 35549--35562.","journal-title":"Advances in Neural Information Processing Systems"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612844","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612844","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:11:27Z","timestamp":1755821487000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612844"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":49,"alternative-id":["10.1145\/3581783.3612844","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612844","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}