{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T17:12:52Z","timestamp":1768669972557,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangxi Natural Science Foundation","award":["2022GXNSFAA035627"],"award-info":[{"award-number":["2022GXNSFAA035627"]}]},{"name":"Open Research Fund of Guangxi Key Laboratory of Multimedia Communications and Network Technology"},{"name":"Fundamental Research Funds for the Central Universities, SCUT","award":["D2230080"],"award-info":[{"award-number":["D2230080"]}]},{"name":"Guangxi Scientific and Technological Bases and Talents Special Projects","award":["guikeAD23026230 and guikeAD23026213"],"award-info":[{"award-number":["guikeAD23026230 and guikeAD23026213"]}]},{"name":"CAAI-Huawei MindSpore Open Fund and the Science and Technology Planning Project of Guangdong Province","award":["2020B0101100002"],"award-info":[{"award-number":["2020B0101100002"]}]},{"name":"CCF-Zhipu AI Large Model Fund"},{"name":"Guangxi Natural Science Foundation Key Project","award":["No. 2024GXNSF"],"award-info":[{"award-number":["No. 2024GXNSF"]}]},{"name":"Open Research Fund of Key Laboratory of Big Data and Intelligent Robot (SCUT), Ministry of Education"},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276072, 62076100, and 62261003"],"award-info":[{"award-number":["62276072, 62076100, and 62261003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612593","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"5562-5571","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Scene-text Oriented Visual Entailment: Task, Dataset and Solution"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-0576-1773","authenticated-orcid":false,"given":"Nan","family":"Li","sequence":"first","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-1924-5248","authenticated-orcid":false,"given":"Pijian","family":"Li","sequence":"additional","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2481-5669","authenticated-orcid":false,"given":"Dongsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5482-6753","authenticated-orcid":false,"given":"Wenye","family":"Zhao","sequence":"additional","affiliation":[{"name":"Guangxi University, Nanning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1767-789X","authenticated-orcid":false,"given":"Yi","family":"Cai","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7691-347X","authenticated-orcid":false,"given":"Qingbao","family":"Huang","sequence":"additional","affiliation":[{"name":"Guangxi University &amp; Guangxi Key Laboratory of Multimedia Communications and Network Technology, Nanning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2339814"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_1_3_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E. Hinton","author":"Ba Lei Jimmy","year":"2016","unstructured":"Lei Jimmy Ba, Jamie Ryan Kiros, and Geoffrey E. Hinton. 2016. Layer Normalization. CoRR, Vol. abs\/1607.06450 (2016). showeprint[arXiv]1607.06450 http:\/\/arxiv.org\/abs\/1607.06450"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the Third Text Analysis Conference, TAC 2010","author":"Bentivogli Luisa","year":"2010","unstructured":"Luisa Bentivogli, Peter Clark, Ido Dagan, and Danilo Giampiccolo. 2010. The Sixth PASCAL Recognizing Textual Entailment Challenge. In Proceedings of the Third Text Analysis Conference, TAC 2010, Gaithersburg, Maryland, USA, November 15-16, 2010. NIST. https:\/\/tac.nist.gov\/publications\/2010\/additional.papers\/RTE6_overview.proceedings.pdf"},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of the Fourth Text Analysis Conference, TAC 2011","author":"Bentivogli Luisa","year":"2011","unstructured":"Luisa Bentivogli, Peter Clark, Ido Dagan, and Danilo Giampiccolo. 2011. The Seventh PASCAL Recognizing Textual Entailment Challenge. In Proceedings of the Fourth Text Analysis Conference, TAC 2011, Gaithersburg, Maryland, USA, November 14-15, 2011. NIST. https:\/\/tac.nist.gov\/publications\/2011\/additional.papers\/RTE7_overview.proceedings.pdf"},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the Second Text Analysis Conference, TAC 2009","author":"Bentivogli Luisa","year":"2009","unstructured":"Luisa Bentivogli, Bernardo Magnini, Ido Dagan, Hoa Trang Dang, and Danilo Giampiccolo. 2009. The Fifth PASCAL Recognizing Textual Entailment Challenge. In Proceedings of the Second Text Analysis Conference, TAC 2009, Gaithersburg, Maryland, USA, November 16-17, 2009. NIST. https:\/\/tac.nist.gov\/publications\/2009\/additional.papers\/RTE5_overview.proceedings.pdf"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00439"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00051"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/d15--1075"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3219642"},{"key":"e_1_3_2_1_12_1","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Camburu Oana-Maria","year":"2018","unstructured":"Oana-Maria Camburu, Tim Rockt\u00e4schel, Thomas Lukasiewicz, and Phil Blunsom. 2018a. e-SNLI: Natural Language Inference with Natural Language Explanations. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3-8, 2018, Montr\u00e9al, Canada, Samy Bengio, Hanna M. Wallach, Hugo Larochelle, Kristen Grauman, Nicol\u00f2 Cesa-Bianchi, and Roman Garnett (Eds.). 9560--9572. https:\/\/proceedings.neurips.cc\/paper\/2018\/hash\/4c7a167bb329bd92580a99ce422d6fa6-Abstract.html"},{"key":"e_1_3_2_1_13_1","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Camburu Oana-Maria","year":"2018","unstructured":"Oana-Maria Camburu, Tim Rockt\u00e4schel, Thomas Lukasiewicz, and Phil Blunsom. 2018b. e-SNLI: Natural Language Inference with Natural Language Explanations. In Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3-8, 2018, Montr\u00e9al, Canada, Samy Bengio, Hanna M. Wallach, Hugo Larochelle, Kristen Grauman, Nicol\u00f2 Cesa-Bianchi, and Roman Garnett (Eds.). 9560--9572. https:\/\/proceedings.neurips.cc\/paper\/2018\/hash\/4c7a167bb329bd92580a99ce422d6fa6-Abstract.html"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_15_1","volume-title":"Machine Learning Challenges. Evaluating Predictive Uncertainty, Visual Object Classification, and Recognising Tectual Entailment: First PASCAL Machine Learning Challenges Workshop, MLCW","author":"Dagan Ido","year":"2005","unstructured":"Ido Dagan, Oren Glickman, and Bernardo Magnini. 2006. The pascal recognising textual entailment challenge. In Machine Learning Challenges. Evaluating Predictive Uncertainty, Visual Object Classification, and Recognising Tectual Entailment: First PASCAL Machine Learning Challenges Workshop, MLCW 2005, Southampton, UK, April 11-13, 2005, Revised Selected Papers. Springer, 177--190."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1423"},{"key":"e_1_3_2_1_17_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems","author":"Gan Zhe","year":"2020","unstructured":"Zhe Gan, Yen-Chun Chen, Linjie Li, Chen Zhu, Yu Cheng, and Jingjing Liu. 2020. Large-Scale Adversarial Training for Vision-and-Language Representation Learning. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual, Hugo Larochelle, Marc'Aurelio Ranzato, Raia Hadsell, Maria-Florina Balcan, and Hsuan-Tien Lin (Eds.). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/49562478de4c54fafd4ec46fdb297de5-Abstract.html"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3132034"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the First Text Analysis Conference, TAC 2008","author":"Giampiccolo Danilo","year":"2008","unstructured":"Danilo Giampiccolo, Hoa Trang Dang, Bernardo Magnini, Ido Dagan, Elena Cabrio, and Bill Dolan. 2008. The Fourth PASCAL Recognizing Textual Entailment Challenge. In Proceedings of the First Text Analysis Conference, TAC 2008, Gaithersburg, Maryland, USA, November 17-19, 2008. NIST. https:\/\/tac.nist.gov\/publications\/2008\/additional.papers\/RTE-4_overview.proceedings.pdf"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.3115\/1654536.1654538"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Second PASCAL Challenges Workshop on Recognising Textual Entailment","volume":"7","author":"Haim R Bar","year":"2006","unstructured":"R Bar Haim, Ido Dagan, Bill Dolan, Lisa Ferro, Danilo Giampiccolo, Bernardo Magnini, and Idan Szpektor. 2006. The second pascal recognising textual entailment challenge. In Proceedings of the Second PASCAL Challenges Workshop on Recognising Textual Entailment, Vol. 7."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58545-7_41"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00128"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.134"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1244"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021","volume":"8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event (Proceedings of Machine Learning Research, Vol. 139), Marina Meila and Tong Zhang (Eds.). PMLR, 8748--8763. http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"e_1_3_2_1_30_1","volume-title":"Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross B. Girshick, and Jian Sun. 2015. Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks. In Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, December 7-12, 2015, Montreal, Quebec, Canada, Corinna Cortes, Neil D. Lawrence, Daniel D. Lee, Masashi Sugiyama, and Roman Garnett (Eds.). 91--99. https:\/\/proceedings.neurips.cc\/paper\/2015\/hash\/14bfa6bb14875e45bba028a21ed38046-Abstract.html"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.421"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20059-5_23"},{"key":"e_1_3_2_1_35_1","volume-title":"Show and tell: Lessons learned from the 2015 mscoco image captioning challenge","author":"Vinyals Oriol","year":"2016","unstructured":"Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. 2016. Show and tell: Lessons learned from the 2015 mscoco image captioning challenge. IEEE transactions on pattern analysis and machine intelligence, Vol. 39, 4 (2016), 652--663."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413753"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00136"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956709"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16389"},{"key":"e_1_3_2_1_40_1","volume-title":"Visual Entailment: A Novel Task for Fine-Grained Image Understanding. CoRR","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual Entailment: A Novel Task for Fine-Grained Image Understanding. CoRR, Vol. abs\/1901.06706 (2019). [arXiv]1901.06706 http:\/\/arxiv.org\/abs\/1901.06706"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548284"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612593","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612593","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:00Z","timestamp":1755820860000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612593"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":41,"alternative-id":["10.1145\/3581783.3612593","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612593","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}