{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:30:17Z","timestamp":1770917417697,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":68,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the National Natural Science Foundation of China","award":["62036012, 62236008, U21B2044, 61721004, 62102415, 62072286, 62106262, and 62002355"],"award-info":[{"award-number":["62036012, 62236008, U21B2044, 61721004, 62102415, 62072286, 62106262, and 62002355"]}]},{"name":"the National Key Research and Development Plan of China","award":["2020AAA0106200"],"award-info":[{"award-number":["2020AAA0106200"]}]},{"name":"Beijing Natural Science Foundation","award":["L201001"],"award-info":[{"award-number":["L201001"]}]},{"name":"Open Research Projects of Zhejiang Lab","award":["2022RC0AB02"],"award-info":[{"award-number":["2022RC0AB02"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612345","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:26:54Z","timestamp":1698391614000},"page":"4240-4249","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Video Entailment via Reaching a Structure-Aware Cross-modal Consensus"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8115-3954","authenticated-orcid":false,"given":"Xuan","family":"Yao","sequence":"first","affiliation":[{"name":"Institute of Automation, CAS &amp; School of Artificial Intelligence, UCAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8105-5497","authenticated-orcid":false,"given":"Junyu","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; School of Artificial Intelligence, UCAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8155-0930","authenticated-orcid":false,"given":"Mengyuan","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS &amp; School of Artificial Intelligence, UCAS, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8343-9665","authenticated-orcid":false,"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, CAS, School of Artificial Intelligence, UCAS, &amp; Peng Cheng Laboratory, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Tadas","year":"2018","unstructured":"Tadas Baltru?aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence 41, 2 (2018), 423--443."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00203"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"e_1_3_2_1_9_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00210"},{"key":"e_1_3_2_1_11_1","volume-title":"European Conference on Computer Vision. Springer, 214--229","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multimodal transformer for video retrieval. In European Conference on Computer Vision. Springer, 214--229."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01805"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00688"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00478"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2985708"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01232"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.3390\/s22228804"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3169842"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3221292"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-54181-5_14"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475298"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6737"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"e_1_3_2_1_25_1","volume-title":"Tel Aviv","author":"Khan Aisha Urooj","year":"2022","unstructured":"Aisha Urooj Khan, Hilde Kuehne, Chuang Gan, Niels Da Vitoria Lobo, and Mubarak Shah. 2022. Weakly Supervised Grounding for VQA in Vision-Language Transformers. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXV. Springer, 652--670."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01336"},{"key":"e_1_3_2_1_27_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_29_1","volume-title":"Self-Attention Graph Pooling. ArXiv abs\/1904.08082","author":"Lee Junhyun","year":"2019","unstructured":"Junhyun Lee, Inyeop Lee, and Jaewoo Kang. 2019. Self-Attention Graph Pooling. ArXiv abs\/1904.08082 (2019)."},{"key":"e_1_3_2_1_30_1","volume-title":"Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696","author":"Lei Jie","year":"2018","unstructured":"Jie Lei, Licheng Yu, Mohit Bansal, and Tamara L Berg. 2018. Tvqa: Localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018)."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings, Part XXI 16","author":"Lei Jie","year":"2020","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2020. Tvr: A large-scale dataset for video-subtitle moment retrieval. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XXI 16. Springer, 447--463."},{"key":"e_1_3_2_1_32_1","volume-title":"What is more likely to happen next? video-and-language future event prediction. arXiv preprint arXiv:2010.07999","author":"Lei Jie","year":"2020","unstructured":"Jie Lei, Licheng Yu, Tamara L Berg, and Mohit Bansal. 2020. What is more likely to happen next? video-and-language future event prediction. arXiv preprint arXiv:2010.07999 (2020)."},{"key":"e_1_3_2_1_33_1","volume-title":"A CLIP-Enhanced method for video-language understanding. arXiv preprint arXiv:2110.07137","author":"Li Guohao","year":"2021","unstructured":"Guohao Li, Feng He, and Zhifan Feng. 2021. A CLIP-Enhanced method for video-language understanding. arXiv preprint arXiv:2110.07137 (2021)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00188"},{"key":"e_1_3_2_1_35_1","volume-title":"Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200","author":"Li Linjie","year":"2020","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, and Jingjing Liu. 2020. Hero: Hierarchical encoder for video language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)."},{"key":"e_1_3_2_1_36_1","volume-title":"Value: A multi-task benchmark for video-and-language understanding evaluation. arXiv preprint arXiv:2106.04632","author":"Li Linjie","year":"2021","unstructured":"Linjie Li, Jie Lei, Zhe Gan, Licheng Yu, Yen-Chun Chen, Rohit Pillai, Yu Cheng, Luowei Zhou, Xin EricWang, William YangWang, et al. 2021. Value: A multi-task benchmark for video-and-language understanding evaluation. arXiv preprint arXiv:2106.04632 (2021)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548333"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018658"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01091"},{"key":"e_1_3_2_1_41_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_1_42_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_44_1","volume-title":"Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632","author":"Mao Junhua","year":"2014","unstructured":"Junhua Mao, Wei Xu, Yi Yang, Jiang Wang, Zhiheng Huang, and Alan Yuille. 2014. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632 (2014)."},{"key":"e_1_3_2_1_45_1","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani Arsha","year":"2021","unstructured":"Arsha Nagrani, Shan Yang, Anurag Arnab, Aren Jansen, Cordelia Schmid, and Chen Sun. 2021. Attention bottlenecks for multimodal fusion. Advances in Neural Information Processing Systems 34 (2021), 14200--14213.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"e_1_3_2_1_47_1","volume-title":"Winning the ICCV'2021 VALUE Challenge: Taskaware Ensemble and Transfer Learning with Visual Concepts. arXiv preprint arXiv:2110","author":"Shin Minchul","year":"2021","unstructured":"Minchul Shin, Jonghwan Mun, Kyoung-Woon On, Woo-Young Kang, Gunsoo Han, and Eun-Sol Kim. 2021. Winning the ICCV'2021 VALUE Challenge: Taskaware Ensemble and Transfer Learning with Visual Concepts. arXiv preprint arXiv:2110.06476 (2021)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2967577"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00118"},{"key":"e_1_3_2_1_51_1","volume-title":"Long-Form Video-Language Pre-Training with Multimodal Temporal Contrastive Learning. arXiv preprint arXiv:2210.06031","author":"Sun Yuchong","year":"2022","unstructured":"Yuchong Sun, Hongwei Xue, Ruihua Song, Bei Liu, Huan Yang, and Jianlong Fu. 2022. Long-Form Video-Language Pre-Training with Multimodal Temporal Contrastive Learning. arXiv preprint arXiv:2210.06031 (2022)."},{"key":"e_1_3_2_1_52_1","volume-title":"Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124","author":"Torabi Atousa","year":"2016","unstructured":"Atousa Torabi, Niket Tandon, and Leonid Sigal. 2016. Learning language-visual embedding for movie understanding with natural-language. arXiv preprint arXiv:1609.08124 (2016)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_54_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3097171"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.29"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747336"},{"key":"e_1_3_2_1_59_1","volume-title":"Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706","author":"Xie Ning","year":"2019","unstructured":"Ning Xie, Farley Lai, Derek Doran, and Asim Kadav. 2019. Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706 (2019)."},{"key":"e_1_3_2_1_60_1","volume-title":"Effective End-to-End Vision Language Pretraining with Semantic Visual Loss. arXiv preprint arXiv:2301.07236","author":"Yang Xiaofeng","year":"2023","unstructured":"Xiaofeng Yang, Fayao Liu, and Guosheng Lin. 2023. Effective End-to-End Vision Language Pretraining with Semantic Visual Loss. arXiv preprint arXiv:2301.07236 (2023)."},{"key":"e_1_3_2_1_61_1","volume-title":"Hierarchical graph representation learning with differentiable pooling. Advances in neural information processing systems 31","author":"Ying Zhitao","year":"2018","unstructured":"Zhitao Ying, Jiaxuan You, Christopher Morris, Xiang Ren, Will Hamilton, and Jure Leskovec. 2018. Hierarchical graph representation learning with differentiable pooling. Advances in neural information processing systems 31 (2018)."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"e_1_3_2_1_63_1","first-page":"23634","article-title":"Merlot: Multimodal neural script knowledge models","volume":"34","author":"Zellers Rowan","year":"2021","unstructured":"Rowan Zellers, Ximing Lu, Jack Hessel, Youngjae Yu, Jae Sung Park, Jize Cao, Ali Farhadi, and Yejin Choi. 2021. Merlot: Multimodal neural script knowledge models. Advances in Neural Information Processing Systems 34 (2021), 23634--23651.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00631"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"e_1_3_2_1_66_1","volume-title":"International Conference on Machine Learning. PMLR, 12747--12760","author":"Zheng Wenbo","year":"2021","unstructured":"Wenbo Zheng, Lan Yan, Chao Gou, and Fei-YueWang. 2021. Two heads are better than one: Hypergraph-enhanced graph reasoning for visual event ratiocination. In International Conference on Machine Learning. PMLR, 12747--12760."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3205457"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123292"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612345","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612345","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:01Z","timestamp":1755820861000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612345"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":68,"alternative-id":["10.1145\/3581783.3612345","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612345","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}