{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T06:19:31Z","timestamp":1770358771450,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2018AAA0100604"],"award-info":[{"award-number":["2018AAA0100604"]}]},{"name":"Key Research Program of Frontier Sciences of CAS","award":["QYZDJ-SSW-JSC039"],"award-info":[{"award-number":["QYZDJ-SSW-JSC039"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62036012, 61720106006, 62002355, 61721004, 61832002, 62072455, U1705262, U1836220"],"award-info":[{"award-number":["62036012, 61720106006, 62002355, 61721004, 61832002, 62072455, U1705262, U1836220"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Natural Science Foundation","award":["L201001"],"award-info":[{"award-number":["L201001"]}]},{"DOI":"10.13039\/501100012152","name":"National Postdoctoral Program for Innovative Talents","doi-asserted-by":"publisher","award":["BX20190367"],"award-info":[{"award-number":["BX20190367"]}],"id":[{"id":"10.13039\/501100012152","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475328","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T05:04:15Z","timestamp":1634533455000},"page":"1793-1802","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["Multi-Level Counterfactual Contrast for Visual Commonsense Reasoning"],"prefix":"10.1145","author":[{"given":"Xi","family":"Zhang","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Feifei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences; University of Chinese Academy of Sciences; &amp; Peng Cheng Laboratory, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 10044--10054","author":"Abbasnejad Ehsan","unstructured":"Ehsan Abbasnejad , Damien Teney , Amin Parvaneh , Javen Shi , and Anton van den Hengel. 2020. Counterfactual vision and language learning . In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 10044--10054 . Ehsan Abbasnejad, Damien Teney, Amin Parvaneh, Javen Shi, and Anton van den Hengel. 2020. Counterfactual vision and language learning. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 10044--10054."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00971"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1219"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_1_5_1","volume-title":"Learning representations by maximizing mutual information across views. arXiv preprint arXiv:1906.00910","author":"Bachman Philip","year":"2019","unstructured":"Philip Bachman , R Devon Hjelm , and William Buchwalter . 2019. Learning representations by maximizing mutual information across views. arXiv preprint arXiv:1906.00910 ( 2019 ). Philip Bachman, R Devon Hjelm, and William Buchwalter. 2019. Learning representations by maximizing mutual information across views. arXiv preprint arXiv:1906.00910 (2019)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.285"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01081"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1152"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the Conference on International Conference on Machine Learning (ICML). 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen , Simon Kornblith , Mohammad Norouzi , and Geoffrey Hinton . 2020 a. A simple framework for contrastive learning of visual representations . In Proceedings of the Conference on International Conference on Machine Learning (ICML). 1597--1607 . Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020 a. A simple framework for contrastive learning of visual representations. In Proceedings of the Conference on International Conference on Machine Learning (ICML). 1597--1607."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/3294771.3294857"},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the North American Chapter of the Association for Computational Linguistics (ACL). 4171--4186","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2019 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding . In Proceedings of the North American Chapter of the Association for Computational Linguistics (ACL). 4171--4186 . Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the North American Chapter of the Association for Computational Linguistics (ACL). 4171--4186."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/2968826.2968912"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/107"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5338"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR) .","author":"Hjelm R Devon","year":"2018","unstructured":"R Devon Hjelm , Alex Fedorov , Samuel Lavoie-Marchildon , Karan Grewal , Phil Bachman , Adam Trischler , and Yoshua Bengio . 2018 . Learning deep representations by mutual information estimation and maximization . In Proceedings of the International Conference on Learning Representations (ICLR) . R Devon Hjelm, Alex Fedorov, Samuel Lavoie-Marchildon, Karan Grewal, Phil Bachman, Adam Trischler, and Yoshua Bengio. 2018. Learning deep representations by mutual information estimation and maximization. In Proceedings of the International Conference on Learning Representations (ICLR) ."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_44"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"e_1_3_2_1_22_1","volume-title":"Noe Pion, Philippe Weinzaepfel, and Diane Larlus.","author":"Kalantidis Yannis","year":"2020","unstructured":"Yannis Kalantidis , Mert Bulent Sariyildiz , Noe Pion, Philippe Weinzaepfel, and Diane Larlus. 2020 . Hard negative mixing for contrastive learning. (2020). Yannis Kalantidis, Mert Bulent Sariyildiz, Noe Pion, Philippe Weinzaepfel, and Diane Larlus. 2020. Hard negative mixing for contrastive learning. (2020)."},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the Conference and Workshop on Neural Information Processing Systems (NIPS)","author":"Khosla Prannay","year":"2020","unstructured":"Prannay Khosla , Piotr Teterwak , Chen Wang , Aaron Sarna , Yonglong Tian , Phillip Isola , Aaron Maschinot , Ce Liu , and Dilip Krishnan . 2020 . Supervised Contrastive Learning . Proceedings of the Conference and Workshop on Neural Information Processing Systems (NIPS) (2020). Prannay Khosla, Piotr Teterwak, Chen Wang, Aaron Sarna, Yonglong Tian, Phillip Isola, Aaron Maschinot, Ce Liu, and Dilip Krishnan. 2020. Supervised Contrastive Learning. Proceedings of the Conference and Workshop on Neural Information Processing Systems (NIPS) (2020)."},{"key":"e_1_3_2_1_24_1","volume-title":"Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325","author":"Kim Jin-Hwa","year":"2016","unstructured":"Jin-Hwa Kim , Kyoung-Woon On , Woosang Lim , Jeonghee Kim , Jung-Woo Ha , and Byoung-Tak Zhang . 2016. Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325 ( 2016 ). Jin-Hwa Kim, Kyoung-Woon On, Woosang Lim, Jeonghee Kim, Jung-Woo Ha, and Byoung-Tak Zhang. 2016. Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325 (2016)."},{"key":"e_1_3_2_1_25_1","volume-title":"Self-supervised pre-training and contrastive representation learning for multiple-choice video QA. arXiv preprint arXiv:2009.08043","author":"Kim Seonhoon","year":"2020","unstructured":"Seonhoon Kim , Seohyeong Jeong , Eunbyul Kim , Inho Kang , and Nojun Kwak . 2020. Self-supervised pre-training and contrastive representation learning for multiple-choice video QA. arXiv preprint arXiv:2009.08043 ( 2020 ). Seonhoon Kim, Seohyeong Jeong, Eunbyul Kim, Inho Kang, and Nojun Kwak. 2020. Self-supervised pre-training and contrastive representation learning for multiple-choice video QA. arXiv preprint arXiv:2009.08043 (2020)."},{"key":"e_1_3_2_1_26_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014). Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_27_1","volume-title":"Cycle-Contrast for Self-Supervised Video Representation Learning. arXiv preprint arXiv:2010.14810","author":"Kong Quan","year":"2020","unstructured":"Quan Kong , Wenpeng Wei , Ziwei Deng , Tomoaki Yoshinaga , and Tomokazu Murakami . 2020. Cycle-Contrast for Self-Supervised Video Representation Learning. arXiv preprint arXiv:2010.14810 ( 2020 ). Quan Kong, Wenpeng Wei, Ziwei Deng, Tomoaki Yoshinaga, and Tomokazu Murakami. 2020. Cycle-Contrast for Self-Supervised Video Representation Learning. arXiv preprint arXiv:2010.14810 (2020)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00350"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.265"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455686"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413924"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454289"},{"key":"e_1_3_2_1_33_1","volume-title":"Counterfactual vqa: A cause-effect look at language bias. arXiv preprint arXiv:2006.04315","author":"Niu Yulei","year":"2020","unstructured":"Yulei Niu , Kaihua Tang , Hanwang Zhang , Zhiwu Lu , Xian-Sheng Hua , and Ji-Rong Wen . 2020. Counterfactual vqa: A cause-effect look at language bias. arXiv preprint arXiv:2006.04315 ( 2020 ). Yulei Niu, Kaihua Tang, Hanwang Zhang, Zhiwu Lu, Xian-Sheng Hua, and Ji-Rong Wen. 2020. Counterfactual vqa: A cause-effect look at language bias. arXiv preprint arXiv:2006.04315 (2020)."},{"key":"e_1_3_2_1_34_1","volume-title":"Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord , Yazhe Li , and Oriol Vinyals . 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 ( 2018 ). Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of the Conference and Workshop on Neural Information Processing Systems (NIPS)","author":"Parvaneh Amin","year":"2020","unstructured":"Amin Parvaneh , Ehsan Abbasnejad , Damien Teney , Qinfeng Shi , and Anton van den Hengel. 2020. Counterfactual Vision-and-Language Navigation: Unravelling the Unseen . Proceedings of the Conference and Workshop on Neural Information Processing Systems (NIPS) ( 2020 ). Amin Parvaneh, Ehsan Abbasnejad, Damien Teney, Qinfeng Shi, and Anton van den Hengel. 2020. Counterfactual Vision-and-Language Navigation: Unravelling the Unseen. Proceedings of the Conference and Workshop on Neural Information Processing Systems (NIPS) (2020)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413850"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01043"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_40_1","volume-title":"Deep Hierarchical Attention Flow for Visual Commonsense Reasoning. In CCF International Conference on Natural Language Processing and Chinese Computing. 16--28","author":"Song Yuansheng","year":"2020","unstructured":"Yuansheng Song and Ping Jian . 2020 . Deep Hierarchical Attention Flow for Visual Commonsense Reasoning. In CCF International Conference on Natural Language Processing and Chinese Computing. 16--28 . Yuansheng Song and Ping Jian. 2020. Deep Hierarchical Attention Flow for Visual Commonsense Reasoning. In CCF International Conference on Natural Language Processing and Chinese Computing. 16--28."},{"key":"e_1_3_2_1_41_1","volume-title":"Contrastive multiview coding. arXiv preprint arXiv:1906.05849","author":"Tian Yonglong","year":"2019","unstructured":"Yonglong Tian , Dilip Krishnan , and Phillip Isola . 2019. Contrastive multiview coding. arXiv preprint arXiv:1906.05849 ( 2019 ). Yonglong Tian, Dilip Krishnan, and Phillip Isola. 2019. Contrastive multiview coding. arXiv preprint arXiv:1906.05849 (2019)."},{"key":"e_1_3_2_1_42_1","volume-title":"What makes for good views for contrastive learning. arXiv preprint arXiv:2005.10243","author":"Tian Yonglong","year":"2020","unstructured":"Yonglong Tian , Chen Sun , Ben Poole , Dilip Krishnan , Cordelia Schmid , and Phillip Isola . 2020. What makes for good views for contrastive learning. arXiv preprint arXiv:2005.10243 ( 2020 ). Yonglong Tian, Chen Sun, Ben Poole, Dilip Krishnan, Cordelia Schmid, and Phillip Isola. 2020. What makes for good views for contrastive learning. arXiv preprint arXiv:2005.10243 (2020)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413905"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01077"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.2991866"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454796"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"e_1_3_2_1_48_1","volume-title":"What should not be contrastive in contrastive learning. arXiv preprint arXiv:2008.05659","author":"Xiao Tete","year":"2020","unstructured":"Tete Xiao , Xiaolong Wang , Alexei A Efros , and Trevor Darrell . 2020. What should not be contrastive in contrastive learning. arXiv preprint arXiv:2008.05659 ( 2020 ). Tete Xiao, Xiaolong Wang, Alexei A Efros, and Trevor Darrell. 2020. What should not be contrastive in contrastive learning. arXiv preprint arXiv:2008.05659 (2020)."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the Association for the Advance of Artificial Intelligence (AAAI)","author":"Ye Keren","year":"2021","unstructured":"Keren Ye and Adriana Kovashka . 2021 . A Case Study of the Shortcut Effects in Visual Commonsense Reasoning . Proceedings of the Association for the Advance of Artificial Intelligence (AAAI) (2021). Keren Ye and Adriana Kovashka. 2021. A Case Study of the Shortcut Effects in Visual Commonsense Reasoning. Proceedings of the Association for the Advance of Artificial Intelligence (AAAI) (2021)."},{"key":"e_1_3_2_1_50_1","volume-title":"ERNIE-ViL: Knowledge Enhanced Vision-Language Representations Through Scene Graph. arXiv preprint arXiv:2006.16934","author":"Yu Fei","year":"2020","unstructured":"Fei Yu , Jiji Tang , Weichong Yin , Yu Sun , Hao Tian , Hua Wu , and Haifeng Wang . 2020. ERNIE-ViL: Knowledge Enhanced Vision-Language Representations Through Scene Graph. arXiv preprint arXiv:2006.16934 ( 2020 ). Fei Yu, Jiji Tang, Weichong Yin, Yu Sun, Hao Tian, Hua Wu, and Haifeng Wang. 2020. ERNIE-ViL: Knowledge Enhanced Vision-Language Representations Through Scene Graph. arXiv preprint arXiv:2006.16934 (2020)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454536"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1009"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413885"},{"key":"e_1_3_2_1_55_1","volume-title":"Proceedings of the Conference on Neural Information Processing Systems (NIPS)","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang , Zhou Zhao , Zhijie Lin , Xiuqiang He , 2020 b. Counterfactual Contrastive Learning for Weakly-Supervised Vision-Language Grounding . Proceedings of the Conference on Neural Information Processing Systems (NIPS) (2020). Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020 b. Counterfactual Contrastive Learning for Weakly-Supervised Vision-Language Grounding. Proceedings of the Conference on Neural Information Processing Systems (NIPS) (2020)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00610"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475328","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475328","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:18Z","timestamp":1750193358000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475328"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":56,"alternative-id":["10.1145\/3474085.3475328","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475328","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}