{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:29:08Z","timestamp":1770917348251,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 61976206, No. 61832017"],"award-info":[{"award-number":["No. 61976206, No. 61832017"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612495","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"6539-6547","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["Counterfactual Cross-modality Reasoning for Weakly Supervised Video Moment Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0207-8998","authenticated-orcid":false,"given":"Zezhong","family":"Lv","sequence":"first","affiliation":[{"name":"Renmin University of China &amp; Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8560-1910","authenticated-orcid":false,"given":"Bing","family":"Su","sequence":"additional","affiliation":[{"name":"Renmin University of China &amp; Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9777-9676","authenticated-orcid":false,"given":"Ji-Rong","family":"Wen","sequence":"additional","affiliation":[{"name":"Renmin University of China &amp; Beijing Key Laboratory of Big Data Management and Analysis Methods, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_2_1","volume-title":"Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems","author":"Cadene Remi","year":"2019","unstructured":"Remi Cadene, Corentin Dancette, Matthieu Cord, Devi Parikh, et al. 2019. Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00124"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00832"},{"key":"e_1_3_2_1_5_1","volume-title":"Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308","author":"Chen Zhenfang","year":"2020","unstructured":"Zhenfang Chen, Lin Ma, Wenhan Luo, Peng Tang, and Kwan-Yee K Wong. 2020. Look closer to ground better: Weakly-supervised temporal grounding of sentence in video. arXiv preprint arXiv:2001.09308 (2020)."},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"31","author":"Duan Xuguang","year":"2018","unstructured":"Xuguang Duan, Wenbing Huang, Chuang Gan, Jingdong Wang, Wenwu Zhu, and Junzhou Huang. 2018. Weakly supervised dense event captioning in videos. Advances in Neural Information Processing Systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"Weak supervision and referring attention for temporal-textual association learning. arXiv preprint arXiv:2006.11747","author":"Fang Zhiyuan","year":"2020","unstructured":"Zhiyuan Fang, Shu Kong, Zhe Wang, Charless Fowlkes, and Yezhou Yang. 2020. Weak supervision and referring attention for temporal-textual association learning. arXiv preprint arXiv:2006.11747 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.392"},{"key":"e_1_3_2_1_10_1","first-page":"343","article-title":"Weakly Supervised Natural Language Localization Networks","volume":"16","author":"Gao Mingfei","year":"2020","unstructured":"Mingfei Gao, Richard Socher, and Caiming Xiong. 2020. Weakly Supervised Natural Language Localization Networks. US Patent App. 16\/531,343.","journal-title":"US Patent App."},{"key":"e_1_3_2_1_11_1","volume-title":"Causal inference in statistics: A primer","author":"Glymour Madelyn","unstructured":"Madelyn Glymour, Judea Pearl, and Nicholas P Jewell. 2016. Causal inference in statistics: A primer. John Wiley & Sons."},{"key":"e_1_3_2_1_12_1","volume-title":"TaoHighlight: Commodity-Aware Multi-modal Video Highlight Detection in E-Commerce","author":"Guo Zhaoyu","year":"2021","unstructured":"Zhaoyu Guo, Zhou Zhao, Weike Jin, Wang Dazhou, Liu Ruitao, and Jun Yu. 2021. TaoHighlight: Commodity-Aware Multi-modal Video Highlight Detection in E-Commerce. IEEE Transactions on Multimedia (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00711"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3532626"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16285"},{"key":"e_1_3_2_1_17_1","volume-title":"Multiple instance curriculum learning for weakly supervised object detection. arXiv preprint arXiv:1711.09191","author":"Li Siyang","year":"2017","unstructured":"Siyang Li, Xiangxin Zhu, Qin Huang, Hao Xu, and C-C Jay Kuo. 2017. Multiple instance curriculum learning for weakly supervised object detection. arXiv preprint arXiv:1711.09191 (2017)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"e_1_3_2_1_19_1","volume-title":"Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio.","author":"Lin Zhouhan","year":"2017","unstructured":"Zhouhan Lin, Minwei Feng, Cicero Nogueira dos Santos, Mo Yu, Bing Xiang, Bowen Zhou, and Yoshua Bengio. 2017. A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130 (2017)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"e_1_3_2_1_21_1","volume-title":"Vlanet: Video-language alignment network for weakly-supervised video moment retrieval. In Computer Vision-ECCV 2020: 16th European Conference","author":"Ma Minuk","year":"2020","unstructured":"Minuk Ma, Sunjae Yoon, Junyeong Kim, Youngjoon Lee, Sunghun Kang, and Chang D Yoo. 2020. Vlanet: Video-language alignment network for weakly-supervised video moment retrieval. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XXVIII 16. Springer, 156--171."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01186"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01251"},{"key":"e_1_3_2_1_24_1","volume-title":"Bayesian inference for causal effects: The role of randomization. The Annals of statistics","author":"Rubin Donald B","year":"1978","unstructured":"Donald B Rubin. 1978. Bayesian inference for causal effects: The role of randomization. The Annals of statistics (1978), 34--58."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"e_1_3_2_1_26_1","volume-title":"Weakly-supervised multi-level attentional reconstruction network for grounding textual queries in videos. arXiv preprint arXiv:2003.07048","author":"Song Yijun","year":"2020","unstructured":"Yijun Song, Jingwen Wang, Lin Ma, Zhou Yu, and Jun Yu. 2020. Weakly-supervised multi-level attentional reconstruction network for grounding textual queries in videos. arXiv preprint arXiv:2003.07048 (2020)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00213"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3096087"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475278"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413862"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3058614"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00785"},{"key":"e_1_3_2_1_33_1","volume-title":"Tel Aviv","author":"Yoon Sunjae","year":"2022","unstructured":"Sunjae Yoon, Ji Woo Hong, Eunseop Yoon, Dahyun Kim, Junyeong Kim, Hee Suk Yoon, and Chang D Yoo. 2022. Selective Query-Guided Debiasing for Video Corpus Moment Retrieval. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXVI. Springer, 185--200."},{"key":"e_1_3_2_1_34_1","volume-title":"Advances in Neural Information Processing Systems","volume":"32","author":"Yuan Yitian","year":"2019","unstructured":"Yitian Yuan, Lin Ma, Jingwen Wang, Wei Liu, and Wenwu Zhu. 2019. Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Advances in Neural Information Processing Systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413967"},{"key":"e_1_3_2_1_38_1","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"Zhang Zhu","year":"2020","unstructured":"Zhu Zhang, Zhou Zhao, Zhijie Lin, Xiuqiang He, et al. 2020d. Counterfactual contrastive learning for weakly-supervised vision-language grounding. Advances in Neural Information Processing Systems, Vol. 33 (2020), 18123--18134.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_39_1","volume-title":"CPL: Weakly Supervised Temporal Sentence Grounding with Gaussian-based Contrastive Proposal Learning. https:\/\/github.com\/minghangz\/cpl","author":"Zheng Minghang","year":"2023","unstructured":"Minghang Zheng. [n.d.]. CPL: Weakly Supervised Temporal Sentence Grounding with Gaussian-based Contrastive Proposal Learning. https:\/\/github.com\/minghangz\/cpl (2023, Feb 20)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20263"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"e_1_3_2_1_42_1","volume-title":"Ensemble learning","author":"Zhou Zhi-Hua","unstructured":"Zhi-Hua Zhou and Zhi-Hua Zhou. 2021. Ensemble learning. Springer."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612495","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612495","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:01:37Z","timestamp":1755820897000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612495"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":42,"alternative-id":["10.1145\/3581783.3612495","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612495","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}