{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:19:43Z","timestamp":1781587183216,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3760958","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:36:36Z","timestamp":1762562196000},"page":"5444-5448","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MOVER: Multimodal Optimal Transport with Volume-based Embedding Regularization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9178-2912","authenticated-orcid":false,"given":"Haochen","family":"You","sequence":"first","affiliation":[{"name":"Columbia University, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1444-7267","authenticated-orcid":false,"given":"Baojing","family":"Liu","sequence":"additional","affiliation":[{"name":"Hebei Institute of Communications, Shijiazhuang, Hebei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"A Short Note on the Kinetics-700 Human Action Dataset. arXiv preprint arXiv:1907.06987","author":"Carreira Jo\u00e3o","year":"2019","unstructured":"Jo\u00e3o Carreira, Eric Noland, Chloe Hillier, and Andrew Zisserman. 2019. A Short Note on the Kinetics-700 Human Action Dataset. arXiv preprint arXiv:1907.06987 (2019)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053174"},{"key":"e_1_3_2_1_3_1","first-page":"72842","article-title":"Vast: A vision-audio-subtitle-text omni-modality foundation model and dataset","volume":"36","author":"Chen Sihan","year":"2023","unstructured":"Sihan Chen, Handong Li, Qunbo Wang, Zijia Zhao, Mingzhen Sun, Xinxin Zhu, and Jing Liu. 2023. Vast: A vision-audio-subtitle-text omni-modality foundation model and dataset. Advances in Neural Information Processing Systems, Vol. 36 (2023), 72842-72866.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_4_1","volume-title":"Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058","author":"Chen Sanyuan","year":"2022","unstructured":"Sanyuan Chen, Yu Wu, Chengyi Wang, Shujie Liu, Daniel Tompkins, Zhuo Chen, and Furu Wei. 2022. Beats: Audio pre-training with acoustic tokenizers. arXiv preprint arXiv:2212.09058 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Joint Generalized Cosine Similarity: A Novel Method for N-Modal Semantic Alignment Based on Contrastive Learning. arXiv preprint arXiv:2505.03532","author":"Chen Yiqiao","year":"2025","unstructured":"Yiqiao Chen and Zijian Huang. 2025. Joint Generalized Cosine Similarity: A Novel Method for N-Modal Semantic Alignment Based on Contrastive Learning. arXiv preprint arXiv:2505.03532 (2025)."},{"key":"e_1_3_2_1_6_1","volume-title":"Gramian Multimodal Representation Learning and Alignment. arXiv preprint arXiv:2412.11959","author":"Cicchetti Giordano","year":"2024","unstructured":"Giordano Cicchetti, Eleonora Grassucci, Luigi Sigillo, and Danilo Comminiello. 2024. Gramian Multimodal Representation Learning and Alignment. arXiv preprint arXiv:2412.11959 (2024)."},{"key":"e_1_3_2_1_7_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"e_1_3_2_1_11_1","volume-title":"Vlm2vec: Training vision-language models for massive multimodal embedding tasks. arXiv preprint arXiv:2410.05160","author":"Jiang Ziyan","year":"2024","unstructured":"Ziyan Jiang, Rui Meng, Xinyi Yang, Semih Yavuz, Yingbo Zhou, and Wenhu Chen. 2024. Vlm2vec: Training vision-language models for massive multimodal embedding tasks. arXiv preprint arXiv:2410.05160 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim Chris Dongjoo","year":"2019","unstructured":"Chris Dongjoo Kim, Byeongchang Kim, Hyunmin Lee, and Gunhee Kim. 2019. Audiocaps: Generating captions for audios in the wild. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). 119-132."},{"key":"e_1_3_2_1_13_1","volume-title":"Audio Retrieval with Natural Language Queries: A Benchmark Study. arXiv preprint arXiv:2112.09418","author":"Koepke A. Sophia","year":"2021","unstructured":"A. Sophia Koepke, Andreea-Maria Oncescu, Jo\u00e3o F. Henriques, Zeynep Akata, and Samuel Albanie. 2021. Audio Retrieval with Natural Language Queries: A Benchmark Study. arXiv preprint arXiv:2112.09418 (2021). https:\/\/arxiv.org\/abs\/2112.09418"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1561\/9781638283379"},{"key":"e_1_3_2_1_15_1","volume-title":"Mm-embed: Universal multimodal retrieval with multimodal llms. arXiv preprint arXiv:2411.02571","author":"Lin Sheng-Chieh","year":"2024","unstructured":"Sheng-Chieh Lin, Chankyu Lee, Mohammad Shoeybi, Jimmy Lin, Bryan Catanzaro, and Wei Ping. 2024. Mm-embed: Universal multimodal retrieval with multimodal llms. arXiv preprint arXiv:2411.02571 (2024)."},{"key":"e_1_3_2_1_16_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_17_1","volume-title":"DecAlign: Hierarchical Cross-Modal Alignment for Decoupled Multimodal Representation Learning. arXiv preprint arXiv:2503.11892","author":"Qian Chengxuan","year":"2025","unstructured":"Chengxuan Qian, Shuo Xing, Shawn Li, Yue Zhao, and Zhengzhong Tu. 2025. DecAlign: Hierarchical Cross-Modal Alignment for Decoupled Multimodal Representation Learning. arXiv preprint arXiv:2503.11892 (2025)."},{"key":"e_1_3_2_1_18_1","volume-title":"On the importance of contrastive loss in multimodal learning. arXiv preprint arXiv:2304.03717","author":"Ren Yunwei","year":"2023","unstructured":"Yunwei Ren and Yuanzhi Li. 2023. On the importance of contrastive loss in multimodal learning. arXiv preprint arXiv:2304.03717 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"Understanding the emergence of multimodal representation alignment. arXiv preprint arXiv:2502.16282","author":"Tjandrasuwita Megan","year":"2025","unstructured":"Megan Tjandrasuwita, Chanakya Ekbote, Liu Ziyin, and Paul Pu Liang. 2025. Understanding the emergence of multimodal representation alignment. arXiv preprint arXiv:2502.16282 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-025-06459-5"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_1_22_1","volume-title":"A comprehensive survey on deep multimodal learning with missing modality. arXiv e-prints","author":"Wu Renjie","year":"2024","unstructured":"Renjie Wu, Hu Wang, and Hsiang-Ting Chen. 2024. A comprehensive survey on deep multimodal learning with missing modality. arXiv e-prints (2024), arXiv-2409."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_24_1","volume-title":"International Conference on Neural Information Processing. Springer, 59-73","author":"You Haochen","year":"2024","unstructured":"Haochen You and Baojing Liu. 2024. Application of Pseudometric Functions in Clustering and a Novel Similarity Measure Based on Path Information Discrepancy. In International Conference on Neural Information Processing. Springer, 59-73."},{"key":"e_1_3_2_1_25_1","volume-title":"Recent Advances of Multimodal Continual Learning: A Comprehensive Survey. arXiv preprint arXiv:2410.05352","author":"Yu Dianzhi","year":"2024","unstructured":"Dianzhi Yu, Xinni Zhang, Yankai Chen, Aiwei Liu, Yifei Zhang, Philip S Yu, and Irwin King. 2024. Recent Advances of Multimodal Continual Learning: A Comprehensive Survey. arXiv preprint arXiv:2410.05352 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"A Survey of Multimodal Learning: Methods, Applications, and Future. Comput. Surveys","author":"Yuan Yuan","year":"2025","unstructured":"Yuan Yuan, Zhaojian Li, and Bin Zhao. 2025. A Survey of Multimodal Learning: Methods, Applications, and Future. Comput. Surveys (2025)."},{"key":"e_1_3_2_1_27_1","volume-title":"Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601","author":"Zhang Duzhen","year":"2024","unstructured":"Duzhen Zhang, Yahan Yu, Jiahua Dong, Chenxing Li, Dan Su, Chenhui Chu, and Dong Yu. 2024b. Mm-llms: Recent advances in multimodal large language models. arXiv preprint arXiv:2401.13601 (2024)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02592"},{"key":"e_1_3_2_1_29_1","volume-title":"Videoprism: A foundational visual encoder for video understanding. arXiv preprint arXiv:2402.13217","author":"Zhao Long","year":"2024","unstructured":"Long Zhao, Nitesh B Gundavarapu, Liangzhe Yuan, Hao Zhou, Shen Yan, Jennifer J Sun, Luke Friedman, Rui Qian, Tobias Weyand, Yue Zhao, et al., 2024. Videoprism: A foundational visual encoder for video understanding. arXiv preprint arXiv:2402.13217 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671454"},{"key":"e_1_3_2_1_31_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852","author":"Zhu Bin","year":"2023","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al., 2023. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. arXiv preprint arXiv:2310.01852 (2023)."},{"key":"e_1_3_2_1_32_1","volume-title":"Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 157-171","author":"Zhu Sidan","year":"2024","unstructured":"Sidan Zhu and Dixin Luo. 2024. Enhancing Multi-modal Contrastive Learning via Optimal Transport-Based Consistent Modality Alignment. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 157-171."},{"key":"e_1_3_2_1_33_1","volume-title":"Loic Feujio, Akash Maharaj, and Yunyao Li.","author":"Zhu Zhengyuan","year":"2024","unstructured":"Zhengyuan Zhu, Daniel Lee, Hong Zhang, Sai Sree Harsha, Loic Feujio, Akash Maharaj, and Yunyao Li. 2024. Murar: A simple and effective multimodal retrieval and answer refinement framework for multimodal question answering. arXiv preprint arXiv:2408.08521 (2024)."},{"key":"e_1_3_2_1_34_1","volume-title":"Oisin Mac Aodha, and Timothy Hospedales","author":"Zong Yongshuo","year":"2023","unstructured":"Yongshuo Zong, Oisin Mac Aodha, and Timothy Hospedales. 2023. Self-supervised multimodal learning: A survey. arXiv preprint arXiv:2304.01008 (2023)."}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","location":"Seoul Republic of Korea","acronym":"CIKM '25","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3760958","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:06:45Z","timestamp":1765505205000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3760958"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":34,"alternative-id":["10.1145\/3746252.3760958","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3760958","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}