{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:12Z","timestamp":1765308012303,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":81,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758174","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:44:48Z","timestamp":1761371088000},"page":"12179-12188","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Modality Generalization: A Benchmark and Prospective Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6037-8580","authenticated-orcid":false,"given":"Xiaohao","family":"Liu","sequence":"first","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3615-0919","authenticated-orcid":false,"given":"Xiaobo","family":"Xia","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4430-0983","authenticated-orcid":false,"given":"Zhuo","family":"Huang","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, NSW, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6565-7511","authenticated-orcid":false,"given":"See-Kiong","family":"Ng","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6097-7807","authenticated-orcid":false,"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"3438","article-title":"Invariance principle meets information bottleneck for out-of-distribution generalization","volume":"34","author":"Ahuja Kartik","year":"2021","unstructured":"Kartik Ahuja, Ethan Caballero, Dinghuai Zhang, Jean-Christophe Gagnon-Audet, Yoshua Bengio, Ioannis Mitliagkas, and Irina Rish. 2021. Invariance principle meets information bottleneck for out-of-distribution generalization. NeurIPS, Vol. 34 (2021), 3438-3450.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_2_1","first-page":"24206","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Liangzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. In NeurIPS. 24206-24221.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_3_1","volume-title":"Invariant risk minimization. arXiv preprint arXiv:1907.02893","author":"Arjovsky Martin","year":"2019","unstructured":"Martin Arjovsky, L\u00e9on Bottou, Ishaan Gulrajani, and David Lopez-Paz. 2019. Invariant risk minimization. arXiv preprint arXiv:1907.02893 (2019)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02166-7"},{"key":"e_1_3_2_1_5_1","first-page":"1989","article-title":"Murel: Multimodal relational reasoning for visual question answering","author":"Cadene Remi","year":"2019","unstructured":"Remi Cadene, Hedi Ben-Younes, Matthieu Cord, and Nicolas Thome. 2019. Murel: Multimodal relational reasoning for visual question answering. In CVPR. 1989-1998.","journal-title":"CVPR."},{"key":"e_1_3_2_1_6_1","volume-title":"Enhancing Cross-Modal Fine-Tuning with Gradually Intermediate Modality Generation. arXiv preprint arXiv:2406.09003","author":"Cai Lincan","year":"2024","unstructured":"Lincan Cai, Shuang Li, Wenxuan Ma, Jingxuan Kang, Binhui Xie, Zixun Sun, and Chengwei Zhu. 2024. Enhancing Cross-Modal Fine-Tuning with Gradually Intermediate Modality Generation. arXiv preprint arXiv:2406.09003 (2024)."},{"key":"e_1_3_2_1_7_1","first-page":"721","article-title":"Vggsound: A large-scale audio-visual dataset","author":"Chen Honglie","year":"2020","unstructured":"Honglie Chen, Weidi Xie, Andrea Vedaldi, and Andrew Zisserman. 2020b. Vggsound: A large-scale audio-visual dataset. In ICASSP. 721-725.","journal-title":"ICASSP."},{"key":"e_1_3_2_1_8_1","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020a. A simple framework for contrastive learning of visual representations. In ICML. PMLR, 1597-1607.","journal-title":"ICML. PMLR"},{"key":"e_1_3_2_1_9_1","first-page":"139","article-title":"Multimodal object detection via probabilistic ensembling","author":"Chen Yi-Ting","year":"2022","unstructured":"Yi-Ting Chen, Jinghao Shi, Zelin Ye, Christoph Mertz, Deva Ramanan, and Shu Kong. 2022. Multimodal object detection via probabilistic ensembling. In ECCV. 139-158.","journal-title":"ECCV."},{"key":"e_1_3_2_1_10_1","first-page":"78674","article-title":"SimMMDG: A simple and effective framework for multi-modal domain generalization","author":"Dong Hao","year":"2023","unstructured":"Hao Dong, Ismail Nejjar, Han Sun, Eleni Chatzi, and Olga Fink. 2023. SimMMDG: A simple and effective framework for multi-modal domain generalization. In NeurIPS. 78674-78695.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_11_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR (2021)."},{"key":"e_1_3_2_1_12_1","first-page":"17340","article-title":"Probable domain generalization via quantile risk minimization","volume":"35","author":"Eastwood Cian","year":"2022","unstructured":"Cian Eastwood, Alexander Robey, Shashank Singh, Julius Von K\u00fcgelgen, Hamed Hassani, George J Pappas, and Bernhard Sch\u00f6lkopf. 2022. Probable domain generalization via quantile risk minimization. NeurIPS, Vol. 35 (2022), 17340-17358.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-023-00624-6"},{"key":"e_1_3_2_1_14_1","first-page":"1","article-title":"Domain-adversarial training of neural networks","volume":"17","author":"Ganin Yaroslav","year":"2016","unstructured":"Yaroslav Ganin, Evgeniya Ustinova, Hana Ajakan, Pascal Germain, Hugo Larochelle, Fran\u00e7ois Laviolette, Mario March, and Victor Lempitsky. 2016. Domain-adversarial training of neural networks. Journal of Machine Learning Research, Vol. 17, 59 (2016), 1-35.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_15_1","first-page":"15180","article-title":"Imagebind: One embedding space to bind them all","author":"Girdhar Rohit","year":"2023","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2023. Imagebind: One embedding space to bind them all. In CVPR. 15180-15190.","journal-title":"CVPR."},{"key":"e_1_3_2_1_16_1","first-page":"16102","article-title":"Omnivore: A single model for many visual modalities","author":"Girdhar Rohit","year":"2022","unstructured":"Rohit Girdhar, Mannat Singh, Nikhila Ravi, Laurens Van Der Maaten, Armand Joulin, and Ishan Misra. 2022. Omnivore: A single model for many visual modalities. In CVPR. 16102-16112.","journal-title":"CVPR."},{"key":"e_1_3_2_1_17_1","volume-title":"Ast: Audio spectrogram transformer. Interspeech","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021. Ast: Audio spectrogram transformer. Interspeech (2021)."},{"key":"e_1_3_2_1_18_1","unstructured":"Ishaan Gulrajani and David Lopez-Paz. 2021. In search of lost domain generalization. In ICLR."},{"key":"e_1_3_2_1_19_1","unstructured":"Ziyu Guo Renrui Zhang Xiangyang Zhu Yiwen Tang Xianzheng Ma Jiaming Han Kexin Chen Peng Gao Xianzhi Li Hongsheng Li et al. 2023. Point-bind & point-llm: Aligning point cloud with multi-modality for 3d understanding generation and instruction following. arXiv preprint arXiv:2309.00615 (2023)."},{"key":"e_1_3_2_1_20_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_21_1","first-page":"10944","article-title":"What makes multi-modal learning better than single (provably)","author":"Huang Yu","year":"2021","unstructured":"Yu Huang, Chenzhuang Du, Zihui Xue, Xuanyao Chen, Hang Zhao, and Longbo Huang. 2021. What makes multi-modal learning better than single (provably). In NeurIPS. 10944-10956.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_22_1","unstructured":"Zhuo Huang Gang Niu Bo Han Masashi Sugiyama and Tongliang Liu. 2025. Towards Out-of-Modal Generalization without Instance-level Modal Correspondence. In ICLR."},{"key":"e_1_3_2_1_23_1","first-page":"124","article-title":"Self-challenging improves cross-domain generalization","author":"Huang Zeyi","year":"2020","unstructured":"Zeyi Huang, Haohan Wang, Eric P Xing, and Dong Huang. 2020. Self-challenging improves cross-domain generalization. In ECCV. 124-140.","journal-title":"ECCV."},{"key":"e_1_3_2_1_24_1","volume-title":"Position: The platonic representation hypothesis. In ICML.","author":"Huh Minyoung","year":"2024","unstructured":"Minyoung Huh, Brian Cheung, Tongzhou Wang, and Phillip Isola. 2024. Position: The platonic representation hypothesis. In ICML."},{"key":"e_1_3_2_1_25_1","volume-title":"Uniformly Distributed Feature Representations for Fair and Robust Learning. TMLR","author":"Krishnamachari Kiran","year":"2024","unstructured":"Kiran Krishnamachari, See-Kiong Ng, and Chuan-Sheng Foo. 2024. Uniformly Distributed Feature Representations for Fair and Robust Learning. TMLR (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Subword regularization: Improving neural network translation models with multiple subword candidates. ACL","author":"Kudo Taku","year":"2018","unstructured":"Taku Kudo. 2018. Subword regularization: Improving neural network translation models with multiple subword candidates. ACL (2018)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Da Li Yongxin Yang Yi-Zhe Song and Timothy Hospedales. 2018b. Learning to generalize: Meta-learning for domain generalization. In AAAI.","DOI":"10.1609\/aaai.v32i1.11596"},{"key":"e_1_3_2_1_28_1","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML. 19730-19742.","journal-title":"ICML."},{"key":"e_1_3_2_1_29_1","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In ICML. 12888-12900.","journal-title":"ICML."},{"key":"e_1_3_2_1_30_1","first-page":"624","article-title":"Deep domain generalization via conditional invariant adversarial networks","author":"Li Ya","year":"2018","unstructured":"Ya Li, Xinmei Tian, Mingming Gong, Yajing Liu, Tongliang Liu, Kun Zhang, and Dacheng Tao. 2018a. Deep domain generalization via conditional invariant adversarial networks. In ECCV. 624-639.","journal-title":"ECCV."},{"key":"e_1_3_2_1_31_1","volume-title":"Yun Cheng, Alex Obolenskiy, Yudong Liu, Rohan Pandey, Alex Wilf, Louis-Philippe Morency, and Ruslan Salakhutdinov.","author":"Liang Paul Pu","year":"2023","unstructured":"Paul Pu Liang, Chun Kai Ling, Yun Cheng, Alex Obolenskiy, Yudong Liu, Rohan Pandey, Alex Wilf, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2023. Multimodal learning without labeled multimodal data: Guarantees and applications. arXiv preprint arXiv:2306.04539 (2023)."},{"key":"e_1_3_2_1_32_1","first-page":"2680","article-title":"Cross-modal generalization: Learning in low resource modalities via meta-alignment","author":"Liang Paul Pu","year":"2021","unstructured":"Paul Pu Liang, Peter Wu, Liu Ziyin, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2021. Cross-modal generalization: Learning in low resource modalities via meta-alignment. In ACMMM. 2680-2689.","journal-title":"ACMMM."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3656580"},{"key":"e_1_3_2_1_34_1","first-page":"24529","article-title":"ZIN: When and how to learn invariance without environment partition?","author":"Lin Yong","year":"2022","unstructured":"Yong Lin, Shengyu Zhu, Lu Tan, and Peng Cui. 2022. ZIN: When and how to learn invariance without environment partition?. In NeurIPS. 24529-24542.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_35_1","volume-title":"Continual multimodal contrastive learning. arXiv preprint arXiv:2503.14963","author":"Liu Xiaohao","year":"2025","unstructured":"Xiaohao Liu, Xiaobo Xia, See-Kiong Ng, and Tat-Seng Chua. 2025a. Continual multimodal contrastive learning. arXiv preprint arXiv:2503.14963 (2025)."},{"key":"e_1_3_2_1_36_1","volume-title":"Principled Multimodal Representation Learning. arXiv preprint arXiv:2507.17343","author":"Liu Xiaohao","year":"2025","unstructured":"Xiaohao Liu, Xiaobo Xia, See-Kiong Ng, and Tat-Seng Chua. 2025b. Principled Multimodal Representation Learning. arXiv preprint arXiv:2507.17343 (2025)."},{"key":"e_1_3_2_1_37_1","first-page":"57244","article-title":"A theory of multimodal learning","author":"Lu Zhou","year":"2023","unstructured":"Zhou Lu. 2023. A theory of multimodal learning. In NeurIPS. 57244-57255.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_38_1","first-page":"810","article-title":"On the Computational Benefit of Multimodal Learning","author":"Lu Zhou","year":"2024","unstructured":"Zhou Lu. 2024. On the Computational Benefit of Multimodal Learning. In ICALT. 810-821.","journal-title":"ICALT."},{"key":"e_1_3_2_1_39_1","volume-title":"Deem: Diffusion models serve as the eyes of large language models for image perception. arXiv preprint arXiv:2405.15232","author":"Luo Run","year":"2024","unstructured":"Run Luo, Yunshui Li, Longze Chen, Wanwei He, Ting-En Lin, Ziqiang Liu, Lei Zhang, Zikai Song, Xiaobo Xia, Tongliang Liu, et al., 2024a. Deem: Diffusion models serve as the eyes of large language models for image perception. arXiv preprint arXiv:2405.15232 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"VCM: Vision Concept Modeling Based on Implicit Contrastive Learning with Vision-Language Instruction Fine-Tuning. arXiv preprint arXiv:2504.19627","author":"Luo Run","year":"2025","unstructured":"Run Luo, Renke Shan, Longze Chen, Ziqiang Liu, Lu Wang, Min Yang, and Xiaobo Xia. 2025. VCM: Vision Concept Modeling Based on Implicit Contrastive Learning with Vision-Language Instruction Fine-Tuning. arXiv preprint arXiv:2504.19627 (2025)."},{"key":"e_1_3_2_1_41_1","volume-title":"Mmevol: Empowering multimodal large language models with evol-instruct. arXiv preprint arXiv:2409.05840","author":"Luo Run","year":"2024","unstructured":"Run Luo, Haonan Zhang, Longze Chen, Ting-En Lin, Xiong Liu, Yuchuan Wu, Min Yang, Minzheng Wang, Pengpeng Zeng, Lianli Gao, et al., 2024b. Mmevol: Empowering multimodal large language models with evol-instruct. arXiv preprint arXiv:2409.05840 (2024)."},{"key":"e_1_3_2_1_42_1","first-page":"26752","article-title":"UniBind: LLM-Augmented Unified and Balanced Representation Space to Bind Them All","author":"Lyu Yuanhuiyi","year":"2024","unstructured":"Yuanhuiyi Lyu, Xu Zheng, Jiazhou Zhou, and Lin Wang. 2024. UniBind: LLM-Augmented Unified and Balanced Representation Space to Bind Them All. In CVPR. 26752-26762.","journal-title":"CVPR."},{"key":"e_1_3_2_1_43_1","first-page":"7598","article-title":"Unpaired image-to-speech synthesis with multimodal information bottleneck","author":"Ma Shuang","year":"2019","unstructured":"Shuang Ma, Daniel McDuff, and Yale Song. 2019. Unpaired image-to-speech synthesis with multimodal information bottleneck. In ICCV. 7598-7607.","journal-title":"ICCV."},{"key":"e_1_3_2_1_44_1","first-page":"8690","article-title":"Reducing domain gap by reducing style bias","author":"Nam Hyeonseob","year":"2021","unstructured":"Hyeonseob Nam, HyunJae Lee, Jongchan Park, Wonjun Yoon, and Donggeun Yoo. 2021. Reducing domain gap by reducing style bias. In CVPR. 8690-8699.","journal-title":"CVPR."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNN.2010.2091281"},{"key":"e_1_3_2_1_46_1","first-page":"8238","article-title":"Balanced multimodal learning via on-the-fly gradient modulation","author":"Peng Xiaokang","year":"2022","unstructured":"Xiaokang Peng, Yake Wei, Andong Deng, Dong Wang, and Di Hu. 2022. Balanced multimodal learning via on-the-fly gradient modulation. In CVPR. 8238-8247.","journal-title":"CVPR."},{"key":"e_1_3_2_1_47_1","first-page":"462","article-title":"Online zero-shot classification with clip","author":"Qian Qi","year":"2024","unstructured":"Qi Qian and Juhua Hu. 2024. Online zero-shot classification with clip. In ECCV. 462-477.","journal-title":"ECCV."},{"key":"e_1_3_2_1_48_1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In ICML. 8748-8763.","journal-title":"ICML."},{"key":"e_1_3_2_1_49_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, Vol. 21, 140 (2020), 1-67.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_50_1","volume-title":"Optimal representations for covariate shift. ICLR","author":"Ruan Yangjun","year":"2022","unstructured":"Yangjun Ruan, Yann Dubois, and Chris J Maddison. 2022. Optimal representations for covariate shift. ICLR (2022)."},{"key":"e_1_3_2_1_51_1","volume-title":"Neural machine translation of rare words with subword units. ACL","author":"Sennrich Rico","year":"2016","unstructured":"Rico Sennrich. 2016. Neural machine translation of rare words with subword units. ACL (2016)."},{"key":"e_1_3_2_1_52_1","first-page":"31030","article-title":"Cross-modal fine-tuning: Align then refine","author":"Shen Junhong","year":"2023","unstructured":"Junhong Shen, Liam Li, Lucio M Dery, Corey Staten, Mikhail Khodak, Graham Neubig, and Ameet Talwalkar. 2023. Cross-modal fine-tuning: Align then refine. In ICML. 31030-31056.","journal-title":"ICML."},{"key":"e_1_3_2_1_53_1","first-page":"746","article-title":"Indoor segmentation and support inference from rgbd images","author":"Silberman Nathan","year":"2012","unstructured":"Nathan Silberman, Derek Hoiem, Pushmeet Kohli, and Rob Fergus. 2012. Indoor segmentation and support inference from rgbd images. In ECCV. 746-760.","journal-title":"ECCV."},{"key":"e_1_3_2_1_54_1","volume-title":"How to bridge the gap between modalities: A comprehensive survey on multimodal large language model. arXiv preprint arXiv:2311.07594","author":"Song Shezheng","year":"2023","unstructured":"Shezheng Song, Xiaopeng Li, Shasha Li, Shan Zhao, Jie Yu, Jun Ma, Xiaoguang Mao, and Weimin Zhang. 2023. How to bridge the gap between modalities: A comprehensive survey on multimodal large language model. arXiv preprint arXiv:2311.07594 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"Erm: An improved baseline for domain generalization. arXiv preprint arXiv:2304.01973","author":"Teterwak Piotr","year":"2023","unstructured":"Piotr Teterwak, Kuniaki Saito, Theodoros Tsiligkaridis, Kate Saenko, and Bryan A Plummer. 2023. Erm: An improved baseline for domain generalization. arXiv preprint arXiv:2304.01973 (2023)."},{"key":"e_1_3_2_1_56_1","volume-title":"Understanding the emergence of multimodal representation alignment. arXiv preprint arXiv:2502.16282","author":"Tjandrasuwita Megan","year":"2025","unstructured":"Megan Tjandrasuwita, Chanakya Ekbote, Liu Ziyin, and Paul Pu Liang. 2025. Understanding the emergence of multimodal representation alignment. arXiv preprint arXiv:2502.16282 (2025)."},{"key":"e_1_3_2_1_57_1","first-page":"23","article-title":"Domain randomization for transferring deep neural networks from simulation to the real world","author":"Tobin Josh","year":"2017","unstructured":"Josh Tobin, Rachel Fong, Alex Ray, Jonas Schneider, Wojciech Zaremba, and Pieter Abbeel. 2017. Domain randomization for transferring deep neural networks from simulation to the real world. In IROS. 23-30.","journal-title":"IROS."},{"key":"e_1_3_2_1_58_1","unstructured":"Vladimir Naumovich Vapnik Vlamimir Vapnik et al. 1998. Statistical learning theory. (1998)."},{"key":"e_1_3_2_1_59_1","first-page":"8052","article-title":"Generalizing to unseen domains: A survey on domain generalization","volume":"35","author":"Wang Jindong","year":"2022","unstructured":"Jindong Wang, Cuiling Lan, Chang Liu, Yidong Ouyang, Tao Qin, Wang Lu, Yiqiang Chen, Wenjun Zeng, and S Yu Philip. 2022. Generalizing to unseen domains: A survey on domain generalization. IEEE Transactions on Knowledge and Data Engineering, Vol. 35, 8 (2022), 8052-8072.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"e_1_3_2_1_60_1","volume-title":"Open-vocabulary segmentation with unpaired mask-text supervision. arXiv preprint arXiv:2402.08960","author":"Wang Zhaoqing","year":"2024","unstructured":"Zhaoqing Wang, Xiaobo Xia, Ziye Chen, Xiao He, Yandong Guo, Mingming Gong, and Tongliang Liu. 2024. Open-vocabulary segmentation with unpaired mask-text supervision. arXiv preprint arXiv:2402.08960 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Extending multi-modal contrastive representations. arXiv preprint arXiv:2310.08884","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Ziang Zhang, Luping Liu, Yang Zhao, Haifeng Huang, Tao Jin, and Zhou Zhao. 2023a. Extending multi-modal contrastive representations. arXiv preprint arXiv:2310.08884 (2023)."},{"key":"e_1_3_2_1_62_1","first-page":"22099","article-title":"Connecting multi-modal contrastive representations","author":"Wang Zehan","year":"2023","unstructured":"Zehan Wang, Yang Zhao, Haifeng Huang, Jiageng Liu, Aoxiong Yin, Li Tang, Linjun Li, Yongqi Wang, Ziang Zhang, and Zhou Zhao. 2023b. Connecting multi-modal contrastive representations. In NeurIPS. 22099-22114.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_63_1","volume-title":"Deep Multimodal Learning with Missing Modality: A Survey. arXiv preprint arXiv:2409.07825","author":"Wu Renjie","year":"2024","unstructured":"Renjie Wu, Hu Wang, Hsiang-Ting Chen, and Gustavo Carneiro. 2024b. Deep Multimodal Learning with Missing Modality: A Survey. arXiv preprint arXiv:2409.07825 (2024)."},{"key":"e_1_3_2_1_64_1","volume-title":"Next-gpt: Any-to-any multimodal llm. In ICML.","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024a. Next-gpt: Any-to-any multimodal llm. In ICML."},{"key":"e_1_3_2_1_65_1","unstructured":"Yan Xia Hai Huang Jieming Zhu and Zhou Zhao. 2024. Achieving cross modal generalization with multimodal unified representation. In NeurIPS."},{"key":"e_1_3_2_1_66_1","first-page":"5288","article-title":"Msr-vtt: A large video description dataset for bridging video and language","author":"Xu Jun","year":"2016","unstructured":"Jun Xu, Tao Mei, Ting Yao, and Yong Rui. 2016. Msr-vtt: A large video description dataset for bridging video and language. In CVPR. 5288-5296.","journal-title":"CVPR."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"e_1_3_2_1_68_1","volume-title":"Improve unsupervised domain adaptation with mixup training. arXiv preprint arXiv:2001.00677","author":"Yan Shen","year":"2020","unstructured":"Shen Yan, Huan Song, Nanxiang Li, Lincan Zou, and Liu Ren. 2020. Improve unsupervised domain adaptation with mixup training. arXiv preprint arXiv:2001.00677 (2020)."},{"key":"e_1_3_2_1_69_1","first-page":"26340","article-title":"Binding touch to everything: Learning unified multimodal tactile representations","author":"Yang Fengyu","year":"2024","unstructured":"Fengyu Yang, Chao Feng, Ziyang Chen, Hyoungseob Park, Daniel Wang, Yiming Dou, Ziyao Zeng, Xien Chen, Rit Gangopadhyay, Andrew Owens, et al., 2024a. Binding touch to everything: Learning unified multimodal tactile representations. In CVPR. 26340-26353.","journal-title":"CVPR."},{"key":"e_1_3_2_1_70_1","unstructured":"Yang Yang Fengqiang Wan Qing-Yuan Jiang and Yi Xu. 2024b. Facilitating Multimodal Classification via Dynamically Learning Modality Gap. In NIPS."},{"key":"e_1_3_2_1_71_1","volume-title":"mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412","author":"Zhang Hongyi","year":"2017","unstructured":"Hongyi Zhang. 2017. mixup: Beyond empirical risk minimization. arXiv preprint arXiv:1710.09412 (2017)."},{"key":"e_1_3_2_1_72_1","first-page":"9781","article-title":"Frame-event alignment and fusion network for high frame rate tracking","author":"Zhang Jiqing","year":"2023","unstructured":"Jiqing Zhang, Yuanchen Wang, Wenxi Liu, Meng Li, Jinpeng Bai, Baocai Yin, and Xin Yang. 2023b. Frame-event alignment and fusion network for high frame rate tracking. In CVPR. 9781-9790.","journal-title":"CVPR."},{"key":"e_1_3_2_1_73_1","unstructured":"Qi Zhang Yifei Wang and Yisen Wang. 2023a. On the Generalization of Multi-modal Contrastive Learning. In ICML."},{"key":"e_1_3_2_1_74_1","first-page":"10394","article-title":"Deep supervised cross-modal retrieval","author":"Zhen Liangli","year":"2019","unstructured":"Liangli Zhen, Peng Hu, Xu Wang, and Dezhong Peng. 2019. Deep supervised cross-modal retrieval. In CVPR. 10394-10403.","journal-title":"CVPR."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7003"},{"key":"e_1_3_2_1_76_1","volume-title":"Aligning modalities in vision large language models via preference fine-tuning. arXiv preprint arXiv:2402.11411","author":"Zhou Yiyang","year":"2024","unstructured":"Yiyang Zhou, Chenhang Cui, Rafael Rafailov, Chelsea Finn, and Huaxiu Yao. 2024a. Aligning modalities in vision large language models via preference fine-tuning. arXiv preprint arXiv:2402.11411 (2024)."},{"key":"e_1_3_2_1_77_1","volume-title":"Where","author":"Zhou Yuchen","year":"2025","unstructured":"Yuchen Zhou, Jiayu Tang, Xiaoyan Xiao, Yueyao Lin, Linkai Liu, Zipeng Guo, Hao Fei, Xiaobo Xia, and Chao Gou. 2025a. Where, What, Why: Towards Explainable Driver Attention Prediction. arXiv preprint arXiv:2506.23088 (2025)."},{"key":"e_1_3_2_1_78_1","first-page":"3122","article-title":"Few-shot adversarial prompt learning on vision-language models","author":"Zhou Yiwei","year":"2024","unstructured":"Yiwei Zhou, Xiaobo Xia, Zhiwei Lin, Bo Han, and Tongliang Liu. 2024b. Few-shot adversarial prompt learning on vision-language models. In NeurIPS. 3122-3156.","journal-title":"NeurIPS."},{"key":"e_1_3_2_1_79_1","unstructured":"Zhenglin Zhou Xiaobo Xia Fan Ma Hehe Fan Yi Yang and Tat-Seng Chua. 2025b. DreamDPO: Aligning Text-to-3D Generation with Human Preferences via Direct Preference Optimization. In ICML."},{"key":"e_1_3_2_1_80_1","volume-title":"Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. In ICLR.","author":"Zhu Bin","year":"2024","unstructured":"Bin Zhu, Bin Lin, Munan Ning, Yang Yan, Jiaxi Cui, HongFa Wang, Yatian Pang, Wenhao Jiang, Junwu Zhang, Zongwei Li, et al., 2024. Languagebind: Extending video-language pretraining to n-modality by language-based semantic alignment. In ICLR."},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-021-1293-0"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758174","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:15:17Z","timestamp":1765307717000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758174"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":81,"alternative-id":["10.1145\/3746027.3758174","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758174","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}