{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T10:44:32Z","timestamp":1773398672456,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"The research funding of GuangZhou DaYou Network Technology Co., Ltd."}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681683","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"438-446","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["Leveraging Knowledge of Modality Experts for Incomplete Multimodal Learning"],"prefix":"10.1145",
"author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0813-5969","authenticated-orcid":false,"given":"Wenxin","family":"Xu","sequence":"first","affiliation":[{"name":"Guangzhou Institute of Technology, Xidian University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5287-4170","authenticated-orcid":false,"given":"Hexin","family":"Jiang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University, Xi'an, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1448-0477","authenticated-orcid":false,"given":"Xuefeng","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence & Guangzhou Institute of Technology, Xidian University, Xi'an, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},
"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1095"},
{"key":"e_1_3_2_1_2_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltruvsaitis Tadas","year":"2018","unstructured":"Tadas Baltruvsaitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 2 (2018), 423--443."},
{"key":"e_1_3_2_1_3_1","volume-title":"IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation","author":"Busso Carlos","year":"2008","unstructured":"Carlos Busso, Murtaza Bulut, Chi-Chun Lee, Abe Kazemzadeh, Emily Mower, Samuel Kim, Jeannette N Chang, Sungbok Lee, and Shrikanth S Narayanan. 2008. IEMOCAP: Interactive emotional dyadic motion capture database. Language resources and evaluation, Vol. 42 (2008), 335--359."},
{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219963"},
{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/79.911197"},
{"key":"e_1_3_2_1_6_1","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2022. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, Vol. 23, 120 (2022), 1--39.","journal-title":"Journal of Machine Learning Research"},
{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.02.001"},
{"key":"e_1_3_2_1_8_1","volume-title":"2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 2822--2826","author":"Goyal Ankit","unstructured":"Ankit Goyal, Naveen Kumar, Tanaya Guha, and Shrikanth S. Narayanan. 2016. A multimodal mixture-of-experts model for dynamic emotion prediction in movies. In 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). 2822--2826."},
{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"He Pengcheng","year":"2021","unstructured":"Pengcheng He, Xiaodong Liu, Jianfeng Gao, and Weizhu Chen. 2021. Deberta: Decoding-enhanced bert with disentangled attention. In International Conference on Learning Representations (ICLR)."},
{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Elsa A Kirchner Stephen H Fairclough and Frank Kirchner. 2019. Embedded multimodal interfaces in robotics: applications future trends and societal implications. In The Handbook of Multimodal-Multisensor Interfaces: Language Processing Software Commercialization and Emerging Directions-Volume 3. 523--576.","DOI":"10.1145\/3233795.3233810"},
{"key":"e_1_3_2_1_11_1","volume-title":"International Conference on Learning Representations (ICLR).","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. Gshard: Scaling giant models with conditional computation and automatic sharding. In International Conference on Learning Representations (ICLR)."},
{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00641"},
{"key":"e_1_3_2_1_13_1","volume-title":"Gcnet: Graph completion network for incomplete multimodal learning in conversation","author":"Lian Zheng","year":"2023","unstructured":"Zheng Lian, Lan Chen, Licai Sun, Bin Liu, and Jianhua Tao. 2023. Gcnet: Graph completion network for incomplete multimodal learning in conversation. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},
{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2022.103714"},
{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102216"},
{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101973"},
{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-27818-1_34"},
{"key":"e_1_3_2_1_18_1","volume-title":"Maximum likelihood estimation for multimodal learning with missing modality. arXiv preprint arXiv:2108.10513","author":"Ma Fei","year":"2021","unstructured":"Fei Ma, Xiangxiang Xu, Shao-Lun Huang, and Lin Zhang. 2021. Maximum likelihood estimation for multimodal learning with missing modality. arXiv preprint arXiv:2108.10513 (2021)."},
{"key":"e_1_3_2_1_19_1","first-page":"9564","article-title":"Multimodal contrastive learning with limoe: the language-image mixture of experts","volume":"35","author":"Mustafa Basil","year":"2022","unstructured":"Basil Mustafa, Carlos Riquelme, Joan Puigcerver, Rodolphe Jenatton, and Neil Houlsby. 2022. Multimodal contrastive learning with limoe: the language-image mixture of experts. Advances in Neural Information Processing Systems (NeurIPS), Vol. 35, 9564--9576.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},
{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3395035.3425202"},
{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33016892"},
{"key":"e_1_3_2_1_22_1","first-page":"8583","article-title":"Scaling vision with sparse mixture of experts","volume":"34","author":"Riquelme Carlos","year":"2021","unstructured":"Carlos Riquelme, Joan Puigcerver, Basil Mustafa, Maxim Neumann, Rodolphe Jenatton, Andr\u00e9 Susano Pinto, Daniel Keysers, and Neil Houlsby. 2021. Scaling vision with sparse mixture of experts. Advances in Neural Information Processing Systems (NeurIPS), Vol. 34, 8583--8595.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},
{"key":"e_1_3_2_1_23_1","volume-title":"wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862","author":"Schneider Steffen","year":"2019","unstructured":"Steffen Schneider, Alexei Baevski, Ronan Collobert, and Michael Auli. 2019. wav2vec: Unsupervised pre-training for speech recognition. arXiv preprint arXiv:1904.05862 (2019)."},
{"key":"e_1_3_2_1_24_1","volume-title":"Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations (ICLR).","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, *Azalia Mirhoseini, *Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer. In International Conference on Learning Representations (ICLR)."},
{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.758"},
{"key":"e_1_3_2_1_26_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"32","author":"Shi Yuge","year":"2019","unstructured":"Yuge Shi, Brooks Paige, Philip Torr, et al. 2019. Variational mixture-of-experts autoencoders for multi-modal deep generative models. Advances in Neural Information Processing Systems (NeurIPS), Vol. 32 (2019)."},
{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2020.3047978"},
{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.528"},
{"key":"e_1_3_2_1_29_1","volume-title":"J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov.","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In Proceedings of the Association for Computational Linguistics (ACL), Vol. 2019. 6558."},
{"key":"e_1_3_2_1_30_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 30."},
{"key":"e_1_3_2_1_31_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"36","author":"Wang Yuanzhi","year":"2024","unstructured":"Yuanzhi Wang, Yong Li, and Zhen Cui. 2024. Incomplete multimodality-diffused emotion recognition. In Advances in Neural Information Processing Systems (NeurIPS), Vol. 36."},
{"key":"e_1_3_2_1_32_1","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"31","author":"Wu Mike","year":"2018","unstructured":"Mike Wu and Noah Goodman. 2018. Multimodal generative models for scalable weakly-supervised learning. Advances in Neural Information Processing Systems (NeurIPS), Vol. 31 (2018)."},
{"key":"e_1_3_2_1_33_1","volume-title":"Multimodal learning with transformers: A survey","author":"Xu Peng","year":"2023","unstructured":"Peng Xu, Xiatian Zhu, and David A Clifton. 2023. Multimodal learning with transformers: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence (2023)."},
{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2016.94"},
{"key":"e_1_3_2_1_35_1","volume-title":"Soujanya Poria, Erik Cambria, and Louis-Philippe Morency.","author":"Bagher Zadeh AmirAli","year":"2018","unstructured":"AmirAli Bagher Zadeh, Paul Pu Liang, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2018. Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In Proceedings of the Association for Computational Linguistics (ACL). 2236--2246."},
{"key":"e_1_3_2_1_36_1","volume-title":"Huazhu Fu, and Qinghua Hu.","author":"Zhang Changqing","year":"2020","unstructured":"Changqing Zhang, Yajie Cui, Zongbo Han, Joey Tianyi Zhou, Huazhu Fu, and Qinghua Hu. 2020. Deep partial multi-view learning. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 5 (2020), 2402--2415."},
{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2016.2603342"},
{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.203"},
{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3106895"},
{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3093397"},
{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095836"}],
"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],
"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681683","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681683","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],
"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:50Z","timestamp":1750295870000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681683"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3681683","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681683","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}