{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,23]],"date-time":"2026-03-23T11:58:46Z","timestamp":1774267126800,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.62222211,No.61836002,No.62072397"],"award-info":[{"award-number":["No.62222211,No.61836002,No.62072397"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R\\&D Program of China","award":["No.2022ZD0162000"],"award-info":[{"award-number":["No.2022ZD0162000"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612291","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"4431-4439","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Rethinking Missing Modality Learning from a Decoding Perspective"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3564-1628","authenticated-orcid":false,"given":"Tao","family":"Jin","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9708-3225","authenticated-orcid":false,"given":"Xize","family":"Cheng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2734-0414","authenticated-orcid":false,"given":"Linjun","family":"Li","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3507-7070","authenticated-orcid":false,"given":"Wang","family":"Lin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3169-0211","authenticated-orcid":false,"given":"Ye","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6651-1802","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mutan: Multimodal tucker fusion for visual question answering. In ICCV.","author":"Ben-Younes Hedi","year":"2017","unstructured":"Hedi Ben-Younes, R\u00e9mi Cadene, Matthieu Cord, and Nicolas Thome. 2017. Mutan: Multimodal tucker fusion for visual question answering. In ICCV."},{"key":"e_1_3_2_1_2_1","volume-title":"OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment. arXiv preprint arXiv:2306.06410","author":"Cheng Xize","year":"2023","unstructured":"Xize Cheng, Tao Jin, Linjun Li, Wang Lin, Xinyu Duan, and Zhou Zhao. 2023. OpenSR: Open-Modality Speech Recognition via Maintaining Multi-Modality Alignment. arXiv preprint arXiv:2306.06410 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Gilles Degottex John Kane Thomas Drugman Tuomo Raitio and Stefan Scherer. 2014. COVAREP-A collaborative voice analysis repository for speech technologies. In ICASSP.","DOI":"10.1109\/ICASSP.2014.6853739"},{"key":"e_1_3_2_1_4_1","unstructured":"Tuong Do Thanh-Toan Do Huy Tran Erman Tjiputra and Quang D Tran. 2019. Compact trilinear interaction for visual question answering. In ICCV."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Valentin Gabeur Chen Sun Karteek Alahari and Cordelia Schmid. 2020. Multi-modal transformer for video retrieval. In ECCV.","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_6_1","unstructured":"Richard A Harshman et al. 1970. Foundations of the PARAFAC procedure: Models and conditions for an \"explanatory\" multimodal factor analysis. (1970)."},{"key":"e_1_3_2_1_7_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Jie Hu Li Shen and Gang Sun. 2018. Squeeze-and-excitation networks. In CVPR.","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_10_1","unstructured":"iMotions. 2017. Facial expression analysis."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531960"},{"key":"e_1_3_2_1_12_1","volume-title":"SBAT: Video captioning with sparse boundary-aware transformer. arXiv preprint arXiv:2007.11888","author":"Jin Tao","year":"2020","unstructured":"Tao Jin, Siyu Huang, Ming Chen, Yingming Li, and Zhongfei Zhang. 2020. SBAT: Video captioning with sparse boundary-aware transformer. arXiv preprint arXiv:2007.11888 (2020)."},{"key":"e_1_3_2_1_13_1","volume-title":"Low-rank hoca: Efficient high-order cross-modal attention for video captioning. arXiv preprint arXiv:1911.00212","author":"Jin Tao","year":"2019","unstructured":"Tao Jin, Siyu Huang, Yingming Li, and Zhongfei Zhang. 2019. Low-rank hoca: Efficient high-order cross-modal attention for video captioning. arXiv preprint arXiv:1911.00212 (2019)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.35"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.08.042"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475456"},{"key":"e_1_3_2_1_17_1","first-page":"9049","article-title":"Generalizable Multi-Linear Attention Network","volume":"34","author":"Jin Tao","year":"2021","unstructured":"Tao Jin and Zhou Zhao. 2021. Generalizable Multi-Linear Attention Network. Advances in Neural Information Processing Systems 34 (2021), 9049--9060.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.03.065"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548069"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.297"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01851"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.699"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3264524"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1152"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00804"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.836"},{"key":"e_1_3_2_1_28_1","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. In BMVC."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_2_1_30_1","volume-title":"Smil: Multimodal learning with severely missing modality. In AAAI.","author":"Ma Mengmeng","year":"2021","unstructured":"Mengmeng Ma, Jian Ren, Long Zhao, Sergey Tulyakov, Cathy Wu, and Xi Peng. 2021. Smil: Multimodal learning with severely missing modality. In AAAI."},{"key":"e_1_3_2_1_31_1","volume-title":"Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Srinivas Parthasarathy and Shiva Sundaram. 2020. Training strategies to handle missing modalities for audio-visual expression recognition. In ICMI.","DOI":"10.1145\/3395035.3425202"},{"key":"e_1_3_2_1_33_1","volume-title":"Glove: Global vectors for word representation. In EMNLP.","author":"Pennington Jeffrey","year":"2014","unstructured":"Jeffrey Pennington, Richard Socher, and Christopher D Manning. 2014. Glove: Global vectors for word representation. In EMNLP."},{"key":"e_1_3_2_1_34_1","unstructured":"Ver\u00f3nica P\u00e9rez-Rosas Rada Mihalcea and Louis-Philippe Morency. 2013. Utterance-level multimodal sentiment analysis. In ACL."},{"key":"e_1_3_2_1_35_1","volume-title":"Thomas Manzini, Louis-Philippe Morency, and Barnab\u00e1s P\u00f3czos.","author":"Pham Hai","year":"2019","unstructured":"Hai Pham, Paul Pu Liang, Thomas Manzini, Louis-Philippe Morency, and Barnab\u00e1s P\u00f3czos. 2019. Found in translation: Learning robust joint representations by cyclic translations between modalities. In AAAI."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Luan Tran Xiaoming Liu Jiayu Zhou and Rong Jin. 2017. Missing modalities imputation via cascaded residual autoencoder. In CVPR.","DOI":"10.1109\/CVPR.2017.528"},{"key":"e_1_3_2_1_38_1","volume-title":"J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov.","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai, Shaojie Bai, Paul Pu Liang, J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Multimodal transformer for unaligned multimodal language sequences. In ACL."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.621"},{"key":"e_1_3_2_1_40_1","volume-title":"Multimodal generative models for scalable weakly-supervised learning. Advances in Neural Information Processing Systems 31","author":"Wu Mike","year":"2018","unstructured":"Mike Wu, Noah Goodman, and Mike Wu. 2018. Multimodal generative models for scalable weakly-supervised learning. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_43_1","volume-title":"Speaker identification on the SCOTUS corpus. Journal of the Acoustical Society of America","author":"Yuan Jiahong","year":"2008","unstructured":"Jiahong Yuan and Mark Liberman. 2008. Speaker identification on the SCOTUS corpus. Journal of the Acoustical Society of America (2008)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_2_1_45_1","volume-title":"Navonil Mazumder, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency.","author":"Zadeh Amir","year":"2018","unstructured":"Amir Zadeh, Paul Pu Liang, Navonil Mazumder, Soujanya Poria, Erik Cambria, and Louis-Philippe Morency. 2018. Memory fusion network for multi-view sequential learning. In AAAI."},{"key":"e_1_3_2_1_46_1","volume-title":"MOSI: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259","author":"Zadeh Amir","year":"2016","unstructured":"Amir Zadeh, Rowan Zellers, Eli Pincus, and Louis-Philippe Morency. 2016. MOSI: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259 (2016)."},{"key":"e_1_3_2_1_47_1","volume-title":"Multi-modal sentiment intensity analysis in videos: Facial gestures and verbal messages","author":"Zadeh Amir","year":"2016","unstructured":"Amir Zadeh, Rowan Zellers, Eli Pincus, and Louis-Philippe Morency. 2016. Multi-modal sentiment intensity analysis in videos: Facial gestures and verbal messages. IEEE Intelligent Systems (2016)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Jinming Zhao Ruichen Li and Qin Jin. 2021. Missing modality imagination network for emotion recognition with uncertain missing modalities. In ACL.","DOI":"10.18653\/v1\/2021.acl-long.203"},{"key":"e_1_3_2_1_49_1","volume-title":"Places: A 10 million image database for scene recognition","author":"Zhou Bolei","year":"2017","unstructured":"Bolei Zhou, Agata Lapedriza, Aditya Khosla, Aude Oliva, and Antonio Torralba. 2017. Places: A 10 million image database for scene recognition. IEEE transactions on pattern analysis and machine intelligence (2017)."},{"key":"e_1_3_2_1_50_1","volume-title":"Chen Change Loy, and Ziwei Liu","author":"Zhou Kaiyang","year":"2021","unstructured":"Kaiyang Zhou, Chen Change Loy, and Ziwei Liu. 2021. Semi-supervised domain generalization with stochastic stylematch. arXiv preprint arXiv:2106.00592 (2021)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00261"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612291","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612291","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:02:49Z","timestamp":1755820969000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612291"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":51,"alternative-id":["10.1145\/3581783.3612291","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612291","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}