{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:36:14Z","timestamp":1765308974994,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","funder":[{"name":"STI 2030-Major Projects","award":["2022ZD0208500"],"award-info":[{"award-number":["2022ZD0208500"]}]},{"DOI":"10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376158"],"award-info":[{"award-number":["62376158"]}],"id":[{"id":"10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shanghai Municipal Science and Technology Major Project","award":["2021SHZDZX"],"award-info":[{"award-number":["2021SHZDZX"]}]},{"name":"Medical-Engineering Interdisciplinary Research Foundation of Shanghai Jiao Tong University “Jiao Tong Star” Program","award":["YG2023ZD25","YG2024ZD25","YG2024QNA03"],"award-info":[{"award-number":["YG2023ZD25","YG2024ZD25","YG2024QNA03"]}]},{"name":"Shanghai Pujiang Program","award":["22PJ1408600"],"award-info":[{"award-number":["22PJ1408600"]}]},{"name":"Shanghai Pilot Program for Basic Research - Shanghai Jiao Tong University","award":["21TQ1400203"],"award-info":[{"award-number":["21TQ1400203"]}]},{"name":"Shanghai Jiao Tong University 2030 Initiative"},{"name":"Shanghai Jiao Tong University SCS-Shanghai EmoRays Technology Co., Ltd Joint Laboratory of Affective Brain-Computer Interfaces."}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755459","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"5717-5725","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Emotion Recognition with Missing Modality via a Unified Multi-task Pre-training Framework"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8944-741X","authenticated-orcid":false,"given":"Ziyi","family":"Li","sequence":"first","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9474-6369","authenticated-orcid":false,"given":"Wei-Long","family":"Zheng","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8359-0058","authenticated-orcid":false,"given":"Bao-Liang","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Vesper: a compact and effective pretrained model for speech emotion recognition","author":"Chen Weidong","year":"2024","unstructured":"Weidong Chen, Xiaofen Xing, Peihao Chen, and Xiangmin Xu. 2024. Vesper: a compact and effective pretrained model for speech emotion recognition. IEEE Transactions on Affective Computing (2024)."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548367"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/NER.2013.6695876"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102218"},{"key":"e_1_3_2_1_5_1","volume-title":"Bin Hu, and Tong Zhang.","author":"Gong Xinrong","year":"2024","unstructured":"Xinrong Gong, CL Philip Chen, Bin Hu, and Tong Zhang. 2024. CiABL: Completeness-induced Adaptative Broad Learning for Cross-Subject Emotion Recognition with EEG and Eye Movement Signals. IEEE Transactions on Affective Computing (2024)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475583"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446937"},{"key":"e_1_3_2_1_8_1","volume-title":"SEED-VII: a Multimodal Dataset of Six Basic Emotions with Continuous Labels for Emotion Recognition","author":"Jiang Wei-Bang","year":"2024","unstructured":"Wei-Bang Jiang, Xuan-Hao Liu, Wei-Long Zheng, and Bao-Liang Lu. 2024b. SEED-VII: a Multimodal Dataset of Six Basic Emotions with Continuous Labels for Emotion Recognition. IEEE Transactions on Affective Computing (2024)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102019"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41551-022-00914-1"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207625"},{"key":"e_1_3_2_1_12_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023. BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In International Conference on Machine Learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_13_1","volume-title":"International Conference on Machine Learning. PMLR, 12888-12900","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022a. BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In International Conference on Machine Learning. PMLR, 12888-12900."},{"key":"e_1_3_2_1_14_1","first-page":"9694","article-title":"Align before fuse: vision and language representation learning with momentum distillation","volume":"34","author":"Li Junnan","year":"2021","unstructured":"Junnan Li, Ramprasaath Selvaraju, Akhilesh Gotmare, Shafiq Joty, Caiming Xiong, and Steven Chu Hong Hoi. 2021. Align before fuse: vision and language representation learning with momentum distillation. In Advances in Neural Information Processing Systems, Vol. 34. 9694-9705.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548243"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.102216"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2021.3071170"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/ac5c8d"},{"key":"e_1_3_2_1_19_1","volume-title":"Emotion Recognition Using Multimodal Deep Learning. In International Conference on Neural Information Processing.","author":"Liu W.","year":"2016","unstructured":"W. Liu, Wei-Long Zheng, and Bao-Liang Lu. 2016. Emotion Recognition Using Multimodal Deep Learning. In International Conference on Neural Information Processing."},{"key":"e_1_3_2_1_20_1","first-page":"1170","volume-title":"International Joint Conference on Artificial Intelligence","volume":"15","author":"Lu Yifei","year":"2015","unstructured":"Yifei Lu, Wei-Long Zheng, Binbin Li, and Bao-Liang Lu. 2015. Combining Eye Movements and EEG to Enhance Emotion Recognition.. In International Joint Conference on Artificial Intelligence, Vol. 15. Buenos Aires, 1170-1176."},{"key":"e_1_3_2_1_21_1","volume-title":"A transformer-based model with self-distillation for multimodal emotion recognition in conversations","author":"Ma Hui","year":"2023","unstructured":"Hui Ma, Jian Wang, Hongfei Lin, Bo Zhang, Yijia Zhang, and Bo Xu. 2023. A transformer-based model with self-distillation for multimodal emotion recognition in conversations. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"Representation learning with contrastive predictive coding. ArXiv Preprint ArXiv:1807.03748","author":"van den Oord Aaron","year":"2018","unstructured":"Aaron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation learning with contrastive predictive coding. ArXiv Preprint ArXiv:1807.03748 (2018)."},{"key":"e_1_3_2_1_23_1","volume-title":"International Conference on Machine Learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1146\/annurev-clinpsy-032816-045252"},{"key":"e_1_3_2_1_25_1","volume-title":"Real-time facial emotion recognition model based on kernel autoencoder and convolutional neural network for autism children. Soft Computing","author":"Talaat Fatma M","year":"2024","unstructured":"Fatma M Talaat, Zainab H Ali, Reham R Mostafa, and Nora El-Rashidy. 2024. Real-time facial emotion recognition model based on kernel autoencoder and convolutional neural network for autism children. Soft Computing (2024), 1-14."},{"key":"e_1_3_2_1_26_1","volume-title":"Annual Conference on Neural Information Processing Systems.","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam M. Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01524"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM52615.2021.9669556"},{"key":"e_1_3_2_1_30_1","first-page":"17117","article-title":"Incomplete multimodality-diffused emotion recognition","volume":"36","author":"Wang Yuanzhi","year":"2023","unstructured":"Yuanzhi Wang, Yong Li, and Zhen Cui. 2023c. Incomplete multimodality-diffused emotion recognition. Advances in Neural Information Processing Systems, Vol. 36 (2023), 17117-17128.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2023.3263907"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681683"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475701"},{"key":"e_1_3_2_1_34_1","volume-title":"Identifying Gender Differences in Multimodal Emotion Recognition Using Bimodal Deep AutoEncoder. In International Conference on Neural Information Processing.","author":"Yan Xue","year":"2017","unstructured":"Xue Yan, Wei-Long Zheng, W. Liu, and Bao-Liang Lu. 2017. Identifying Gender Differences in Multimodal Emotion Recognition Using Bimodal Deep AutoEncoder. In International Conference on Neural Information Processing."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02592"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.203"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/EMBC.2014.6944757"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2797176"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755459","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:32:30Z","timestamp":1765308750000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755459"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":38,"alternative-id":["10.1145\/3746027.3755459","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755459","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}