{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:11:21Z","timestamp":1776888681418,"version":"3.51.2"},"reference-count":111,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,7,29]],"date-time":"2024-07-29T00:00:00Z","timestamp":1722211200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,7,29]],"date-time":"2024-07-29T00:00:00Z","timestamp":1722211200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62376151"],"award-info":[{"award-number":["62376151"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62376151"],"award-info":[{"award-number":["62376151"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"crossref","award":["22DZ2205600"],"award-info":[{"award-number":["22DZ2205600"]}],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"crossref","award":["22DZ2205600"],"award-info":[{"award-number":["22DZ2205600"]}],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s00530-024-01422-9","type":"journal-article","created":{"date-parts":[[2024,7,29]],"date-time":"2024-07-29T07:46:41Z","timestamp":1722239201000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":20,"title":["A survey of multimodal federated learning: background, applications, and perspectives"],"prefix":"10.1007","volume":"30","author":[{"given":"Hao","family":"Pan","sequence":"first","affiliation":[]},{"given":"Xiaoli","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Lipeng","family":"He","sequence":"additional","affiliation":[]},{"given":"Yicong","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Xiaogang","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,7,29]]},"reference":[{"key":"1422_CR1","doi-asserted-by":"crossref","unstructured":"Cai, Y., Cai, H., Wan, X.: Multi-modal sarcasm detection in twitter with hierarchical fusion model. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 2506\u20132515 (2019)","DOI":"10.18653\/v1\/P19-1239"},{"key":"1422_CR2","doi-asserted-by":"crossref","unstructured":"Castro, S., Hazarika, D., P\u00e9rez-Rosas, V., Zimmermann, R., Mihalcea, R., Poria, S.: Towards multimodal sarcasm detection (an _obviously_ perfect paper). Preprint at arXiv:1906.01815\u00a0(2019)","DOI":"10.18653\/v1\/P19-1455"},{"key":"1422_CR3","unstructured":"Yu, Q., Liu, Y., Wang, Y., Xu, K., Liu, J.: Multimodal federated learning via contrastive representation ensemble. Preprint at \u00a0arXiv:2302.08888v3\u00a0(2023)"},{"key":"1422_CR4","unstructured":"Thrasher, J., Devkota, A., Siwakotai, P., Chivukula, R., Poudel, P., Hu, C., Bhattarai, B., Gyawali, P.: Multimodal federated learning in healthcare: a review. Preprint at\u00a0arXiv:2310.09650\u00a0(2023)"},{"key":"1422_CR5","unstructured":"Wang, K., Yin, Q., Wang, W., Wu, S., Wang, L.: A comprehensive survey on cross-modal retrieval. Preprint at arXiv:1607.06215\u00a0\u00a0(2016)"},{"issue":"3","key":"1422_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3617592","volume":"56","author":"T Ghandi","year":"2023","unstructured":"Ghandi, T., Pourreza, H., Mahyar, H.: Deep learning approaches on image captioning: a review. ACM Comput. Surv. 56(3), 1\u201339 (2023)","journal-title":"ACM Comput. Surv."},{"key":"1422_CR7","doi-asserted-by":"publisher","first-page":"107567","DOI":"10.1016\/j.patcog.2020.107567","volume":"109","author":"T Hussain","year":"2021","unstructured":"Hussain, T., Muhammad, K., Ding, W., Lloret, J., Baik, S.W., Albuquerque, V.H.C.: A comprehensive survey of multi-view video summarization. Pattern Recogn. 109, 107567 (2021)","journal-title":"Pattern Recogn."},{"key":"1422_CR8","doi-asserted-by":"crossref","unstructured":"Liang, P.P., Liu, T., Cai, A., Muszynski, M., Ishii, R., Allen, N., Auerbach, R., Brent, D., Salakhutdinov, R., Morency, L.-P.: Learning language and multimodal privacy-preserving markers of mood from mobile data. Preprint at\u00a0arXiv:2106.13213\u00a0(2021)","DOI":"10.18653\/v1\/2021.acl-long.322"},{"issue":"10","key":"1422_CR9","doi-asserted-by":"publisher","first-page":"2598","DOI":"10.1109\/TMI.2022.3167808","volume":"41","author":"O Dalmaz","year":"2022","unstructured":"Dalmaz, O., Yurt, M., \u00c7ukur, T.: Resvit: Residual vision transformers for multimodal medical image synthesis. IEEE Trans. Med. Imaging 41(10), 2598\u20132614 (2022)","journal-title":"IEEE Trans. Med. Imaging"},{"issue":"1\u20132","key":"1422_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2200000083","volume":"14","author":"P Kairouz","year":"2021","unstructured":"Kairouz, P., McMahan, H.B., Avent, B., Bellet, A., Bennis, M., Bhagoji, A.N., Bonawitz, K., Charles, Z., Cormode, G., Cummings, R., et al.: Advances and open problems in federated learning. Found. Trends\u00ae Mach. Learn. 14(1\u20132), 1\u2013210 (2021)","journal-title":"Found. Trends\u00ae Mach. Learn."},{"key":"1422_CR11","unstructured":"McMahan, B., Moore, E., Ramage, D., Hampson, S., Arcas, B.A.: Communication-efficient learning of deep networks from decentralized data. In: Artificial Intelligence and Statistics, pp. 1273\u20131282\u00a0PMLR, (2017)"},{"key":"1422_CR12","first-page":"429","volume":"2","author":"T Li","year":"2020","unstructured":"Li, T., Sahu, A.K., Zaheer, M., Sanjabi, M., Talwalkar, A., Smith, V.: Federated optimization in heterogeneous networks. Proc. Mach. Learn. syst. 2, 429\u2013450 (2020)","journal-title":"Proc. Mach. Learn. syst."},{"key":"1422_CR13","unstructured":"Karimireddy, S.P., Kale, S., Mohri, M., Reddi, S., Stich, S., Suresh, A.T.: Scaffold: Stochastic controlled averaging for federated learning. In: International Conference on Machine Learning,\u00a0 pp. 5132\u20135143 PMLR, (2020)"},{"key":"1422_CR14","unstructured":"Li, X., Jiang, M., Zhang, X., Kamp, M., Dou, Q.: Fedbn: Federated learning on non-iid features via local batch normalization. Preprint at arXiv:2102.07623\u00a0(2021)"},{"key":"1422_CR15","first-page":"7611","volume":"33","author":"J Wang","year":"2020","unstructured":"Wang, J., Liu, Q., Liang, H., Joshi, G., Poor, H.V.: Tackling the objective inconsistency problem in heterogeneous federated optimization. Adv. Neural Inform. Proc. Syst. 33, 7611\u20137623 (2020)","journal-title":"Adv. Neural Inform. Proc. Syst."},{"issue":"2","key":"1422_CR16","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltru\u0161aitis","year":"2018","unstructured":"Baltru\u0161aitis, T., Ahuja, C., Morency, L.-P.: Multimodal machine learning: a survey and taxonomy. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 423\u2013443 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1422_CR17","doi-asserted-by":"crossref","unstructured":"Chen, S., Li, B.: Towards optimal multi-modal federated learning on non-iid data with hierarchical gradient blending. In: IEEE INFOCOM 2022-IEEE Conference on Computer Communications, pp. 1469\u20131478\u00a0IEEE, (2022)","DOI":"10.1109\/INFOCOM48880.2022.9796724"},{"issue":"15","key":"1422_CR18","doi-asserted-by":"publisher","first-page":"6986","DOI":"10.3390\/s23156986","volume":"23","author":"L Che","year":"2023","unstructured":"Che, L., Wang, J., Zhou, Y., Ma, F.: Multimodal federated learning: a survey. Sensors 23(15), 6986 (2023)","journal-title":"Sensors"},{"key":"1422_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755\u00a0Springer, (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1422_CR20","doi-asserted-by":"crossref","unstructured":"Tan, A.Z., Yu, H., Cui, L., Yang, Q.: Towards personalized federated learning. IEEE Transactions on Neural Networks and Learning Systems (2022)","DOI":"10.1109\/TNNLS.2022.3160699"},{"key":"1422_CR21","unstructured":"Smith, V., Chiang, C.-K., Sanjabi, M., Talwalkar, A.S.: Federated multi-task learning. Advances in neural information processing systems 30 (2017)"},{"issue":"5","key":"1422_CR22","doi-asserted-by":"publisher","first-page":"829","DOI":"10.1162\/neco_a_01273","volume":"32","author":"J Gao","year":"2020","unstructured":"Gao, J., Li, P., Chen, Z., Zhang, J.: A survey on deep learning for multimodal data fusion. Neural Comput. 32(5), 829\u2013864 (2020)","journal-title":"Neural Comput."},{"key":"1422_CR23","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1422_CR24","doi-asserted-by":"crossref","unstructured":"Cour, T., Jordan, C., Miltsakaki, E., Taskar, B.: Movie\/script: Alignment and parsing of video and text transcription. In: Computer Vision\u2013ECCV 2008: 10th European Conference on Computer Vision, Marseille, France, October 12-18, 2008, Proceedings, Part IV 10, pp. 158\u2013171\u00a0Springer, (2008)","DOI":"10.1007\/978-3-540-88693-8_12"},{"issue":"11","key":"1422_CR25","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"issue":"6","key":"1422_CR26","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Commun. ACM 60(6), 84\u201390 (2017)","journal-title":"Commun. ACM"},{"key":"1422_CR27","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1422_CR28","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. Preprint at arXiv:1409.1556\u00a0(2014)"},{"key":"1422_CR29","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. Preprint at\u00a0arXiv:2010.11929\u00a0(2020)"},{"issue":"8","key":"1422_CR30","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"1422_CR31","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. Preprint at\u00a0arXiv:1412.3555 (2014)"},{"key":"1422_CR32","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. Advances in neural information processing systems 27 (2014)"},{"key":"1422_CR33","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. Preprint at\u00a0arXiv:1301.3781\u00a0(2013)"},{"key":"1422_CR34","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Advances in Neural Information Processing Systems 30,(2017)"},{"key":"1422_CR35","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607 PMLR (2020)"},{"key":"1422_CR36","doi-asserted-by":"crossref","unstructured":"Feng, T., Bose, D., Zhang, T., Hebbar, R., Ramakrishna, A., Gupta, R., Zhang, M., Avestimehr, S., Narayanan, S.: Fedmultimodal: A benchmark for multimodal federated learning.\u00a0 Preprint at\u00a0arXiv:2306.09486\u00a0(2023)","DOI":"10.1145\/3580305.3599825"},{"key":"1422_CR37","doi-asserted-by":"publisher","first-page":"1186","DOI":"10.1016\/j.neucom.2017.09.065","volume":"275","author":"N Zhang","year":"2018","unstructured":"Zhang, N., Ding, S., Zhang, J., Xue, Y.: An overview on restricted boltzmann machines. Neurocomputing 275, 1186\u20131199 (2018)","journal-title":"Neurocomputing"},{"key":"1422_CR38","unstructured":"Tschannen, M., Bachem, O., Lucic, M.: Recent advances in autoencoder-based representation learning. Preprint at\u00a0arXiv:1812.05069\u00a0(2018)"},{"key":"1422_CR39","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1016\/j.inffus.2021.06.007","volume":"76","author":"G Muhammad","year":"2021","unstructured":"Muhammad, G., Alshehri, F., Karray, F., El Saddik, A., Alsulaiman, M., Falk, T.H.: A comprehensive survey on multimodal medical signals fusion for smart healthcare systems. Inform. Fusion 76, 355\u2013375 (2021)","journal-title":"Inform. Fusion"},{"key":"1422_CR40","unstructured":"Huang, K., Shi, B., Li, X., Li, X., Huang, S., Li, Y.: Multi-modal sensor fusion for auto driving perception: A survey. Preprint at\u00a0arXiv:2202.02703\u00a0(2022)"},{"key":"1422_CR41","doi-asserted-by":"publisher","first-page":"101890","DOI":"10.1016\/j.inffus.2023.101890","volume":"99","author":"P Qi","year":"2023","unstructured":"Qi, P., Chiaro, D., Piccialli, F.: Fl-fd: Federated learning-based fall detection with multimodal data fusion. Inform. Fusion 99, 101890 (2023)","journal-title":"Inform. Fusion"},{"key":"1422_CR42","unstructured":"Jaggi, M., Smith, V., Tak\u00e1c, M., Terhorst, J., Krishnan, S., Hofmann, T., Jordan, M.I.: Communication-efficient distributed dual coordinate ascent. Advances in Neural Information Processing Systems 27 (2014)"},{"key":"1422_CR43","unstructured":"Ma, C., Smith, V., Jaggi, M., Jordan, M., Richt\u00e1rik, P., Tak\u00e1c, M.: Adding vs. averaging in distributed primal-dual optimization. In: International Conference on Machine Learning, pp. 1973\u20131982\u00a0 PMLR, (2015)"},{"issue":"3","key":"1422_CR44","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3625558","volume":"56","author":"M Ye","year":"2023","unstructured":"Ye, M., Fang, X., Du, B., Yuen, P.C., Tao, D.: Heterogeneous federated learning: state-of-the-art and research challenges. ACM Comput. Surv. 56(3), 1\u201344 (2023)","journal-title":"ACM Comput. Surv."},{"issue":"2","key":"1422_CR45","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1109\/JSAIT.2022.3205475","volume":"3","author":"A Reisizadeh","year":"2022","unstructured":"Reisizadeh, A., Tziotis, I., Hassani, H., Mokhtari, A., Pedarsani, R.: Straggler-resilient federated learning: Leveraging the interplay between statistical accuracy and system heterogeneity. IEEE J. Selected Areas Inform. Theory 3(2), 197\u2013205 (2022)","journal-title":"IEEE J. Selected Areas Inform. Theory"},{"key":"1422_CR46","doi-asserted-by":"crossref","unstructured":"Chen, J., Zhang, A.: Fedmsplit: Correlation-adaptive federated multi-task learning across multimodal split networks. In: Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 87\u201396 (2022)","DOI":"10.1145\/3534678.3539384"},{"key":"1422_CR47","doi-asserted-by":"crossref","unstructured":"Liu, Y., Kang, Y., Zou, T., Pu, Y., He, Y., Ye, X., Ouyang, Y., Zhang, Y.-Q., Yang, Q.: Vertical federated learning: Concepts, advances, and challenges. IEEE Transactions on Knowledge and Data Engineering (2024)","DOI":"10.1109\/TKDE.2024.3352628"},{"issue":"2","key":"1422_CR48","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3298981","volume":"10","author":"Q Yang","year":"2019","unstructured":"Yang, Q., Liu, Y., Chen, T., Tong, Y.: Federated machine learning: concept and applications. ACM Trans. Intell. Syst. Technol. (TIST) 10(2), 1\u201319 (2019)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"key":"1422_CR49","doi-asserted-by":"crossref","unstructured":"Liu, F., Wu, X., Ge, S., Fan, W., Zou, Y.: Federated learning for vision-and-language grounding problems. In: Proceedings of the AAAI Conference on Artificial Intelligence, 34, pp. 11572\u201311579 (2020)","DOI":"10.1609\/aaai.v34i07.6824"},{"key":"1422_CR50","first-page":"1","volume":"4","author":"Y-M Lin","year":"2023","unstructured":"Lin, Y.-M., Gao, Y., Gong, M.-G., Zhang, S.-J., Zhang, Y.-Q., Li, Z.-Y.: Federated learning on multimodal data: a comprehensive survey. Mach. Intell. Res. 4, 1\u201315 (2023)","journal-title":"Mach. Intell. Res."},{"key":"1422_CR51","doi-asserted-by":"crossref","unstructured":"Chen, J., Zhang, A.: On disentanglement of asymmetrical knowledge transfer for modality-task agnostic federated learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, 38, pp. 11311\u201311319 (2024)","DOI":"10.1609\/aaai.v38i10.29010"},{"key":"1422_CR52","first-page":"3557","volume":"33","author":"A Fallah","year":"2020","unstructured":"Fallah, A., Mokhtari, A., Ozdaglar, A.: Personalized federated learning with theoretical guarantees: A model-agnostic meta-learning approach. Adv. Neural Inform. Proc. Syst. 33, 3557\u20133568 (2020)","journal-title":"Adv. Neural Inform. Proc. Syst."},{"key":"1422_CR53","unstructured":"Liang, P.P., Liu, T., Ziyin, L., Allen, N.B., Auerbach, R.P., Brent, D., Salakhutdinov, R., Morency, L.-P.: Think locally, act globally: Federated learning with local and global representations. Preprint at\u00a0arXiv:2001.01523\u00a0(2020)"},{"key":"1422_CR54","doi-asserted-by":"crossref","unstructured":"Yang, X., Xiong, B., Huang, Y., Xu, C.: Cross-modal federated human activity recognition via modality-agnostic and modality-specific representation learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, 36, pp. 3063\u20133071 (2022)","DOI":"10.1609\/aaai.v36i3.20213"},{"key":"1422_CR55","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/OJCS.2022.3206407","volume":"3","author":"A Qayyum","year":"2022","unstructured":"Qayyum, A., Ahmad, K., Ahsan, M.A., Al-Fuqaha, A., Qadir, J.: Collaborative federated learning for healthcare: Multi-modal covid-19 diagnosis at the edge. IEEE Open J. Comput. Soc. 3, 172\u2013184 (2022)","journal-title":"IEEE Open J. Comput. Soc."},{"key":"1422_CR56","doi-asserted-by":"crossref","unstructured":"Li, D., Xie, W., Li, Y., Fang, L.: Fedfusion: Manifold driven federated learning for multi-satellite and multi-modality fusion. IEEE Transactions on Geoscience and Remote Sensing (2023)","DOI":"10.1109\/TGRS.2023.3339522"},{"key":"1422_CR57","doi-asserted-by":"crossref","unstructured":"Yang, X., Xiong, B., Huang, Y., Xu, C.: Cross-modal federated human activity recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)","DOI":"10.1109\/TPAMI.2024.3367412"},{"key":"1422_CR58","doi-asserted-by":"crossref","unstructured":"Dai, Q., Wei, D., Liu, H., Sun, J., Wang, L., Zheng, Y.: Federated modality-specific encoders and multimodal anchors for personalized brain tumor segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, 38, pp. 1445\u20131453 (2024)","DOI":"10.1609\/aaai.v38i2.27909"},{"key":"1422_CR59","doi-asserted-by":"crossref","unstructured":"Agbley, B.L.Y., Li, J., Haq, A.U., Bankas, E.K., Ahmad, S., Agyemang, I.O., Kulevome, D., Ndiaye, W.D., Cobbinah, B., Latipova, S.: Multimodal melanoma detection with federated learning. In: 2021 18th International Computer Conference on Wavelet Active Media Technology and Information Processing (ICCWAMTIP), pp. 238\u2013244\u00a0IEEE, (2021)","DOI":"10.1109\/ICCWAMTIP53232.2021.9674116"},{"key":"1422_CR60","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1016\/j.neucom.2022.01.063","volume":"480","author":"B Xiong","year":"2022","unstructured":"Xiong, B., Yang, X., Qi, F., Xu, C.: A unified framework for multi-modal federated learning. Neurocomputing 480, 110\u2013118 (2022)","journal-title":"Neurocomputing"},{"key":"1422_CR61","doi-asserted-by":"crossref","unstructured":"Zheng, T., Li, A., Chen, Z., Wang, H., Luo, J.: Autofed: Heterogeneity-aware federated multimodal learning for robust autonomous driving. Preprint at\u00a0arXiv:2302.08646\u00a0(2023)","DOI":"10.1145\/3570361.3592517"},{"key":"1422_CR62","unstructured":"Lu, W., Hu, X., Wang, J., Xie, X.: Fedclip: Fast generalization and personalization for clip in federated learning. Preprint at\u00a0arXiv:2302.13485\u00a0(2023)"},{"key":"1422_CR63","doi-asserted-by":"crossref","unstructured":"Ouyang, X., Xie, Z., Fu, H., Cheng, S., Pan, L., Ling, N., Xing, G., Zhou, J., Huang, J.: Harmony: Heterogeneous multi-modal federated learning through disentangled model training. In: Proceedings of the 21st Annual International Conference on Mobile Systems, Applications and Services, pp. 530\u2013543 (2023)","DOI":"10.1145\/3581791.3596844"},{"key":"1422_CR64","doi-asserted-by":"publisher","first-page":"102342","DOI":"10.1016\/j.compmedimag.2024.102342","volume":"113","author":"J Chen","year":"2024","unstructured":"Chen, J., Pan, R.: Medical report generation based on multimodal federated learning. Comput. Med. Imaging Graph. 113, 102342 (2024)","journal-title":"Comput. Med. Imaging Graph."},{"key":"1422_CR65","doi-asserted-by":"crossref","unstructured":"Zong, L., Xie, Q., Zhou, J., Wu, P., Zhang, X., Xu, B.: Fedcmr: Federated cross-modal retrieval. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1672\u20131676 (2021)","DOI":"10.1145\/3404835.3462989"},{"key":"1422_CR66","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Barnaghi, P., Haddadi, H.: Multimodal federated learning on iot data. In: 2022 IEEE\/ACM Seventh International Conference on Internet-of-Things Design and Implementation (IoTDI), pp. 43\u201354\u00a0IEEE, (2022)","DOI":"10.1109\/IoTDI54339.2022.00011"},{"key":"1422_CR67","doi-asserted-by":"crossref","unstructured":"Le, H.Q., Nguyen, M.N., Thwal, C.M., Qiao, Y., Zhang, C., Hong, C.S.: Fedmekt: Distillation-based embedding knowledge transfer for multimodal federated learning. Preprint at\u00a0arXiv:2307.13214\u00a0(2023)","DOI":"10.2139\/ssrn.4641403"},{"key":"1422_CR68","doi-asserted-by":"crossref","unstructured":"Guo, T., Guo, S., Wang, J.: pfedprompt: Learning personalized prompt for vision-language models in federated learning. In: Proceedings of the ACM Web Conference 2023, pp. 1364\u20131374 (2023)","DOI":"10.1145\/3543507.3583518"},{"key":"1422_CR69","unstructured":"Bao, G., Zhang, Q., Miao, D., Gong, Z., Hu, L.: Multimodal federated learning with missing modality via prototype mask and contrast. Preprint at\u00a0arXiv:2312.13508  (2023)"},{"key":"1422_CR70","doi-asserted-by":"publisher","DOI":"10.1145\/3657291","author":"S Yu","year":"2024","unstructured":"Yu, S., Yang, Q., Wang, J., Wu, C.: Fedusl,: A federated annotation method for driving fatigue detection based on multimodal sensing data. ACM Trans. Sensor Netw. (2024). https:\/\/doi.org\/10.1145\/3657291","journal-title":"ACM Trans. Sensor Netw."},{"key":"1422_CR71","doi-asserted-by":"crossref","unstructured":"Gong, M., Zhang, Y., Gao, Y., Qin, A., Wu, Y., Wang, S., Zhang, Y.: A multi-modal vertical federated learning framework based on homomorphic encryption. IEEE Transactions on Information Forensics and Security (2023)","DOI":"10.1109\/TIFS.2023.3340994"},{"key":"1422_CR72","doi-asserted-by":"crossref","unstructured":"Tan, M., Feng, Y., Chu, L., Shi, J., Xiao, R., Tang, H., Yu, J.: Fedsea: Federated learning via selective feature alignment for non-iid multimodal data. IEEE Transactions on Multimedia (2023)","DOI":"10.1109\/TMM.2023.3340109"},{"key":"1422_CR73","unstructured":"Yuan, L., Han, D.-J., Wang, S., Upadhyay, D., Brinton, C.G.: Communication-efficient multimodal federated learning: Joint modality and client selection. Preprint at\u00a0arXiv:2401.16685\u00a0(2024)"},{"key":"1422_CR74","unstructured":"Finn, C., Abbeel, P., Levine, S.: Model-agnostic meta-learning for fast adaptation of deep networks. In: International Conference on Machine Learning, pp. 1126\u20131135\u00a0PMLR, (2017).\u00a0"},{"issue":"1","key":"1422_CR75","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s00530-021-00875-6","volume":"29","author":"M Hu","year":"2023","unstructured":"Hu, M., Luo, M., Huang, M., Meng, W., Xiong, B., Yang, X., Sang, J.: Towards a multimodal human activity dataset for healthcare. Multimed. Syst. 29(1), 1\u201313 (2023)","journal-title":"Multimed. Syst."},{"key":"1422_CR76","unstructured":"Reddi, S., Charles, Z., Zaheer, M., Garrett, Z., Rush, K., Kone\u010dn\u1ef3, J., Kumar, S., McMahan, H.B.: Adaptive federated optimization. Preprint at\u00a0arXiv:2003.00295\u00a0(2020)"},{"key":"1422_CR77","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R.K., Deng, L., Dollar, P., Gao, J., He, X., Mitchell, M., Platt, J.C., Zitnick, C.L., Zweig, G.: From captions to visual concepts and back. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"1422_CR78","doi-asserted-by":"crossref","unstructured":"Zhen, L., Hu, P., Wang, X., Peng, D.: Deep supervised cross-modal retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10394\u201310403 (2019)","DOI":"10.1109\/CVPR.2019.01064"},{"key":"1422_CR79","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. Preprint at\u00a0arXiv:1810.04805\u00a0(2018)"},{"key":"1422_CR80","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763\u00a0PMLR, (2021)"},{"key":"1422_CR81","unstructured":"Sun, Y.: Federated transfer learning with multimodal data. Preprint at\u00a0arXiv:2209.03137 (2022)"},{"issue":"2","key":"1422_CR82","doi-asserted-by":"publisher","first-page":"1030","DOI":"10.1109\/JIOT.2020.3009358","volume":"8","author":"A Saeed","year":"2020","unstructured":"Saeed, A., Salim, F.D., Ozcelebi, T., Lukkien, J.: Federated self-supervised learning of multisensor representations for embedded intelligence. IEEE Int. Things J. 8(2), 1030\u20131040 (2020)","journal-title":"IEEE Int. Things J."},{"key":"1422_CR83","unstructured":"Wang, J., Yang, X., Cui, S., Che, L., Lyu, L., Xu, D.D., Ma, F.: Towards personalized federated learning via heterogeneous model reassembly. Adv. Neural Inform. Proc. Syst.\u00a036 (2024)"},{"key":"1422_CR84","unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. International Conference on Machine Learning, 5583\u20135594 PMLR (2021)"},{"key":"1422_CR85","first-page":"32897","volume":"35","author":"H Bao","year":"2022","unstructured":"Bao, H., Wang, W., Dong, L., Liu, Q., Mohammed, O.K., Aggarwal, K., Som, S., Piao, S., Wei, F.: Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. Adv. Neural Inform. Proc. Syst. 35, 32897\u201332912 (2022)","journal-title":"Adv. Neural Inform. Proc. Syst."},{"key":"1422_CR86","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"1422_CR87","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"issue":"6","key":"1422_CR88","doi-asserted-by":"publisher","first-page":"59","DOI":"10.1109\/MSP.2021.3106895","volume":"38","author":"S Zhao","year":"2021","unstructured":"Zhao, S., Jia, G., Yang, J., Ding, G., Keutzer, K.: Emotion recognition from multiple modalities: fundamentals and methodologies. IEEE Signal Proc. Mag. 38(6), 59\u201373 (2021)","journal-title":"IEEE Signal Proc. Mag."},{"issue":"1","key":"1422_CR89","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/s00530-021-00786-6","volume":"28","author":"V Chaturvedi","year":"2022","unstructured":"Chaturvedi, V., Kaur, A.B., Varshney, V., Garg, A., Chhabra, G.S., Kumar, M.: Music mood and human emotion recognition based on physiological signals: a systematic review. Multimed. Syst. 28(1), 21\u201344 (2022)","journal-title":"Multimed. Syst."},{"key":"1422_CR90","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C., Bulut, M., Lee, C.-C., Kazemzadeh, A., Mower, E., Kim, S., Chang, J.N., Lee, S., Narayanan, S.S.: Iemocap: Interactive emotional dyadic motion capture database. Lang. Resour. Eval. 42, 335\u2013359 (2008)","journal-title":"Lang. Resour. Eval."},{"key":"1422_CR91","doi-asserted-by":"crossref","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: Meld: A multimodal multi-party dataset for emotion recognition in conversations. Preprint at\u00a0arXiv:1810.02508v6\u00a0(2018)","DOI":"10.18653\/v1\/P19-1050"},{"key":"1422_CR92","unstructured":"Zadeh, A.B., Liang, P.P., Poria, S., Cambria, E., Morency, L.-P.: Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics. Vol.\u00a01,\u00a0pp. 2236\u20132246\u00a0Long Papers,\u00a0 (2018)"},{"issue":"4","key":"1422_CR93","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3409332","volume":"16","author":"Y Huang","year":"2020","unstructured":"Huang, Y., Yang, X., Gao, J., Sang, J., Xu, C.: Knowledge-driven egocentric multimodal activity recognition. ACM Trans. Multimed. Comput. Commun. Appl.\u00a0(TOMM) 16(4), 1\u2013133 (2020)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl.\u00a0(TOMM)"},{"issue":"2","key":"1422_CR94","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1007\/s00530-019-00635-7","volume":"26","author":"R Singh","year":"2020","unstructured":"Singh, R., Sonawane, A., Srivastava, R.: Recent evolution of modern datasets for human activity recognition: a deep survey. Multimed. Syst. 26(2), 83\u2013106 (2020)","journal-title":"Multimed. Syst."},{"issue":"3","key":"1422_CR95","doi-asserted-by":"publisher","first-page":"1301","DOI":"10.1007\/s00530-023-01054-5","volume":"29","author":"X Chao","year":"2023","unstructured":"Chao, X., Hou, Z., Mo, Y., Shi, H., Yao, W.: Structural feature representation and fusion of human spatial cooperative motion for action recognition. Multimed. Syst. 29(3), 1301\u20131314 (2023)","journal-title":"Multimed. Syst."},{"key":"1422_CR96","unstructured":"Kay, W., Carreira, J., Simonyan, K., Zhang, B., Hillier, C., Vijayanarasimhan, S., Viola, F., Green, T., Back, T., Natsev, P., et al.: The kinetics human action video dataset. Preprint at\u00a0arXiv:1705.06950\u00a0(2017)"},{"key":"1422_CR97","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: A dataset of 101 human actions classes from videos in the wild. Preprint at\u00a0arXiv:1212.0402\u00a0(2012)"},{"issue":"3","key":"1422_CR98","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1016\/j.cmpb.2014.09.005","volume":"117","author":"B Kwolek","year":"2014","unstructured":"Kwolek, B., Kepski, M.: Human fall detection on embedded platform using depth maps and wireless accelerometer. Comput. Methods Prog. Biomed. 117(3), 489\u2013501 (2014)","journal-title":"Comput. Methods Prog. Biomed."},{"key":"1422_CR99","first-page":"2611","volume":"33","author":"D Kiela","year":"2020","unstructured":"Kiela, D., Firooz, H., Mohan, A., Goswami, V., Singh, A., Ringshia, P., Testuggine, D.: The hateful memes challenge: detecting hate speech in multimodal memes. Adv. Neural Inform. Proc. Syst. 33, 2611\u20132624 (2020)","journal-title":"Adv. Neural Inform. Proc. Syst."},{"key":"1422_CR100","doi-asserted-by":"crossref","unstructured":"Hasan, M.K., Rahman, W., Zadeh, A., Zhong, J., Tanveer, M.I., Morency, L.-P., et al.: Ur-funny: A multimodal language dataset for understanding humor. Preprint at\u00a0arXiv:1904.06618 (2019)","DOI":"10.18653\/v1\/D19-1211"},{"key":"1422_CR101","doi-asserted-by":"crossref","unstructured":"Alam, F., Ofli, F., Imran, M.: Crisismmd: Multimodal twitter datasets from natural disasters. In: Proceedings of the International AAAI Conference on Web and Social Media, vol. 12 (2018)","DOI":"10.1609\/icwsm.v12i1.14983"},{"issue":"7","key":"1422_CR102","doi-asserted-by":"publisher","first-page":"826","DOI":"10.1016\/j.jpdc.2004.03.020","volume":"64","author":"MF Duarte","year":"2004","unstructured":"Duarte, M.F., Hu, Y.H.: Vehicle classification in distributed sensor networks. J. Parallel Distrib. Comput. 64(7), 826\u2013838 (2004)","journal-title":"J. Parallel Distrib. Comput."},{"key":"1422_CR103","doi-asserted-by":"crossref","unstructured":"Banos, O., Garcia, R., Holgado-Terriza, J.A., Damas, M., Pomares, H., Rojas, I., Saez, A., Villalonga, C.: mhealthdroid: a novel framework for agile development of mobile health applications. In: Ambient Assisted Living and Daily Activities: 6th International Work-Conference, IWAAL 2014, Belfast, UK, December 2-5, 2014. Proceedings 6, pp. 91\u201398\u00a0Springer, (2014)","DOI":"10.1007\/978-3-319-13105-4_14"},{"issue":"1","key":"1422_CR104","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1038\/s41597-020-0495-6","volume":"7","author":"P Wagner","year":"2020","unstructured":"Wagner, P., Strodthoff, N., Bousseljot, R.-D., Kreiseler, D., Lunze, F.I., Samek, W., Schaeffter, T.: Ptb-xl, a large publicly available electrocardiography dataset. Sci. Data 7(1), 154 (2020)","journal-title":"Sci. Data"},{"key":"1422_CR105","unstructured":"Wu, Z., Song, S., Khosla, A., Yu, F., Zhang, L., Tang, X., Xiao, J.: 3d shapenets: A deep representation for volumetric shapes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1912\u20131920 (2015)"},{"key":"1422_CR106","unstructured":"Liang, P.P., Lyu, Y., Fan, X., Wu, Z., Cheng, Y., Wu, J., Chen, L., Wu, P., Lee, M.A., Zhu, Y., et al.: Multibench: Multiscale benchmarks for multimodal representation learning.\u00a0 Preprint at\u00a0arXiv:2107.07502\u00a0(2021)"},{"issue":"1","key":"1422_CR107","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1007\/s00530-014-0430-9","volume":"23","author":"X Li","year":"2017","unstructured":"Li, X.: Tag relevance fusion for social image retrieval. Multimed. Syst. 23(1), 29\u201340 (2017)","journal-title":"Multimed. Syst."},{"key":"1422_CR108","doi-asserted-by":"crossref","unstructured":"Bano, S., Tonellotto, N., Cassar\u00e0, P., Gotta, A.: Fedcmd: A federated cross-modal knowledge distillation for drivers emotion recognition. ACM Transactions on Intelligent Systems and Technology (2024)","DOI":"10.1145\/3650040"},{"key":"1422_CR109","doi-asserted-by":"crossref","unstructured":"Liang, P.P., Liu, T., Cai, A., Muszynski, M., Ishii, R., Allen, N., Auerbach, R., Brent, D., Salakhutdinov, R., Morency, L.-P.: Learning language and multimodal privacy-preserving markers of mood from mobile data. Preprint at\u00a0arXiv:2106.13213 (2021)","DOI":"10.18653\/v1\/2021.acl-long.322"},{"issue":"4","key":"1422_CR110","doi-asserted-by":"publisher","first-page":"2193","DOI":"10.1007\/s00530-023-01090-1","volume":"29","author":"Z Li","year":"2023","unstructured":"Li, Z., Cheng, W., Zhou, J., An, Z., Hu, B.: Deep learning model with multi-feature fusion and label association for suicide detection. Multimed. Syst. 29(4), 2193\u20132203 (2023)","journal-title":"Multimed. Syst."},{"issue":"1","key":"1422_CR111","doi-asserted-by":"publisher","first-page":"5721","DOI":"10.1038\/s41467-021-25874-z","volume":"12","author":"A Gupta","year":"2021","unstructured":"Gupta, A., Savarese, S., Ganguli, S., Fei-Fei, L.: Embodied intelligence via learning and evolution. Nature Commun. 12(1), 5721 (2021)","journal-title":"Nature Commun."}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01422-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-024-01422-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01422-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T08:36:28Z","timestamp":1724315788000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-024-01422-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,29]]},"references-count":111,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["1422"],"URL":"https:\/\/doi.org\/10.1007\/s00530-024-01422-9","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7,29]]},"assertion":[{"value":"16 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 July 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 July 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors have no Conflict of interest to declare that are relevant to the content of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"222"}}