{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:07:22Z","timestamp":1777655242617,"version":"3.51.4"},"publisher-location":"Cham","reference-count":32,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730382","type":"print"},{"value":"9783031730399","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73039-9_5","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"71-86","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["Diagnosing and\u00a0Re-learning for\u00a0Balanced Multimodal Learning"],"prefix":"10.1007","author":[{"given":"Yake","family":"Wei","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siwei","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruoxuan","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Di","family":"Hu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"5_CR1","unstructured":"Alabdulmohsin, I., Maennel, H., Keysers, D.: The impact of reinitialization on generalization in convolutional neural networks. arXiv preprint arXiv:2109.00267 (2021)"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 609\u2013617 (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"5_CR3","first-page":"3884","volume":"33","author":"J Ash","year":"2020","unstructured":"Ash, J., Adams, R.P.: On warm-starting neural network training. Adv. Neural. Inf. Process. Syst. 33, 3884\u20133894 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"2","key":"5_CR4","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","volume":"41","author":"T Baltru\u0161aitis","year":"2018","unstructured":"Baltru\u0161aitis, T., Ahuja, C., Morency, L.P.: Multimodal machine learning: a survey and taxonomy. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 423\u2013443 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Cao, H., Cooper, D.G., Keutmann, M.K., Gur, R.C., Nenkova, A., Verma, R.: Crema-d: crowd-sourced emotional multimodal actors dataset. IEEE Trans. Affect. Comput. 5(4), 377\u2013390 (2014)","DOI":"10.1109\/TAFFC.2014.2336244"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Fan, Y., Xu, W., Wang, H., Wang, J., Guo, S.: PMR: prototypical modal rebalance for multimodal learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20029\u201320038 (2023)","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"5_CR7","unstructured":"Huang, Y., Lin, J., Zhou, C., Yang, H., Huang, L.: Modality competition: What makes joint training of multi-modal network fail in deep learning?(provably). arXiv preprint arXiv:2203.12221 (2022)"},{"key":"5_CR8","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Li, H., Li, X., Hu, P., Lei, Y., Li, C., Zhou, Y.: Boosting multi-modal model performance with adaptive gradient modulation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22214\u201322224 (2023)","DOI":"10.1109\/ICCV51070.2023.02030"},{"key":"5_CR10","unstructured":"Liang, P.P., et\u00a0al.: Multibench: Multiscale benchmarks for multimodal representation learning. arXiv preprint arXiv:2107.07502 (2021)"},{"key":"5_CR11","unstructured":"Liang, P.P., Zadeh, A., Morency, L.P.: Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430 (2022)"},{"key":"5_CR12","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"5_CR13","unstructured":"MacQueen, J., et\u00a0al.: Some methods for classification and analysis of multivariate observations. In: Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability. vol.\u00a01, pp. 281\u2013297. Oakland, CA, USA (1967)"},{"key":"5_CR14","first-page":"14200","volume":"34","author":"A Nagrani","year":"2021","unstructured":"Nagrani, A., Yang, S., Arnab, A., Jansen, A., Schmid, C., Sun, C.: Attention bottlenecks for multimodal fusion. Adv. Neural. Inf. Process. Syst. 34, 14200\u201314213 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Peng, X., Wei, Y., Deng, A., Wang, D., Hu, D.: Balanced multimodal learning via on-the-fly gradient modulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8238\u20138247 (2022)","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Qiao, S., Lin, Z., Zhang, J., Yuille, A.L.: Neural rejuvenation: improving deep network training by enhancing computational resource utilization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 61\u201371 (2019)","DOI":"10.1109\/CVPR.2019.00015"},{"key":"5_CR17","unstructured":"Sehwag, V., Chiang, M., Mittal, P.: On separability of self-supervised representations. In: ICML workshop on Uncertainty and Robustness in Deep Learning (UDL). vol.\u00a03 (2020)"},{"key":"5_CR18","unstructured":"Sokar, G., Agarwal, R., Castro, P.S., Evci, U.: The dormant neuron phenomenon in deep reinforcement learning. In: Proceedings of the 40th International Conference on Machine Learning (2023)"},{"key":"5_CR19","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Wang, W., Tran, D., Feiszli, M.: What makes training multi-modal classification networks hard? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12695\u201312705 (2020)","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Wei, Y., Feng, R., Wang, Z., Hu, D.: Enhancing multimodal cooperation via sample-level modality valuation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 27338\u201327347 (2024)","DOI":"10.1109\/CVPR52733.2024.02581"},{"key":"5_CR22","unstructured":"Wei, Y., Hu, D.: Mmpareto: boosting multimodal learning with innocent unimodal assistance. In: International Conference on Machine Learning (2024)"},{"key":"5_CR23","unstructured":"Wei, Y., Hu, D., Tian, Y., Li, X.: Learning in audio-visual context: A review, analysis, and new perspective. arXiv preprint arXiv:2208.09579 (2022)"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Wong, K.C.: A short survey on data clustering algorithms. In: 2015 Second International Conference on Soft Computing and Machine Intelligence (ISCMI), pp. 64\u201368. IEEE (2015)","DOI":"10.1109\/ISCMI.2015.10"},{"key":"5_CR25","unstructured":"Wu, N., Jastrzebski, S., Cho, K., Geras, K.J.: Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In: International Conference on Machine Learning, pp. 24043\u201324055. PMLR (2022)"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Xu, P., Zhu, X., Clifton, D.A.: Multimodal learning with transformers: a survey. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"5_CR27","doi-asserted-by":"publisher","first-page":"106970","DOI":"10.1016\/j.knosys.2021.106970","volume":"223","author":"SK Yadav","year":"2021","unstructured":"Yadav, S.K., Tiwari, K., Pandey, H.M., Akbar, S.A.: A review of multimodal human activity recognition with special emphasis on classification, applications, challenges and future directions. Knowl.-Based Syst. 223, 106970 (2021)","journal-title":"Knowl.-Based Syst."},{"key":"5_CR28","unstructured":"Yang, Z., Wei, Y., Liang, C., Hu, D.: Quantifying and enhancing multi-modal robustness with modality preference. In: The Twelfth International Conference on Learning Representations (2024)"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Ying, X.: An overview of overfitting and its solutions. In: Journal of physics: Conference series. vol.\u00a01168, pp. 022022, IOP Publishing (2019)","DOI":"10.1088\/1742-6596\/1168\/2\/022022"},{"key":"5_CR30","unstructured":"Zadeh, A., Zellers, R., Pincus, E., Morency, L.P.: Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:1606.06259 (2016)"},{"key":"5_CR31","unstructured":"Zaidi, S., et al.: When does re-initialization work? In: Proceedings on, pp. 12\u201326. PMLR (2023)"},{"issue":"3","key":"5_CR32","doi-asserted-by":"publisher","first-page":"351","DOI":"10.1007\/s11633-021-1293-0","volume":"18","author":"H Zhu","year":"2021","unstructured":"Zhu, H., Luo, M.D., Wang, R., Zheng, A.H., He, R.: Deep audio-visual learning: a survey. Int. J. Autom. Comput. 18(3), 351\u2013376 (2021)","journal-title":"Int. J. Autom. Comput."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73039-9_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:18:09Z","timestamp":1730301489000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73039-9_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730382","9783031730399"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73039-9_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}