{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T09:12:52Z","timestamp":1774602772416,"version":"3.50.1"},"reference-count":41,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:00:00Z","timestamp":1770854400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T00:00:00Z","timestamp":1770854400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100003130","name":"Fonds Wetenschappelijk Onderzoek","doi-asserted-by":"publisher","award":["G0C9623N"],"award-info":[{"award-number":["G0C9623N"]}],"id":[{"id":"10.13039\/501100003130","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003130","name":"Fonds Wetenschappelijk Onderzoek","doi-asserted-by":"publisher","award":["G0D8321N"],"award-info":[{"award-number":["G0D8321N"]}],"id":[{"id":"10.13039\/501100003130","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s11263-025-02696-w","type":"journal-article","created":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T04:09:19Z","timestamp":1770869359000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Self-Balancing Multimodal Models via Multi-Loss Gradient 
Modulation"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4101-0720","authenticated-orcid":false,"given":"Konstantinos","family":"Kontras","sequence":"first","affiliation":[]},{"given":"Christos","family":"Chatzichristos","sequence":"additional","affiliation":[]},{"given":"Matthew","family":"Blaschko","sequence":"additional","affiliation":[]},{"given":"Maarten","family":"De Vos","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,12]]},"reference":[{"key":"2696_CR1","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski, A., et al. (2020). wav2vec 2.0: A framework for self-supervised learning of speech representations. In: Advances in neural information processing systems,33, 12449\u201312460.","journal-title":"In: Advances in neural information processing systems"},{"issue":"4","key":"2696_CR2","first-page":"377","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao, H., et al. (2014). Crema-d: Crowd-sourced emotional multimodal actors dataset. In: IEEE transactions on affective computing,5(4), 377\u2013390.","journal-title":"In: IEEE transactions on affective computing"},{"key":"2696_CR3","unstructured":"Du, C., et al. (2023). On uni-modal feature learning in supervised multi-modal learning. (pp. 8632\u20138656)"},{"key":"2696_CR4","doi-asserted-by":"crossref","unstructured":"Fan, Y., et al. (2023). PMR: Prototypical Modal Rebalance for Multimodal Learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 20029\u201320038.","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"2696_CR5","first-page":"21630","volume":"34","author":"I Gat","year":"2021","unstructured":"Gat, I., Schwartz, I., & Schwing, A. (2021). Perceptual score: What data modalities does your model perceive? 
Advances in Neural Information Processing Systems,34, 21630\u201321643.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2696_CR6","first-page":"3197","volume":"33","author":"I Gat","year":"2020","unstructured":"Gat, I., et al. (2020). Removing bias in multi-modal classifiers: Regularization by maximizing functional entropies. Advances in Neural Information Processing Systems,33, 3197\u20133208.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2696_CR7","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al. (2023). Imagebind: One embedding space to bind them all. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 15180\u201315190.","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"2696_CR8","unstructured":"Goncalves, L., Leem, S. G., Lin, W. C., Sisman, B., & Busso, C. (2023). Versatile Audio-Visual Learning for Handling Single and Multi Modalities in Emotion Regression and Classification Tasks. In: arXiv:2305.07216."},{"key":"2696_CR9","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al. (2017). The something something video database for learning and evaluating visual common sense. In: Proceedings of the IEEE international conference on computer vision. pp. 5842\u20135850.","DOI":"10.1109\/ICCV.2017.622"},{"key":"2696_CR10","doi-asserted-by":"publisher","unstructured":"Gulati, A., et al. (2020). Conformer: Convolution-augmented Transformer for Speech Recognition. In: Interspeech 2020. pp.\u00a05036\u20135040. https:\/\/doi.org\/10.21437\/Interspeech.2020-3015.","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"2696_CR11","unstructured":"Guo, C., et al. (2017). On calibration of modern neural networks. In: International conference on machine learning. PMLR. pp. 1321\u20131330."},{"key":"2696_CR12","doi-asserted-by":"crossref","unstructured":"He, K., et al. (2016). Deep residual learning for image recognition. 
In: Proceedings of the IEEE conference on computer vision and pattern recognition. pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2696_CR13","unstructured":"Hua, C., et al. (2024). ReconBoost: boosting can achieve modality reconcilement. In: Proceedings of the 41st International Conference on Machine Learning. ICML\u201924. Vienna, Austria: JMLR.org."},{"key":"2696_CR14","unstructured":"Huang, Yu., et al. (2022). What makes joint training of multimodal network fail in deep learning?(provably). In: International Conference on Machine Learning. PMLR. pp. 9226\u20139259."},{"key":"2696_CR15","doi-asserted-by":"crossref","unstructured":"Kiela, D., Grave, E., Joulin, A., & Mikolov, T. (2018). Efficient large-scale multi-modal classification. In: Proceedings of the AAAI conference on artificial intelligence. Vol.\u00a032. (1).","DOI":"10.1609\/aaai.v32i1.11945"},{"key":"2696_CR16","doi-asserted-by":"crossref","unstructured":"Kontras, K., et al. (2024). CoRe-Sleep: A Multimodal Fusion Framework for Time Series Robust to Imperfect Modalities. In: IEEE Transactions on Neural Systems and Rehabilitation Engineering.","DOI":"10.1109\/TNSRE.2024.3354388"},{"key":"2696_CR17","unstructured":"Kontras, K., Chatzichristos, C., Blaschko, M., & De Vos, M. (2024). Improving Multimodal Learning with Multi-Loss Gradient Modulation. In: 35th British Machine Vision Conference 2024, BMVC 2024, Glasgow, UK, November 25-28, 2024. BMVA, 2024. https:\/\/papers.bmvc2024.org\/0977.pdf."},{"key":"2696_CR18","doi-asserted-by":"crossref","unstructured":"Li, H., et al. (2023). Boosting Multi-modal Model Performance with Adaptive Gradient Modulation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 22214\u201322224.","DOI":"10.1109\/ICCV51070.2023.02030"},{"key":"2696_CR19","unstructured":"Li, J., et al. (2022). Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. 
In: International Conference on Machine Learning. PMLR. pp. 12888\u201312900."},{"key":"2696_CR20","unstructured":"Liang, P. P., et al. (2021). Multibench: Multiscale benchmarks for multimodal representation learning. In: Advances in neural information processing systems 2021.DB1, p.\u00a01."},{"key":"2696_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision. pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2696_CR22","unstructured":"Lundberg, S. M., & Lee, S.-I. (2017). A unified approach to interpreting model predictions. In: Advances in neural information processing systems 30."},{"key":"2696_CR23","doi-asserted-by":"crossref","unstructured":"Peng, X., et al. (2022). Balanced multimodal learning via on-the-fly gradient modulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 8238\u20138247","DOI":"10.1109\/CVPR52688.2022.00806"},{"key":"2696_CR24","doi-asserted-by":"crossref","unstructured":"Perez, E., et al. (2018). Film: Visual reasoning with a general conditioning layer. In: Proceedings of the AAAI conference on artificial intelligence. Vol.\u00a032. 1.","DOI":"10.1609\/aaai.v32i1.11671"},{"issue":"9","key":"2696_CR25","first-page":"5903","volume":"44","author":"H Phan","year":"2021","unstructured":"Phan, H., et al. (2021). XSleepNet: Multi-view sequential model for automatic sleep staging. In: IEEE Transactions on Pattern Analysis and Machine Intelligence,44(9), 5903\u20135915.","journal-title":"In: IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2696_CR26","doi-asserted-by":"crossref","unstructured":"Radevski, G., et al. (2023). Multimodal distillation for egocentric action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp. 
5213\u20135224.","DOI":"10.1109\/ICCV51070.2023.00481"},{"key":"2696_CR27","unstructured":"Radford, A., et al. (2021). Learning transferable visual models from natural language supervision. In: International conference on machine learning. PMLR. pp. 8748\u20138763."},{"key":"2696_CR28","doi-asserted-by":"crossref","unstructured":"Shapley, L. S. (1953). A value for n-person games. In:","DOI":"10.1515\/9781400881970-018"},{"key":"2696_CR29","unstructured":"Soomro, K., Zamir, A. R., & Shah, M. (2012). UCF101: A dataset of 101 human actions classes from videos in the wild. In: arXiv:1212.0402"},{"key":"2696_CR30","unstructured":"Tan, M., & Le, Q. (2019). Efficientnet: Rethinking model scaling for convolutional neural networks. In: International conference on machine learning. PMLR. pp. 6105\u20136114."},{"key":"2696_CR31","doi-asserted-by":"crossref","unstructured":"Tian, Y., et al. (2018). Audio-visual event localization in unconstrained videos. In: Proceedings of the European conference on computer vision (ECCV). pp. 247\u2013263.","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"2696_CR32","doi-asserted-by":"crossref","unstructured":"Tsai, Y. H. H., Bai, S., Liang, P. P., Kolter, J. Z., Morency, L. P., & Salakhutdinov, R. (2019). Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. Association for computational linguistics. Meeting. Vol.\u00a02019. NIH Public Access. 2019, p.\u00a06558.","DOI":"10.18653\/v1\/P19-1656"},{"key":"2696_CR33","doi-asserted-by":"crossref","unstructured":"Vielzeuf, V., Lechervy, A., Pateux, S., & Jurie, F. (2018). Centralnet: a multilayer approach for multimodal fusion. In: Proceedings of the European Conference on Computer Vision (ECCV) Workshops.","DOI":"10.1007\/978-3-030-11024-6_44"},{"key":"2696_CR34","doi-asserted-by":"crossref","unstructured":"Wang, W., Tran, D., & Feiszli, M. (2020). What makes training multi-modal classification networks hard? 
In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition. pp. 12695\u201312705","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"2696_CR35","doi-asserted-by":"crossref","unstructured":"Wang, W., et al. (2023). Image as a foreign language: Beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 19175\u201319186","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"2696_CR36","unstructured":"Wei, Y., & Hu, D. (2024). MMPareto: boosting multimodal learning with innocent unimodal assistance. In: Proceedings of the 41st International Conference on Machine Learning. ICML\u201924. Vienna, Austria: JMLR.org."},{"key":"2696_CR37","doi-asserted-by":"publisher","unstructured":"Wolf, T., et al. (2020). Transformers: State-of-the-Art Natural Language Processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations. Ed. by Qun Liu and David Schlangen. Online: Association for Computational Linguistics, Oct. pp.\u00a038\u201345. https:\/\/doi.org\/10.18653\/v1\/2020.emnlp-demos.6.","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"2696_CR38","unstructured":"Wu, N., et al. (2022). Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In: International Conference on Machine Learning. PMLR. pp. 24043\u201324055."},{"key":"2696_CR39","doi-asserted-by":"crossref","unstructured":"Xu, R., et al. (2023). MMCosine: Multi-Modal Cosine Loss Towards Balanced Audio-Visual Fine-Grained Learning. In: ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE. pp. 1\u20135.","DOI":"10.1109\/ICASSP49357.2023.10096655"},{"key":"2696_CR40","doi-asserted-by":"crossref","unstructured":"Yao, Y., & Mihalcea, R. (2022). Modality-specific Learning Rates for Effective Multimodal Additive Late-fusion. 
Findings of the Association for Computational Linguistics: ACL 2022 (pp. 1824\u20131834). Association for Computational Linguistics.","DOI":"10.18653\/v1\/2022.findings-acl.143"},{"key":"2696_CR41","doi-asserted-by":"crossref","unstructured":"Zadeh, A., et al. (2018). Multi-attention recurrent network for human communication comprehension. In: Thirty-Second AAAI Conference on Artificial Intelligence","DOI":"10.1609\/aaai.v32i1.12024"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02696-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02696-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02696-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T08:38:42Z","timestamp":1774600722000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02696-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,12]]},"references-count":41,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["2696"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02696-w","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,12]]},"assertion":[{"value":"28 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article 
History"}},{"value":"12 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"117"}}