{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,5]],"date-time":"2025-05-05T04:02:55Z","timestamp":1746417775351,"version":"3.40.4"},"reference-count":32,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,5,4]],"date-time":"2025-05-04T00:00:00Z","timestamp":1746316800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,4]],"date-time":"2025-05-04T00:00:00Z","timestamp":1746316800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"the National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62106074"],"award-info":[{"award-number":["62106074"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"the Scientific Research Project of Hunan Provincial Department of Education","award":["22A0391"],"award-info":[{"award-number":["22A0391"]}]},{"name":"the National Science Fund of Hunan","award":["2024JJ7132"],"award-info":[{"award-number":["2024JJ7132"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"DOI":"10.1007\/s11227-025-07311-w","type":"journal-article","created":{"date-parts":[[2025,5,4]],"date-time":"2025-05-04T06:45:03Z","timestamp":1746341103000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["An audio\u2013visual multimodal adaptive balanced learning method based on gradient modulation"],"prefix":"10.1007","volume":"81","author":[{"given":"Wenxiu","family":"Ao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhongmei","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianhua","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shenao","family":"Peng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Liang","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,4]]},"reference":[{"key":"7311_CR1","doi-asserted-by":"publisher","first-page":"125","DOI":"10.1007\/s11227-024-06542-7","volume":"81","author":"J Chen","year":"2024","unstructured":"Chen J, Ye W (2024) Roaen: reversed dependency graph and orthogonal-gating strategy attention-enhanced network for aspect-level sentiment classification. J Supercomput 81:125. https:\/\/doi.org\/10.1007\/s11227-024-06542-7","journal-title":"J Supercomput"},{"key":"7311_CR2","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1007\/s11227-024-06546-3","volume":"81","author":"C Zhu","year":"2024","unstructured":"Zhu C, Ding Q (2024) Aspect-based sentiment analysis via dual residual networks with sentiment knowledge. J Supercomput 81:131. https:\/\/doi.org\/10.1007\/s11227-024-06546-3","journal-title":"J Supercomput"},{"key":"7311_CR3","doi-asserted-by":"publisher","unstructured":"Chang Y, Xue F, Sheng F, et al (2022) Fast road segmentation via uncertainty-aware symmetric network. In: 2022 International Conference on Robotics and Automation (ICRA), pp 11124\u201311130. https:\/\/doi.org\/10.1109\/ICRA46639.2022.9812452","DOI":"10.1109\/ICRA46639.2022.9812452"},{"key":"7311_CR4","doi-asserted-by":"publisher","unstructured":"Cui C, Ma Y, Cao X, et al (2024) A survey on multimodal large language models for autonomous driving. In: 2024 IEEE\/CVF Winter Conference on Applications of Computer Vision Workshops (WACVW), pp 958\u2013979. https:\/\/doi.org\/10.1109\/WACVW60836.2024.00106","DOI":"10.1109\/WACVW60836.2024.00106"},{"key":"7311_CR5","doi-asserted-by":"publisher","unstructured":"Huang J, Yan M, Chen S, et al (2024) Magicfight: personalized martial arts combat video generation. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp 10833\u201310842. https:\/\/doi.org\/10.1145\/3664647.3680849","DOI":"10.1145\/3664647.3680849"},{"key":"7311_CR6","doi-asserted-by":"publisher","unstructured":"Wang W, Tran D, Feiszli M (2020) What makes training multi-modal classification networks hard? In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 12692\u201312702. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01271","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"7311_CR7","unstructured":"Huang Y, Lin J, Zhou C, et al (2022) Modality competition: What makes joint training of multi-modal network fail in deep learning? (Provably). In: Proceedings of the 39th International Conference on Machine Learning(PMLR), pp 9226\u20139259"},{"key":"7311_CR8","doi-asserted-by":"publisher","unstructured":"Xu R, Feng R, Zhang SX, et al (2023) Mmcosine: multi-modal cosine loss towards balanced audio-visual fine-grained learning. In: ICASSP 2023\u20142023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 1\u20135. https:\/\/doi.org\/10.1109\/ICASSP49357.2023.10096655","DOI":"10.1109\/ICASSP49357.2023.10096655"},{"key":"7311_CR9","doi-asserted-by":"publisher","unstructured":"Li H, Li X, Hu P, et al (2023) Boosting multi-modal model performance with adaptive gradient modulation. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 22157\u201322167. https:\/\/doi.org\/10.1109\/ICCV51070.2023.02030","DOI":"10.1109\/ICCV51070.2023.02030"},{"issue":"1","key":"7311_CR10","doi-asserted-by":"publisher","first-page":"469","DOI":"10.1109\/TPAMI.2024.3468315","volume":"47","author":"Y Wei","year":"2025","unstructured":"Wei Y, Hu D, Du H et al (2025) On-the-fly modulation for balanced multimodal learning. IEEE Trans Pattern Anal Mach Intell 47(1):469\u2013485. https:\/\/doi.org\/10.1109\/TPAMI.2024.3468315","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"7311_CR11","doi-asserted-by":"publisher","first-page":"408","DOI":"10.1109\/LSP.2022.3227818","volume":"30","author":"L Yang","year":"2023","unstructured":"Yang L, Wu Z, Hong J et al (2023) MCL: a contrastive learning method for multimodal data fusion in violence detection. IEEE Signal Process Lett 30:408\u2013412. https:\/\/doi.org\/10.1109\/LSP.2022.3227818","journal-title":"IEEE Signal Process Lett"},{"key":"7311_CR12","unstructured":"Du C, Teng J, Li T, et al (2023) On uni-modal feature learning in supervised multi-modal learning. In: Proceedings of the 40th International Conference on Machine Learning, pp 8632\u20138656"},{"key":"7311_CR13","doi-asserted-by":"publisher","unstructured":"Liu S, Li L, Song J, et al (2023) Multimodal pre-training with self-distillation for product understanding in e-commerce. In: Proceedings of the Sixteenth ACM International Conference on Web Search and Data Mining, pp 1039\u20131047. https:\/\/doi.org\/10.1145\/3539597.3570423","DOI":"10.1145\/3539597.3570423"},{"key":"7311_CR14","doi-asserted-by":"publisher","unstructured":"Peng X, Wei Y, Deng A, et al (2022) Balanced multimodal learning via on-the-fly gradient modulation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 8228\u20138237. https:\/\/doi.org\/10.1109\/CVPR52688.2022.00806","DOI":"10.1109\/CVPR52688.2022.00806"},{"issue":"6","key":"7311_CR15","doi-asserted-by":"publisher","first-page":"4843","DOI":"10.1109\/TCSVT.2023.3337134","volume":"34","author":"J Fu","year":"2023","unstructured":"Fu J, Gao J, Bao BK et al (2023) Multimodal imbalance-aware gradient modulation for weakly-supervised audio\u2013visual video parsing. IEEE Trans Circuits Syst Video Technol 34(6):4843\u20134856. https:\/\/doi.org\/10.1109\/TCSVT.2023.3337134","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"7311_CR16","doi-asserted-by":"publisher","unstructured":"Fan Y, Xu W, Wang H, et al (2023) PMR: prototypical modal rebalance for multimodal learning. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 20029\u201320038. https:\/\/doi.org\/10.1109\/CVPR52729.2023.01918","DOI":"10.1109\/CVPR52729.2023.01918"},{"key":"7311_CR17","doi-asserted-by":"publisher","unstructured":"Lin X, Wang S, Cai R, et al (2024) Suppress and rebalance: towards generalized multi-modal face anti-spoofing. In: 2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 211\u2013221. https:\/\/doi.org\/10.1109\/CVPR52733.2024.00028","DOI":"10.1109\/CVPR52733.2024.00028"},{"key":"7311_CR18","doi-asserted-by":"publisher","unstructured":"He Y, Sun L, Lian Z, et al (2022) Multimodal temporal attention in sentiment analysis. In: Proceedings of the 3rd International on Multimodal Sentiment Analysis Workshop and Challenge, pp 61\u201366. https:\/\/doi.org\/10.1145\/3551876.3554811","DOI":"10.1145\/3551876.3554811"},{"key":"7311_CR19","doi-asserted-by":"publisher","unstructured":"Zhou Y, Lim SN (2021) Joint audio\u2013visual deepfake detection. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp 14780\u201314789. https:\/\/doi.org\/10.1109\/ICCV48922.2021.01453","DOI":"10.1109\/ICCV48922.2021.01453"},{"issue":"12","key":"7311_CR20","doi-asserted-by":"publisher","first-page":"10790","DOI":"10.1609\/aaai.v35i12.17289","volume":"35","author":"W Yu","year":"2021","unstructured":"Yu W, Xu H, Yuan Z et al (2021) Learning modality-specific representations with self-supervised multi-task learning for multimodal sentiment analysis. Proc AAAI Conf Artif Intell 35(12):10790\u201310797. https:\/\/doi.org\/10.1609\/aaai.v35i12.17289","journal-title":"Proc AAAI Conf Artif Intell"},{"issue":"1","key":"7311_CR21","doi-asserted-by":"publisher","first-page":"537","DOI":"10.1109\/TITS.2020.3013234","volume":"23","author":"Y Xiao","year":"2022","unstructured":"Xiao Y, Codevilla F, Gurram A et al (2022) Multimodal end-to-end autonomous driving. Trans Intell Transp Syst 23(1):537\u2013547. https:\/\/doi.org\/10.1109\/TITS.2020.3013234","journal-title":"Trans Intell Transp Syst"},{"key":"7311_CR22","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-024-06546-3","author":"Y Chen","year":"2025","unstructured":"Chen Y, Hu X, Lu T et al (2025) A multi-scale large kernel attention with u-net for medical image registration. J Supercomput. https:\/\/doi.org\/10.1007\/s11227-024-06546-3","journal-title":"J Supercomput"},{"key":"7311_CR23","doi-asserted-by":"publisher","first-page":"1650","DOI":"10.1109\/LSP.2021.3101421","volume":"28","author":"Y Sun","year":"2021","unstructured":"Sun Y, Mai S, Hu H (2021) Learning to balance the learning rates between various modalities via adaptive tracking factor. IEEE Signal Process Lett 28:1650\u20131654. https:\/\/doi.org\/10.1109\/LSP.2021.3101421","journal-title":"IEEE Signal Process Lett"},{"key":"7311_CR24","doi-asserted-by":"publisher","first-page":"2354","DOI":"10.1109\/TMM.2023.3295094","volume":"26","author":"S Su","year":"2024","unstructured":"Su S, Zhu J, Gao L et al (2024) Utilizing greedy nature for multimodal conditional image synthesis in transformers. IEEE Trans Multimed 26:2354\u20132366. https:\/\/doi.org\/10.1109\/TMM.2023.3295094","journal-title":"IEEE Trans Multimed"},{"issue":"10","key":"7311_CR25","first-page":"4781","volume":"35","author":"Y Luo","year":"2024","unstructured":"Luo Y, Wu R, Liu J et al (2024) Multimodal sentiment analysis method based on adaptive weight fusion. J Softw 35(10):4781\u20134793","journal-title":"J Softw"},{"key":"7311_CR26","unstructured":"Wu N, Jastrzebski S, Cho K, et al (2022) Characterizing and overcoming the greedy nature of learning in multi-modal deep neural networks. In: Proceedings of the 39th International Conference on Machine Learning, pp 24043\u201324055"},{"issue":"4","key":"7311_CR27","doi-asserted-by":"publisher","first-page":"377","DOI":"10.1109\/TAFFC.2014.2336244","volume":"5","author":"H Cao","year":"2014","unstructured":"Cao H, Cooper DG, Keutmann MK et al (2014) Crema-d: Crowd-sourced emotional multimodal actors dataset. IEEE Trans Affect Comput 5(4):377\u2013390. https:\/\/doi.org\/10.1109\/TAFFC.2014.2336244","journal-title":"IEEE Trans Affect Comput"},{"issue":"5","key":"7311_CR28","doi-asserted-by":"publisher","first-page":"e0196391","DOI":"10.1371\/journal.pone.0196391","volume":"13","author":"SR Livingstone","year":"2018","unstructured":"Livingstone SR, Russo FA (2018) The Ryerson audio\u2013visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English. PLoS ONE 13(5):e0196391. https:\/\/doi.org\/10.1371\/journal.pone.0196391","journal-title":"PLoS ONE"},{"key":"7311_CR29","doi-asserted-by":"publisher","unstructured":"He K, Zhang X, Ren S, et al (2016) Deep residual learning for image recognition. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp 770\u2013778. https:\/\/doi.org\/10.1109\/CVPR.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"7311_CR30","doi-asserted-by":"publisher","unstructured":"Jin Q, Li C, Chen S, et al (2015) Speech emotion recognition with acoustic and lexical features. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp 4749\u20134753. https:\/\/doi.org\/10.1109\/ICASSP.2015.7178872","DOI":"10.1109\/ICASSP.2015.7178872"},{"key":"7311_CR31","doi-asserted-by":"publisher","first-page":"16359","DOI":"10.1007\/s11042-022-14185-0","volume":"82","author":"G Tang","year":"2023","unstructured":"Tang G, Xie Y, Li K et al (2023) Multimodal emotion recognition from facial expression and speech based on feature fusion. Multimed Tools Appl 82:16359\u201316373. https:\/\/doi.org\/10.1007\/s11042-022-14185-0","journal-title":"Multimed Tools Appl"},{"key":"7311_CR32","doi-asserted-by":"publisher","unstructured":"Vaezi Joze HR, Shaban A, Iuzzolino ML, et al (2020) MMTM: Multimodal transfer module for CNN fusion. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp 13286\u201313296. https:\/\/doi.org\/10.1109\/CVPR42600.2020.01330","DOI":"10.1109\/CVPR42600.2020.01330"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07311-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11227-025-07311-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-025-07311-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,4]],"date-time":"2025-05-04T06:45:11Z","timestamp":1746341111000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11227-025-07311-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,4]]},"references-count":32,"journal-issue":{"issue":"7","published-online":{"date-parts":[[2025,5]]}},"alternative-id":["7311"],"URL":"https:\/\/doi.org\/10.1007\/s11227-025-07311-w","relation":{},"ISSN":["1573-0484"],"issn-type":[{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,5,4]]},"assertion":[{"value":"11 April 2025","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 May 2025","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"826"}}