{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,10]],"date-time":"2026-02-10T03:14:27Z","timestamp":1770693267792,"version":"3.49.0"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Natural Science Founda- tion of Xinjiang Uygur Autonomous Region","award":["2023D01C176"],"award-info":[{"award-number":["2023D01C176"]}]},{"name":"Xinjiang Uygur Autonomous Region Universities Fundamental Research Funds Scientific Research Project","award":["XJEDU2022P018"],"award-info":[{"award-number":["XJEDU2022P018"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2026,1]]},"DOI":"10.1007\/s13042-025-02876-7","type":"journal-article","created":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T07:37:18Z","timestamp":1768635438000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Integrated feature-enhanced multimodal intent detection method"],"prefix":"10.1007","volume":"17","author":[{"given":"Qimeng","family":"Yang","sequence":"first","affiliation":[]},{"given":"Lanlan","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Jinmiao","family":"Song","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,17]]},"reference":[{"key":"2876_CR1","doi-asserted-by":"crossref","unstructured":"Zhang H, Li X, Xu H, Zhang P, Zhao K, Gao K (2021) Textoir: an integrated and visualized platform for text open intent recognition. arXiv preprint arXiv:2110.15063","DOI":"10.18653\/v1\/2021.acl-demo.20"},{"key":"2876_CR2","first-page":"14365","volume":"35","author":"H Zhang","year":"2021","unstructured":"Zhang H, Xu H, Lin T-E, Lyu R (2021) Discovering new intents with deep aligned clustering. Proc AAAI Conf Artif Intell 35:14365\u201314373","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"2876_CR3","doi-asserted-by":"crossref","unstructured":"Zhang H, Xu H, Wang X, Zhou Q, Zhao S, Teng J (2022) Mintrec: a new dataset for multimodal intent recognition. In: Proceedings of the 30th ACM international conference on multimedia, 1688\u20131697","DOI":"10.1145\/3503161.3547906"},{"key":"2876_CR4","doi-asserted-by":"crossref","unstructured":"Saha T, Patra A, Saha S, Bhattacharyya P (2020) Towards emotion-aided multi-modal dialogue act classification. In: Proceedings of the 58th annual meeting of the association for computational linguistics, pp 4361\u20134372","DOI":"10.18653\/v1\/2020.acl-main.402"},{"key":"2876_CR5","doi-asserted-by":"publisher","first-page":"299","DOI":"10.1016\/j.inffus.2022.09.029","volume":"91","author":"M Firdaus","year":"2023","unstructured":"Firdaus M, Ekbal A, Cambria E (2023) Multitask learning for multilingual intent detection and slot filling in dialogue systems. 
Inform Fus 91:299\u2013315","journal-title":"Inform Fus"},{"key":"2876_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111826","volume":"296","author":"Z Li","year":"2024","unstructured":"Li Z, Zhang G, Okada S, Wang L, Zhao B, Dang J (2024) Mbcfnet: a multimodal brain-computer fusion network for human intention recognition. Knowl-Based Syst 296:111826","journal-title":"Knowl-Based Syst"},{"key":"2876_CR7","doi-asserted-by":"crossref","unstructured":"Li Z, Zhao B, Zhang G, Dang J (2023) Brain network features differentiate intentions from different emotional expressions of the same text. In: ICASSP 2023-2023 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 1\u20135. IEEE","DOI":"10.1109\/ICASSP49357.2023.10095376"},{"key":"2876_CR8","doi-asserted-by":"crossref","unstructured":"Tsai Y-HH, Bai S, Liang PP, Kolter JZ, Morency L-P, Salakhutdinov R (2019) Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the conference. Association for computational linguistics. Meeting, vol 2019, p 6558. NIH Public Access","DOI":"10.18653\/v1\/P19-1656"},{"key":"2876_CR9","doi-asserted-by":"crossref","unstructured":"Hazarika D, Zimmermann R, Poria S (2020) Misa: modality-invariant and-specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM international conference on multimedia, pp 1122\u20131131","DOI":"10.1145\/3394171.3413678"},{"key":"2876_CR10","doi-asserted-by":"crossref","unstructured":"Rahman W, Hasan MK, Lee S, Zadeh A, Mao C, Morency L-P, Hoque E (2020) Integrating multimodal information in large pretrained transformers. In: Proceedings of the conference. Association for computational linguistics. Meeting, vol 2020, p 2359. NIH Public Access","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"2876_CR11","doi-asserted-by":"crossref","unstructured":"Dong J, Fu J, Zhou P, Li H, Wang X (2022) Improving spoken language understanding with cross-modal contrastive learning. In: Interspeech, pp 2693\u20132697","DOI":"10.21437\/Interspeech.2022-658"},{"key":"2876_CR12","doi-asserted-by":"crossref","unstructured":"Yu T, Gao H, Lin T-E, Yang M, Wu Y, Ma W, Wang C, Huang F, Li Y (2023) Speech-text pre-training for spoken dialog understanding with explicit cross-modal alignment. In: Proceedings of the 61st annual meeting of the association for computational linguistics (volume 1: long papers), pp 7900\u20137913","DOI":"10.18653\/v1\/2023.acl-long.438"},{"key":"2876_CR13","doi-asserted-by":"crossref","unstructured":"Haffner P, Tur G, Wright JH (2003) Optimizing svms for complex call classification. In: 2003 IEEE international conference on acoustics, speech, and signal processing, 2003. Proceedings. (ICASSP\u201903)., vol 1, IEEE","DOI":"10.1109\/ICASSP.2003.1198860"},{"key":"2876_CR14","doi-asserted-by":"crossref","unstructured":"Tur G (2005) Model adaptation for spoken language understanding. In: Proceedings. (ICASSP\u201905). IEEE international conference on acoustics, speech, and signal processing, 2005, vol 1, p 41. IEEE","DOI":"10.1109\/ICASSP.2005.1415045"},{"key":"2876_CR15","doi-asserted-by":"crossref","unstructured":"Tur G, Hakkani-T\u00fcr D, Heck L, Parthasarathy S (2011) Sentence simplification for spoken language understanding. In: 2011 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 5628\u20135631. 
IEEE","DOI":"10.1109\/ICASSP.2011.5947636"},{"key":"2876_CR16","doi-asserted-by":"crossref","unstructured":"Hakkani-T\u00fcr D, Tur G, Chotimongkol A (2005) Using syntactic and semantic graphs for call classification. In: Proceedings of the ACL workshop on feature engineering for machine learning in natural language processing","DOI":"10.3115\/1610230.1610235"},{"issue":"6","key":"2876_CR17","doi-asserted-by":"publisher","first-page":"1207","DOI":"10.1109\/TASL.2008.2001106","volume":"16","author":"S Yaman","year":"2008","unstructured":"Yaman S, Deng L, Yu D, Wang Y-Y, Acero A (2008) An integrative and discriminative technique for spoken utterance classification. IEEE Trans Audio Speech Lang Process 16(6):1207\u20131214","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"2876_CR18","doi-asserted-by":"crossref","unstructured":"Kim J-K, Tur G, Celikyilmaz A, Cao B, Wang Y-Y (2016) Intent detection using semantically enriched word embeddings. In: 2016 IEEE spoken language technology workshop (SLT), pp 414\u2013419. IEEE","DOI":"10.1109\/SLT.2016.7846297"},{"key":"2876_CR19","doi-asserted-by":"crossref","unstructured":"Luan Y, Watanabe S, Harsham B (2015) Efficient learning for spoken language understanding tasks with word embedding based pre-training. In: INTERSPEECH, pp 1398\u20131402. Citeseer","DOI":"10.21437\/Interspeech.2015-56"},{"key":"2876_CR20","unstructured":"Zhou Z-H, Yu Y. National key laboratory for novel software technology. Nanjing University, Nanjing 210093, China"},{"key":"2876_CR21","doi-asserted-by":"crossref","unstructured":"Firdaus M, Bhatnagar S, Ekbal A, Bhattacharyya P (2018) Intent detection for spoken language understanding using a deep ensemble model. In: PRICAI 2018: trends in artificial intelligence: 15th Pacific Rim international conference on artificial intelligence, Nanjing, China, August 28\u201331, 2018, proceedings, part I 15, pp 629\u2013642. Springer","DOI":"10.1007\/978-3-319-97304-3_48"},{"key":"2876_CR22","unstructured":"Masumura R, Tanaka T, Higashinaka R, Masataki H, Aono Y (2018) Multi-task and multi-lingual joint learning of neural lexical utterance classification based on partially-shared modeling. In: Proceedings of the 27th international conference on computational linguistics, pp 3586\u20133596"},{"key":"2876_CR23","doi-asserted-by":"crossref","unstructured":"Zadeh A, Chen M, Poria S, Cambria E, Morency L-P (2017) Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:1707.07250","DOI":"10.18653\/v1\/D17-1115"},{"key":"2876_CR24","doi-asserted-by":"crossref","unstructured":"Liu Z, Shen Y, Lakshminarasimhan VB, Liang PP, Zadeh A, Morency L-P (2018) Efficient low-rank multimodal fusion with modality-specific factors. arXiv preprint arXiv:1806.00064","DOI":"10.18653\/v1\/P18-1209"},{"key":"2876_CR25","unstructured":"Hou M, Tang J, Zhang J, Kong W, Zhao Q (2019) Deep multimodal multilinear fusion with high-order polynomial pooling. Adv Neural Inform Process Syst 32"},{"key":"2876_CR26","doi-asserted-by":"crossref","unstructured":"Zadeh A, Liang PP, Mazumder N, Poria S, Cambria E, Morency L-P (2018) Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI conference on artificial intelligence, vol 32","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"2876_CR27","doi-asserted-by":"crossref","unstructured":"Han W, Chen H, Poria S (2021) Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. 
arXiv preprint arXiv:2109.00412","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"2876_CR28","doi-asserted-by":"crossref","unstructured":"Han W, Chen H, Gelbukh A, Zadeh A, Morency L-p, Poria S (2021) Bi-bimodal modality fusion for correlation-controlled multimodal sentiment analysis. In: Proceedings of the 2021 international conference on multimodal interaction, pp 6\u201315","DOI":"10.1145\/3462244.3479919"},{"key":"2876_CR29","doi-asserted-by":"crossref","unstructured":"Paraskevopoulos G, Georgiou E, Potamianos A (2022) Mmlatch: Bottom-up top-down fusion for multimodal sentiment analysis. In: ICASSP 2022-2022 IEEE international conference on acoustics, speech and signal processing (ICASSP), pp 4573\u20134577. IEEE","DOI":"10.1109\/ICASSP43922.2022.9746418"},{"issue":"9","key":"2876_CR30","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou K, Yang J, Loy CC, Liu Z (2022) Learning to prompt for vision-language models. Int J Comput Vis 130(9):2337\u20132348","journal-title":"Int J Comput Vis"},{"key":"2876_CR31","doi-asserted-by":"crossref","unstructured":"Zhou K, Yang J, Loy CC, Liu Z (2022) Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16816\u201316825","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"2876_CR32","doi-asserted-by":"crossref","unstructured":"Rao Y, Zhao W, Chen G, Tang Y, Zhu Z, Huang G, Zhou J, Lu J (2022) Denseclip: Language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 18082\u201318091","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"2876_CR33","doi-asserted-by":"crossref","unstructured":"Wu Z, Xiong Y, Yu SX, Lin D (2018) Unsupervised feature learning via non-parametric instance discrimination. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3733\u20133742","DOI":"10.1109\/CVPR.2018.00393"},{"key":"2876_CR34","doi-asserted-by":"crossref","unstructured":"Ye M, Zhang X, Yuen PC, Chang S-F (2019) Unsupervised embedding learning via invariant and spreading instance feature. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6210\u20136219","DOI":"10.1109\/CVPR.2019.00637"},{"key":"2876_CR35","doi-asserted-by":"crossref","unstructured":"Tian Y, Krishnan D, Isola P (2020) Contrastive multiview coding. In: Computer Vision\u2013ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, proceedings, part XI 16, pp 776\u2013794. Springer","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"2876_CR36","doi-asserted-by":"crossref","unstructured":"He K, Fan H, Wu Y, Xie S, Girshick R (2020) Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9729\u20139738","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"2876_CR37","unstructured":"Chen T, Kornblith S, Norouzi M, Hinton G (2020) A simple framework for contrastive learning of visual representations. In: International conference on machine learning, pp 1597\u20131607. 
PMLR"},{"key":"2876_CR38","first-page":"21271","volume":"33","author":"J-B Grill","year":"2020","unstructured":"Grill J-B, Strub F, Altch\u00e9 F, Tallec C, Richemond P, Buchatskaya E, Doersch C, Avila Pires B, Guo Z, Gheshlaghi Azar M et al (2020) Bootstrap your own latent-a new approach to self-supervised learning. Adv Neural Inf Process Syst 33:21271\u201321284","journal-title":"Adv Neural Inf Process Syst"},{"key":"2876_CR39","doi-asserted-by":"crossref","unstructured":"Chen X, He K (2021) Exploring simple siamese representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 15750\u201315758","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"2876_CR40","doi-asserted-by":"crossref","unstructured":"Caron M, Touvron H, Misra I, J\u00e9gou H, Mairal J, Bojanowski P, Joulin A (2021) Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 9650\u20139660","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"2876_CR41","doi-asserted-by":"crossref","unstructured":"Zhou Q, Xu H, Li H, Zhang H, Zhang X, Wang Y, Gao K (2024) Token-level contrastive learning with modality-aware prompting for multimodal intent recognition. In: Proceedings of the AAAI conference on artificial intelligence, vol 38, pp 17114\u201317122","DOI":"10.1609\/aaai.v38i15.29656"},{"key":"2876_CR42","doi-asserted-by":"crossref","unstructured":"Liu Z, Ning J, Cao Y, Wei Y, Zhang Z, Lin S, Hu H (2022) Video swin transformer. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3202\u20133211","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"2876_CR43","doi-asserted-by":"crossref","unstructured":"Liu Y, Xiong P, Xu L, Cao S, Jin Q (2022) Ts2-net: Token shift and selection transformer for text-video retrieval. In: European conference on computer vision, pp 319\u2013335. Springer","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"2876_CR44","first-page":"12449","volume":"33","author":"A Baevski","year":"2020","unstructured":"Baevski A, Zhou Y, Mohamed A, Auli M (2020) wav2vec 2.0: a framework for self-supervised learning of speech representations. Adv Neural Inform Process Syst 33:12449\u201312460","journal-title":"Adv Neural Inform Process Syst"},{"key":"2876_CR45","unstructured":"Lang S, Chuqing H, Guofa L, Dongpu C (2020) Msaf: multimodal split attention fusion. CoRR"},{"key":"2876_CR46","unstructured":"Loshchilov I, Hutter F (2017) Decoupled weight decay regularization. 
arXiv preprint arXiv:1711.05101"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-025-02876-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-025-02876-7","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-025-02876-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,9]],"date-time":"2026-02-09T09:39:15Z","timestamp":1770629955000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-025-02876-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1]]},"references-count":46,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,1]]}},"alternative-id":["2876"],"URL":"https:\/\/doi.org\/10.1007\/s13042-025-02876-7","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1]]},"assertion":[{"value":"27 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"17"}}
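
For working with this record programmatically, the sketch below is one minimal way to fetch the same work object from the public Crossref REST API (https://api.crossref.org/works/{DOI}) and read off a few of the fields visible above. It assumes the third-party requests library is installed; the field names come directly from the record shown, but optional fields (e.g. per-reference DOIs, author lists) vary across Crossref records, hence the defensive .get() calls.

# Minimal sketch: retrieve and summarize the Crossref work record above.
import requests

DOI = "10.1007/s13042-025-02876-7"

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
work = resp.json()["message"]  # the payload sits under the "message" key

print(work["title"][0])             # article title
print(work["container-title"][0])   # journal name
print(", ".join(f"{a.get('given', '')} {a.get('family', '')}".strip()
                for a in work.get("author", [])))
print(work["URL"], "-", work["references-count"], "references")

# Cited works live in the "reference" array; not every entry has a
# deposited DOI (see 2876_CR20 above), so fall back gracefully.
for ref in work.get("reference", [])[:5]:
    print(ref["key"], ref.get("DOI", "(no DOI deposited)"))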