{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,3]],"date-time":"2026-05-03T10:03:09Z","timestamp":1777802589565,"version":"3.51.4"},"reference-count":36,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62562057"],"award-info":[{"award-number":["62562057"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100009110","name":"Natural Science Foundation of Xinjiang Uygur Autonomous Region","doi-asserted-by":"publisher","award":["2023D01C176"],"award-info":[{"award-number":["2023D01C176"]}],"id":[{"id":"10.13039\/100009110","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Signal Processing"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.sigpro.2026.110537","type":"journal-article","created":{"date-parts":[[2026,2,5]],"date-time":"2026-02-05T00:52:33Z","timestamp":1770252753000},"page":"110537","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["SaliText: A multimodal intent recognition method with saliency and text-guided fusion"],"prefix":"10.1016","volume":"244","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1973-5125","authenticated-orcid":false,"given":"Huiting","family":"Li","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5007-8805","authenticated-orcid":false,"given":"Qimeng","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2817-6480","authenticated-orcid":false,"given":"Yichao","family":"Xia","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6166-1690","authenticated-orcid":false,"given":"Lanlan","family":"Lu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3333-470X","authenticated-orcid":false,"given":"Qixing","family":"Wei","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.sigpro.2026.110537_bib0001","doi-asserted-by":"crossref","DOI":"10.1016\/j.compeleceng.2025.110301","article-title":"An effective multi\u2013modality feature synergy and feature enhancer for multimodal intent recognition","volume":"123","author":"Xia","year":"2025","journal-title":"Comput. Electr. Eng."},{"issue":"2","key":"10.1016\/j.sigpro.2026.110537_bib0002","doi-asserted-by":"crossref","first-page":"423","DOI":"10.1109\/TPAMI.2018.2798607","article-title":"Multimodal machine learning: a survey and taxonomy","volume":"41","author":"Baltru\u0161aitis","year":"2018","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.sigpro.2026.110537_bib0003","unstructured":"C. Arnold, A. K\u00fcpfer, Alignment Helps Make the Most of Multimodal Data, 2024, (arXiv preprint arXiv: 2405.08454). https:\/\/arxiv.org\/abs\/2405.08454. 10.48550\/arXiv.2405.08454."},{"key":"10.1016\/j.sigpro.2026.110537_bib0004","series-title":"ICASSP 2024 - IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"10206","article-title":"SDIF\u2013DA: a shallow\u2013to\u2013deep interaction framework with data augmentation for multi-modal intent detection","author":"Huang","year":"2024"},{"key":"10.1016\/j.sigpro.2026.110537_bib0005","series-title":"Proceedings of the 30th ACM International Conference on Multimedia","first-page":"1688","article-title":"MIntRec: a new dataset for multimodal intent recognition","author":"Zhang","year":"2022"},{"key":"10.1016\/j.sigpro.2026.110537_bib0006","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"4361","article-title":"Towards emotion-aided multi-modal dialogue act classification","author":"Saha","year":"2020"},{"key":"10.1016\/j.sigpro.2026.110537_bib0007","unstructured":"H. Zhang, X. Wang, H. Xu, Q. Zhou, K. Gao, J. Su,...., Y. Chen, MIntRec2.0: A Large-scale Benchmark Dataset for Multimodal Intent Recognition and Out-of-scope Detection in Conversations, 2024, https:\/\/arxiv.org\/abs\/2403.10943. 10.48550\/arXiv.2403.10943."},{"key":"10.1016\/j.sigpro.2026.110537_bib0008","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1450","article-title":"CrossCLR: cross-modal contrastive learning for multi-modal video representations","author":"Zolfaghari","year":"2021"},{"key":"10.1016\/j.sigpro.2026.110537_bib0009","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.126373","article-title":"An effective multimodal representation and fusion method for multimodal intent recognition","volume":"548","author":"Huang","year":"2023","journal-title":"Neurocomputing"},{"key":"10.1016\/j.sigpro.2026.110537_bib0010","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia (MM \u201924), Melbourne, VIC, Australia","first-page":"515","article-title":"InMu-Net: advancing multi-modal intent detection via information bottleneck and multisensory processing","author":"Zhu","year":"2024"},{"key":"10.1016\/j.sigpro.2026.110537_bib0011","series-title":"Proceedings of the 29th International Conference on Computational Linguistics (COLING 2022), Gyeongju, Republic of Korea","first-page":"2572","article-title":"Multi-modal contrastive representation learning for entity alignment","author":"Lin","year":"2022"},{"key":"10.1016\/j.sigpro.2026.110537_bib0012","series-title":"Proceedings of the 31st ACM International Conference on Multimedia (MM \u201923), Ottawa, ON, Canada","first-page":"3317","article-title":"Meaformer: multi-modal entity alignment transformer for meta modality hybrid","author":"Chen","year":"2023"},{"issue":"7","key":"10.1016\/j.sigpro.2026.110537_bib0013","doi-asserted-by":"crossref","first-page":"990","DOI":"10.3390\/sym17070990","article-title":"Multi-modal entity alignment based on enhanced relationship learning and multi-layer feature fusion","volume":"17","author":"Li","year":"2025","journal-title":"Symmetry"},{"key":"10.1016\/j.sigpro.2026.110537_bib0014","series-title":"Proceedings of the 31st International Conference on Computational Linguistics (COLING 2025)","first-page":"7851","article-title":"SGMEA: structure-guided multimodal entity alignment","author":"Cheng","year":"2025"},{"key":"10.1016\/j.sigpro.2026.110537_bib0015","doi-asserted-by":"crossref","first-page":"124","DOI":"10.1016\/j.knosys.2018.07.041","article-title":"Multimodal sentiment analysis using hierarchical fusion with context modeling","volume":"161","author":"Majumder","year":"2018","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.sigpro.2026.110537_bib0016","unstructured":"G. Barnum, S. Talukder, Y. Yue, On the Benefits of Early Fusion in Multimodal Representation Learning, 2020, 10.48550\/arXiv.2011.07191."},{"key":"10.1016\/j.sigpro.2026.110537_bib0017","unstructured":"Z. Liu, Y. Shen, V.B. Lakshminarasimhan, P.P. Liang, A. Zadeh, L.-P. Morency, Efficient Low-rank Multimodal Fusion with Modality-Specific Factors, 2018, https:\/\/arxiv.org\/abs\/1806.00064."},{"key":"10.1016\/j.sigpro.2026.110537_bib0018","series-title":"Advances in Neural Information Processing Systems (NeurIPS 2019)","article-title":"Deep multimodal multilinear fusion with high-order polynomial pooling","volume":"32","author":"Hou","year":"2019"},{"key":"10.1016\/j.sigpro.2026.110537_bib0019","series-title":"Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence","article-title":"Memory fusion network for multi-view sequential learning","volume":"32","author":"Zadeh","year":"2018"},{"key":"10.1016\/j.sigpro.2026.110537_bib0020","series-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics","first-page":"6558","article-title":"Multimodal transformer for unaligned multimodal language sequences","volume":"32","author":"Tsai","year":"2019"},{"key":"10.1016\/j.sigpro.2026.110537_bib0021","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"1122","article-title":"MISA: modality-invariant and-Specific representations for multimodal sentiment analysis","author":"Hazarika","year":"2020"},{"key":"10.1016\/j.sigpro.2026.110537_bib0022","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"17114","article-title":"Token-level contrastive learning with modality-aware prompting for multimodal intent recognition","volume":"38","author":"Zhou","year":"2024"},{"key":"10.1016\/j.sigpro.2026.110537_bib0023","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"2359","article-title":"Integrating multimodal information in large pretrained transformers","volume":"2020","author":"Rahman","year":"2020"},{"key":"10.1016\/j.sigpro.2026.110537_bib0024","series-title":"Advances in Neural Information Processing Systems","article-title":"Bilinear attention networks","volume":"31","author":"Kim","year":"2018"},{"key":"10.1016\/j.sigpro.2026.110537_bib0025","doi-asserted-by":"crossref","unstructured":"Z. Pan, Z. Luo, J. Yang, H. Li, Multi-modal attention for speech emotion recognition, 2020, 2009.04107.","DOI":"10.21437\/Interspeech.2020-1653"},{"key":"10.1016\/j.sigpro.2026.110537_bib0026","series-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"4693","article-title":"Is cross-attention preferable to self-attention for multi-modal emotion recognition?","author":"Rajan","year":"2022"},{"key":"10.1016\/j.sigpro.2026.110537_bib0027","doi-asserted-by":"crossref","DOI":"10.3389\/fcomp.2024.1304687","article-title":"EmoAsst: emotion recognition assistant via text-guided transfer learning on pre-trained visual and acoustic models","volume":"6","author":"Wang","year":"2024","journal-title":"Front. Comput. Sci."},{"key":"10.1016\/j.sigpro.2026.110537_bib0028","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia (MM \u201924)","first-page":"9330","article-title":"Ada2I: enhancing modality balance for multimodal conversational emotion recognition","author":"Nguyen","year":"2024"},{"issue":"11","key":"10.1016\/j.sigpro.2026.110537_bib0029","doi-asserted-by":"crossref","first-page":"5468","DOI":"10.1109\/TKDE.2023.3340732","article-title":"A clustering framework for unsupervised and semi-supervised new intent discovery","volume":"36","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"10.1016\/j.sigpro.2026.110537_bib0030","series-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.sigpro.2026.110537_bib0031","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1359","article-title":"M3ER: Multiplicative multimodal emotion recognition using facial, textual, and speech cues","volume":"34","author":"Mittal","year":"2020"},{"key":"10.1016\/j.sigpro.2026.110537_bib0032","series-title":"International Conference on Intelligent Computing","article-title":"Auxiliary context module and weighted multihead fusion for multimodal intent recognition","author":"Xia","year":"2025"},{"key":"10.1016\/j.sigpro.2026.110537_bib0033","unstructured":"A. Baevski, Y. Zhou, A. Mohamed, M. Auli, wav2vec 2.0: A framework for self-supervised learning of speech representations, (2020). 10.48550\/arXiv.2006.11477."},{"key":"10.1016\/j.sigpro.2026.110537_bib0034","doi-asserted-by":"crossref","unstructured":"T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, et al., HuggingFace\u2019s Transformers: State-of-the-art Natural Language Processing, (2019). 10.48550\/arXiv.1910.03771.","DOI":"10.18653\/v1\/2020.emnlp-demos.6"},{"key":"10.1016\/j.sigpro.2026.110537_bib0035","series-title":"2009 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"248","article-title":"ImageNet: a large-scale hierarchical image database","author":"Deng","year":"2009"},{"key":"10.1016\/j.sigpro.2026.110537_bib0036","unstructured":"I. Loshchilov, F. Hutter, Decoupled Weight Decay Regularization, (2017). 10.48550\/arXiv.1711.05101."}],"container-title":["Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0165168426000514?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0165168426000514?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T20:33:20Z","timestamp":1777494800000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0165168426000514"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":36,"alternative-id":["S0165168426000514"],"URL":"https:\/\/doi.org\/10.1016\/j.sigpro.2026.110537","relation":{},"ISSN":["0165-1684"],"issn-type":[{"value":"0165-1684","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SaliText: A multimodal intent recognition method with saliency and text-guided fusion","name":"articletitle","label":"Article Title"},{"value":"Signal Processing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.sigpro.2026.110537","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"110537"}}