{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:28Z","timestamp":1781538988919,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Science Research Project of Hebei Education Department of China","award":["QN2024196"],"award-info":[{"award-number":["QN2024196"]}]},{"name":"Science Research Project of Hebei Education Department of China","award":["QN2026772"],"award-info":[{"award-number":["QN2026772"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810808","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"797-806","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Mitigating Multimodal Inconsistency via Cognitive Dual-Pathway Reasoning for Intent Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-4984-4347","authenticated-orcid":false,"given":"Yifan","family":"Wang","sequence":"first","affiliation":[{"name":"Hebei University of Science and Technology, Shijiazhuang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1956-1047","authenticated-orcid":false,"given":"Peiwu","family":"Wang","sequence":"additional","affiliation":[{"name":"Hebei University of Science and Technology, Shijiazhuang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8902-1702","authenticated-orcid":false,"given":"Yunxian","family":"Chi","sequence":"additional","affiliation":[{"name":"Hebei University of Science and Technology, Shijiazhuang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4421-273X","authenticated-orcid":false,"given":"Zhinan","family":"Gou","sequence":"additional","affiliation":[{"name":"Hebei University of Economics and Business, Shijiazhuang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0141-0743","authenticated-orcid":false,"given":"Kai","family":"Gao","sequence":"additional","affiliation":[{"name":"Hebei University of Science and Technology, Shijiazhuang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"Konstantinos Bousmalis George Trigeorgis Nathan Silberman Dilip Krishnan and Dumitru Erhan. 2016. Domain separation networks. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"crossref","unstructured":"Santiago Castro Devamanyu Hazarika Ver\u00f3nica P\u00e9rez-Rosas Roger Zimmermann Rada Mihalcea and Soujanya Poria. 2019. Towards multimodal sarcasm detection (an _obviously_ perfect paper). arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1906.01815 (2019).","DOI":"10.18653\/v1\/P19-1455"},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/935"},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","unstructured":"Sanyuan Chen Chengyi Wang Zhengyang Chen Yu Wu Shujie Liu Zhuo Chen Jinyu Li Naoyuki Kanda Takuya Yoshioka Xiong Xiao Jian Wu Long Zhou Shuo Ren Yanmin Qian Yao Qian Jian Wu Michael Zeng Xiangzhan Yu and Furu Wei. 2022. WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing. IEEE Journal of Selected Topics in Signal Processing 16 6 (2022) 1505\u20131518. 10.1109\/JSTSP.2022.3188113","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.972"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-short.105"},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"crossref","unstructured":"Benedict\u00a0GC Dellaert Suzanne\u00a0B Shu Theo\u00a0A Arentze Tom Baker Kristin Diehl Bas Donkers Nathanael\u00a0J Fast Gerald H\u00e4ubl Heidi Johnson Uma\u00a0R Karmarkar et\u00a0al. 2020. Consumer decisions with artificially intelligent voice assistants. Marketing Letters 31 (2020) 335\u2013347.","DOI":"10.1007\/s11002-020-09537-5"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888033"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2025\/582"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"crossref","unstructured":"Wei Han Hui Chen and Soujanya Poria. 2021. Improving multimodal fusion with hierarchical mutual information maximization for multimodal sentiment analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2109.00412 (2021).","DOI":"10.18653\/v1\/2021.emnlp-main.723"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1211"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","unstructured":"Bo Hu Kai Zhang Yanghai Zhang and Yuyang Ye. 2025. Adaptive Multimodal Fusion: Dynamic Attention Allocation for Intent Recognition. Proceedings of the AAAI Conference on Artificial Intelligence 39 16 (Apr. 2025) 17267\u201317275. 10.1609\/aaai.v39i16.33898","DOI":"10.1609\/aaai.v39i16.33898"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446922"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i25.34883"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","unstructured":"Sepideh Kaffash An\u00a0Truong Nguyen and Joe Zhu. 2021. Big data algorithms and applications in intelligent transportation system: A review and bibliometric analysis. International Journal of Production Economics 231 (2021) 107868. 10.1016\/j.ijpe.2020.107868","DOI":"10.1016\/j.ijpe.2020.107868"},{"key":"e_1_3_3_1_19_2","unstructured":"Daniel Kahneman. 2011. Thinking fast and slow. Farrar Straus and Giroux (2011)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Christoph Kofler Martha Larson and Alan Hanjalic. 2016. User intent in multimedia search: a survey of the state of the art and future challenges. ACM Computing Surveys (CSUR) 49 2 (2016) 1\u201337.","DOI":"10.1145\/2954930"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"crossref","unstructured":"Solomon Kullback and Richard\u00a0A Leibler. 1951. On information and sufficiency. The annals of mathematical statistics 22 1 (1951) 79\u201386.","DOI":"10.1214\/aoms\/1177729694"},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"crossref","unstructured":"Jianhua Lin. 2002. Divergence measures based on the Shannon entropy. IEEE Transactions on Information theory 37 1 (2002) 145\u2013151.","DOI":"10.1109\/18.61115"},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_3_1_24_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_3_1_25_2","volume-title":"International Conference on Learning Representations","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"e_1_3_3_1_26_2","doi-asserted-by":"publisher","DOI":"10.1145\/3577190.3614127"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","unstructured":"Jong\u00a0Hak Moon Hyungyung Lee Woncheol Shin Young-Hak Kim and Edward Choi. 2022. Multi-Modal Understanding and Generation for Medical Images and Text via Vision-Language Pre-Training. IEEE Journal of Biomedical and Health Informatics 26 12 (2022) 6070\u20136080. 10.1109\/JBHI.2022.3207502","DOI":"10.1109\/JBHI.2022.3207502"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICMLA55696.2022.00127"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"e_1_3_3_1_30_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.402"},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"crossref","unstructured":"Claude\u00a0E Shannon. 1948. A mathematical theory of communication. The Bell system technical journal 27 3 (1948) 379\u2013423.","DOI":"10.1002\/j.1538-7305.1948.tb01338.x"},{"key":"e_1_3_3_1_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02546"},{"key":"e_1_3_3_1_33_2","doi-asserted-by":"publisher","unstructured":"Pallavi Tiwari Bhaskar Pant Mahmoud\u00a0M. Elarabawy Mohammed Abd-Elnaby Noor Mohd Gaurav Dhiman and Subhash Sharma. 2022. CNN Based Multiclass Brain Tumor Detection Using Medical Imaging. Computational Intelligence and Neuroscience 2022 1 (2022) 1830010. arXiv:https:\/\/onlinelibrary.wiley.com\/doi\/pdf\/10.1155\/2022\/183001010.1155\/2022\/1830010","DOI":"10.1155\/2022\/1830010"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890465"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","unstructured":"Wei Xu. 2019. Toward human-centered AI: a perspective from human-computer interaction. Interactions 26 4 (June 2019) 42\u201346. 10.1145\/3328485","DOI":"10.1145\/3328485"},{"key":"e_1_3_3_1_37_2","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems","author":"Yang Qu","unstructured":"Qu Yang, Xiyang Li, Fu Lin, and Mang Ye. [n. d.]. Adaptive Re-calibration Learning for Balanced Multimodal Intention Recognition. In The Thirty-ninth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17289"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475585"},{"key":"e_1_3_3_1_40_2","doi-asserted-by":"crossref","unstructured":"Amir Zadeh Minghai Chen Soujanya Poria Erik Cambria and Louis-Philippe Morency. 2017. Tensor fusion network for multimodal sentiment analysis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1707.07250 (2017).","DOI":"10.18653\/v1\/D17-1115"},{"key":"e_1_3_3_1_41_2","unstructured":"Amir Zadeh Rowan Zellers Eli Pincus and Louis-Philippe Morency. 2016. Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1606.06259 (2016)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"},{"key":"e_1_3_3_1_43_2","unstructured":"Werner Zellinger Thomas Grubinger Edwin Lughofer Thomas Natschl\u00e4ger and Susanne Saminger-Platz. 2017. Central moment discrepancy (CMD) for domain-invariant representation learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1702.08811 (2017)."},{"key":"e_1_3_3_1_44_2","unstructured":"Hanlei Zhang Xiaoteng Li Hua Xu Panpan Zhang Kang Zhao and Kai Gao. 2021. TEXTOIR: An integrated and visualized platform for text open intent recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2110.15063 (2021)."},{"key":"e_1_3_3_1_45_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Zhang Hanlei","year":"2024","unstructured":"Hanlei Zhang, Xin Wang, Hua Xu, Qianrui Zhou, Kai Gao, Jianhua Su, jinyue Zhao, Wenrui Li, and Yanting Chen. 2024. MIntRec2.0: A Large-scale Benchmark Dataset for Multimodal Intent Recognition and Out-of-scope Detection in Conversations. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=nY9nITZQjc"},{"key":"e_1_3_3_1_46_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547906"},{"key":"e_1_3_3_1_47_2","unstructured":"Hanlei Zhang Qianrui Zhou Hua Xu Jianhua Su Roberto Evans and Kai Gao. 2024. Multimodal Classification and Out-of-distribution Detection for Multimodal Intent Understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.12453 (2024)."},{"key":"e_1_3_3_1_48_2","doi-asserted-by":"publisher","unstructured":"Qianrui Zhou Hua Xu Hao Li Hanlei Zhang Xiaohan Zhang Yifan Wang and Kai Gao. 2024. Token-Level Contrastive Learning with Modality-Aware Prompting for Multimodal Intent Recognition. Proceedings of the AAAI Conference on Artificial Intelligence 38 15 (Mar. 2024) 17114\u201317122. 10.1609\/aaai.v38i15.29656","DOI":"10.1609\/aaai.v38i15.29656"},{"key":"e_1_3_3_1_49_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.1130"},{"key":"e_1_3_3_1_50_2","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681623"},{"key":"e_1_3_3_1_51_2","unstructured":"Yicheng Zou Hongwei Liu Tao Gui Junzhe Wang Qi Zhang Meng Tang Haixiang Li and Daniel Wang. 2022. Divide and conquer: Text semantic matching with disentangled keywords and intents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2203.02898 (2022)."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:29:58Z","timestamp":1781537398000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810808"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":50,"alternative-id":["10.1145\/3805622.3810808","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810808","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}