{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T12:21:08Z","timestamp":1773490868184,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":28,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819570744","type":"print"},{"value":"9789819570751","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-7075-1_26","type":"book-chapter","created":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T11:13:41Z","timestamp":1773486821000},"page":"395-410","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Improving Intent Detection with\u00a0Hierarchical Multimodal Representation and\u00a0Triplet Contrastive 
Learning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6166-1690","authenticated-orcid":false,"given":"Lanlan","family":"Lu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5007-8805","authenticated-orcid":false,"given":"Qimeng","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3409-2092","authenticated-orcid":false,"given":"Yi","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,15]]},"reference":[{"key":"26_CR1","unstructured":"Wu, J., et al.: Seeing through deception: uncovering misleading creator intent in multimodal news with vision-language models. arXiv preprint arXiv:2505.15489 (2025)"},{"key":"26_CR2","unstructured":"Xin, J., et al.: I2MoE: interpretable multimodal interaction-aware mixture-of-experts. arXiv preprint arXiv:2505.19190 (2025)"},{"key":"26_CR3","unstructured":"Li, T., Liu, D.: MPID: a modality-preserving and interaction-driven fusion network for multimodal sentiment analysis. In: Proceedings of the 31st International Conference on Computational Linguistics, pp. 4313\u20134322 (2025)"},{"key":"26_CR4","unstructured":"Qian, C., et al.: DecAlign: hierarchical cross-modal alignment for decoupled multimodal representation learning. arXiv preprint arXiv:2503.11892 (2025)"},{"key":"26_CR5","doi-asserted-by":"crossref","unstructured":"Wang, C., Gu, Z., Wei, J.-M.: Spectral clustering and embedding with inter-class topology-preserving. Knowl.-Based Syst. 284, 111278 (2024)","DOI":"10.1016\/j.knosys.2023.111278"},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Zhu, Z., et al.: InMu-Net: advancing multi-modal intent detection via information bottleneck and multi-sensory processing. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 
515\u2013524 (2024)","DOI":"10.1145\/3664647.3681623"},{"key":"26_CR7","doi-asserted-by":"crossref","unstructured":"Shen, Y., Lin, X., Fan, W.: A-MESS: anchor based multimodal embedding with semantic synchronization for multimodal intent recognition. arXiv preprint arXiv:2503.19474 (2025)","DOI":"10.1109\/ICME59968.2025.11209824"},{"key":"26_CR8","unstructured":"Nguyen, Q.-M.T., Nguyen, L.-N.T., Nguyen, C.-V.T.: TECO: improving multimodal intent recognition with text enhancement through commonsense knowledge extraction. arXiv preprint arXiv:2412.08529 (2024)"},{"key":"26_CR9","unstructured":"Chen, Y., et al.: Prompt learning for multimodal intent recognition with modal alignment perception. Cogn. Comput., 1\u201312 (2024)"},{"key":"26_CR10","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: SDIF-DA: a shallow-to-deep interaction framework with data augmentation for multi-modal intent detection. In: ICASSP 2024\u20132024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 10206\u201310210. IEEE (2024)","DOI":"10.1109\/ICASSP48485.2024.10446922"},{"key":"26_CR11","doi-asserted-by":"crossref","unstructured":"Zhou, Q., et al.: Token-level contrastive learning with modality-aware prompting for multimodal intent recognition. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, no. 15, pp. 17114\u201317122 (2024)","DOI":"10.1609\/aaai.v38i15.29656"},{"key":"26_CR12","unstructured":"Yin, S., Huang, P., Xu, Y.: MIDLM: multi-intent detection with bidirectional large language models. In: Proceedings of the 31st International Conference on Computational Linguistics, pp. 2616\u20132625 (2025)"},{"key":"26_CR13","unstructured":"Tjandrasuwita, M., et al.: Understanding the emergence of multimodal representation alignment. 
arXiv preprint arXiv:2502.16282 (2025)"},{"key":"26_CR14","unstructured":"Sefidgaran, M., Zaidi, A., Krasnowski, P.: Generalization guarantees for representation learning via data-dependent gaussian mixture priors. arXiv preprint arXiv:2502.15540 (2025)"},{"key":"26_CR15","unstructured":"Yonay, O., Hammond, T., Yang, T.: Myna: masking-based contrastive learning of musical representations. arXiv preprint arXiv:2502.12511 (2025)"},{"key":"26_CR16","unstructured":"Zhou, G., et al.: CL-MFAP: a contrastive learning-based multimodal foundation model for molecular property prediction and antibiotic screening. arXiv preprint arXiv:2502.11001 (2025)"},{"key":"26_CR17","unstructured":"Panigrahi, A., et al.: Progressive distillation induces an implicit curriculum. arXiv preprint arXiv:2410.05464 (2024)"},{"key":"26_CR18","unstructured":"Li, H., et al.: Test-time adaptation for cross-modal retrieval with query shift. arXiv preprint arXiv:2410.15624 (2024)"},{"key":"26_CR19","doi-asserted-by":"crossref","unstructured":"Wennberg, U., Henter, G.: Learned transformer position embeddings have a low-dimensional structure. In: Proceedings of the 9th Workshop on Representation Learning for NLP (RepL4NLP-2024), pp. 237\u2013244 (2024)","DOI":"10.18653\/v1\/2024.repl4nlp-1.17"},{"key":"26_CR20","doi-asserted-by":"crossref","unstructured":"Ki, D., Park, C., Kim, H.: Mitigating semantic leakage in cross-lingual embeddings via orthogonality constraint. arXiv preprint arXiv:2409.15664 (2024)","DOI":"10.18653\/v1\/2024.repl4nlp-1.19"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: MIntRec: a new dataset for multimodal intent recognition. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 1688\u20131697 (2022)","DOI":"10.1145\/3503161.3547906"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"Saha, T., et al.: Towards emotion-aided multi-modal dialogue act classification. 
In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 4361\u20134372 (2020)","DOI":"10.18653\/v1\/2020.acl-main.402"},{"key":"26_CR23","unstructured":"Devlin, J., et al.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186 (2019)"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Graves, A., Schmidhuber, J.: Framewise phoneme classification with bidirectional LSTM and other neural network architectures. Neural Netw. 18(5-6), 602\u2013610 (2005)","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"26_CR25","unstructured":"Kingma, D.P.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"26_CR26","doi-asserted-by":"crossref","unstructured":"Tsai, Y.-H.H., et al.: Multimodal transformer for unaligned multimodal language sequences. In: Proceedings of the Conference. Association for Computational Linguistics. Meeting, vol. 2019, p. 6558 (2019)","DOI":"10.18653\/v1\/P19-1656"},{"key":"26_CR27","doi-asserted-by":"crossref","unstructured":"Rahman, W., et al.: Integrating multimodal information in large pretrained transformers. In: Proceedings of the Conference. Association for Computational Linguistics. Meeting, vol. 2020, p. 2359 (2020)","DOI":"10.18653\/v1\/2020.acl-main.214"},{"key":"26_CR28","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Zimmermann, R., Poria, S.: MISA: modality-invariant and -specific representations for multimodal sentiment analysis. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 
1122\u20131131 (2020)","DOI":"10.1145\/3394171.3413678"}],"container-title":["Lecture Notes in Computer Science","PRICAI 2025: Trends in Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-7075-1_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T11:13:43Z","timestamp":1773486823000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-7075-1_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819570744","9789819570751"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-7075-1_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"15 March 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRICAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific Rim International Conference on Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wellington","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Zealand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 
2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"22","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pricai2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.pricai.org\/2025\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}