{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:18:01Z","timestamp":1776881881755,"version":"3.51.2"},"publisher-location":"Singapore","reference-count":76,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819608461","type":"print"},{"value":"9789819608478","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,14]],"date-time":"2024-12-14T00:00:00Z","timestamp":1734134400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,14]],"date-time":"2024-12-14T00:00:00Z","timestamp":1734134400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0847-8_3","type":"book-chapter","created":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T04:26:04Z","timestamp":1734063964000},"page":"33-47","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Recent Advances on\u00a0Multi-modal Dialogue Systems: A Survey"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-8075-7528","authenticated-orcid":false,"given":"Fenghua","family":"Cheng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4515-6792","authenticated-orcid":false,"given":"Xue","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7634-8758","authenticated-orcid":false,"given":"Haoyang","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4092-1161","authenticated-orcid":false,"given":"Jiangcheng","family":"Sang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4919-7075","authenticated-orcid":false,"given":"Wenqi","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,12,14]]},"reference":[{"key":"3_CR1","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"3_CR2","unstructured":"Adiwardana, D., et\u00a0al.: Towards a human-like open-domain chatbot. arXiv preprint arXiv:2001.09977 (2020)"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Alamri, H., et\u00a0al.: Audio visual scene-aware dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7558\u20137567 (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"3_CR5","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Neural module networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 39\u201348 (2016)","DOI":"10.1109\/CVPR.2016.12"},{"key":"3_CR6","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"3_CR7","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"3_CR8","unstructured":"Byeon, M., Park, B., Kim, H., Lee, S., Baek, W., Kim, S.: COYO-700M: image-text pair dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset (2022)"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"3_CR11","unstructured":"Chen, X., et\u00a0al.: PaLI: a jointly-scaled multilingual language-image model. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Cheng, F., Li, X., Huang, Z., Wang, J., Wang, S.: Event-content-oriented dialogue generation in short video. In: 2024 Annual Conference of the North American Chapter of the Association for Computational Linguistics (2024)","DOI":"10.18653\/v1\/2024.naacl-long.229"},{"key":"3_CR13","unstructured":"Schuhmann, C., K\u00f6pf, A., Vencu, R., Coombes, T., Beaumont, R.: LAION COCO: 600M synthetic captions from laion2B-EN (2022)"},{"key":"3_CR14","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"3_CR15","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"3_CR16","doi-asserted-by":"crossref","unstructured":"Feng, J., et al.: MMDialog: a large-scale multi-turn dialogue dataset towards multi-modal open-domain conversation. arXiv preprint arXiv:2211.05719 (2022)","DOI":"10.18653\/v1\/2023.acl-long.405"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Firdaus, M., Thakur, N., Ekbal, A.: MultiDM-GCN: aspect-guided response generation in multi-domain multi-modal dialogue system using graph convolutional network. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 2318\u20132328 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.210"},{"issue":"2","key":"3_CR18","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3430752","volume":"12","author":"M Firdaus","year":"2021","unstructured":"Firdaus, M., Thakur, N., Ekbal, A.: Aspect-aware response generation for multimodal dialogue system. ACM Trans. Intell. Syst. Technol. (TIST) 12(2), 1\u201333 (2021)","journal-title":"ACM Trans. Intell. Syst. Technol. (TIST)"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Fu, J., Ng, S.K., Jiang, Z., Liu, P.: GPTScore: evaluate as you desire. arXiv preprint arXiv:2302.04166 (2023)","DOI":"10.18653\/v1\/2024.naacl-long.365"},{"key":"3_CR20","unstructured":"Gadre, S.Y., et\u00a0al.: DataComp: in search of the next generation of multimodal datasets. Adv. Neural Info. Process. Syst. 36 (2024)"},{"key":"3_CR21","unstructured":"Gao, L., et\u00a0al.: The Pile: an 800GB dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027 (2020)"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Han, S., Hessel, J., Dziri, N., Choi, Y., Yu, Y.: CHAMPAGNE: learning real-world conversation from large-scale web videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15498\u201315509 (2023)","DOI":"10.1109\/ICCV51070.2023.01421"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3_CR25","unstructured":"Henderson, M., et\u00a0al.: A repository of conversational datasets. arXiv preprint arXiv:1904.06472 (2019)"},{"issue":"6","key":"3_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain, M.Z., Sohel, F., Shiratuddin, M.F., Laga, H.: A comprehensive survey of deep learning for image captioning. ACM Comput. Surv. (CsUR) 51(6), 1\u201336 (2019)","journal-title":"ACM Comput. Surv. (CsUR)"},{"issue":"S1","key":"3_CR27","doi-asserted-by":"publisher","first-page":"S63","DOI":"10.1121\/1.2016299","volume":"62","author":"F Jelinek","year":"1977","unstructured":"Jelinek, F., Mercer, R.L., Bahl, L.R., Baker, J.K.: Perplexity-a measure of the difficulty of speech recognition tasks. J. Acoust. Soc. Am. 62(S1), S63\u2013S63 (1977)","journal-title":"J. Acoust. Soc. Am."},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Kottur, S., Moon, S., Geramifard, A., Damavandi, B.: SIMMC 2.0: a task-oriented dialog dataset for immersive multimodal conversations. arXiv preprint arXiv:2104.08667 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.401"},{"key":"3_CR29","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: a lite BERT for self-supervised learning of language representations. In: International Conference on Learning Representations (2019)"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Le, H., Chen, N.F., Hoi, S.C.: VGNMN: video-grounded neural module network to video-grounded language tasks. arXiv preprint arXiv:2104.07921 (2021)","DOI":"10.18653\/v1\/2022.naacl-main.247"},{"key":"3_CR31","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N., Hoi, S.: Multimodal transformer networks for end-to-end video-grounded dialogue systems. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 5612\u20135623 (2019)","DOI":"10.18653\/v1\/P19-1564"},{"key":"3_CR32","unstructured":"Lee, Y.J., Ko, B., Kim, H.G., Choi, H.J.: DialogCC: large-scale multi-modal dialogue dataset. arXiv preprint arXiv:2212.04119 (2022)"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Li, J., Galley, M., Brockett, C., Gao, J., Dolan, W.B.: A diversity-promoting objective function for neural conversation models. In: Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 110\u2013119 (2016)","DOI":"10.18653\/v1\/N16-1014"},{"key":"3_CR34","unstructured":"Li, S., Tajbakhsh, N.: SciGraphQA: a large-scale synthetic multi-turn question-answering dataset for scientific graphs. arXiv preprint arXiv:2308.03349 (2023)"},{"key":"3_CR35","unstructured":"Liang, Z., et al.: Maria: a visual experience powered conversational agent. arXiv preprint arXiv:2105.13073 (2021)"},{"key":"3_CR36","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"3_CR37","doi-asserted-by":"crossref","unstructured":"Lin, H., et\u00a0al.: TikTalk: a video-based dialogue dataset for multi-modal chitchat in real world. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1303\u20131313 (2023)","DOI":"10.1145\/3581783.3612425"},{"key":"3_CR38","doi-asserted-by":"crossref","unstructured":"Liu, G., Wang, S., Yu, J., Yin, J.: A survey on multimodal dialogue systems: recent advances and new frontiers. In: 2022 5th International Conference on Advanced Electronic Materials, Computers and Software Engineering (AEMCSE), pp. 845\u2013853. IEEE (2022)","DOI":"10.1109\/AEMCSE55572.2022.00170"},{"key":"3_CR39","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Info. Process. Syst. 32 (2019)"},{"key":"3_CR40","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., Kembhavi, A.: Unified-IO: a unified model for vision, language, and multi-modal tasks. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"3_CR41","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu, P., et al.: Learn to explain: Multimodal reasoning via thought chains for science question answering. Adv. Neural. Inf. Process. Syst. 35, 2507\u20132521 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR42","doi-asserted-by":"crossref","unstructured":"Ma, S., Cui, L., Dai, D., Wei, F., Sun, X.: LiveBot: generating live video comments based on visual and textual contexts. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 6810\u20136817 (2019)","DOI":"10.1609\/aaai.v33i01.33016810"},{"key":"3_CR43","doi-asserted-by":"crossref","unstructured":"Masry, A., Do, X.L., Tan, J.Q., Joty, S., Hoque, E.: ChartQA: a benchmark for question answering about charts with visual and logical reasoning. In: Findings of the Association for Computational Linguistics: ACL 2022, pp. 2263\u20132279 (2022)","DOI":"10.18653\/v1\/2022.findings-acl.177"},{"key":"3_CR44","unstructured":"Meng, Y., et al.: OpenViDial: a large-scale, open-domain dialogue dataset with visual contexts. arXiv preprint arXiv:2012.15015 (2020)"},{"key":"3_CR45","doi-asserted-by":"crossref","unstructured":"Methani, N., Ganguly, P., Khapra, M.M., Kumar, P.: PlotQA: reasoning over scientific plots. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1527\u20131536 (2020)","DOI":"10.1109\/WACV45572.2020.9093523"},{"key":"3_CR46","unstructured":"Mostafazadeh, N., et al.: Image-grounded conversations: multimodal context for natural question and response generation. arXiv preprint arXiv:1701.08251 (2017)"},{"key":"3_CR47","doi-asserted-by":"publisher","first-page":"336","DOI":"10.1007\/978-3-030-58523-5_20","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVIII","author":"V Murahari","year":"2020","unstructured":"Murahari, V., Batra, D., Parikh, D., Das, A.: Large-Scale Pretraining for Visual Dialog: A Simple State-of-the-Art Baseline. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVIII, pp. 336\u2013352. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58523-5_20"},{"key":"3_CR48","doi-asserted-by":"crossref","unstructured":"Nguyen, V.Q., Suganuma, M., Okatani, T.: Efficient attention mechanism for visual dialog that can handle all the interactions between multiple inputs. arXiv preprint arXiv:1911.11390 (2019)","DOI":"10.1007\/978-3-030-58586-0_14"},{"key":"3_CR49","unstructured":"Oquab, M., et\u00a0al.: Dinov2: learning robust visual features without supervision. Trans. Mach. Learn. Res. (2023)"},{"key":"3_CR50","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2Text: describing images using 1 Million captioned photographs. Adv. Neural Info. Process. Syst. 24 (2011)"},{"key":"3_CR51","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"3_CR52","doi-asserted-by":"crossref","unstructured":"Pi, R., et al.: MLLM-protector: ensuring MLLM\u2019s safety without hurting performance. arXiv preprint arXiv:2401.02906 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.895"},{"key":"3_CR53","doi-asserted-by":"crossref","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: MELD: a multimodal multi-party dataset for emotion recognition in conversations. arXiv preprint arXiv:1810.02508 (2018)","DOI":"10.18653\/v1\/P19-1050"},{"issue":"140","key":"3_CR54","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"3_CR55","doi-asserted-by":"crossref","unstructured":"Saha, A., Khapra, M., Sankaranarayanan, K.: Towards building large scale multimodal domain-aware conversation systems. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11331"},{"key":"3_CR56","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR57","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of CLIP-filtered 400 Million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"3_CR58","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"3_CR59","doi-asserted-by":"crossref","unstructured":"Shen, L., Zhan, H., Shen, X., Song, Y., Zhao, X.: Text is not enough: integrating visual impressions into open-domain dialogue generation. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 4287\u20134296 (2021)","DOI":"10.1145\/3474085.3475568"},{"key":"3_CR60","doi-asserted-by":"crossref","unstructured":"Shuster, K., Humeau, S., Bordes, A., Weston, J.: Image-Chat: engaging grounded conversations. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 2414\u20132429 (2020)","DOI":"10.18653\/v1\/2020.acl-main.219"},{"key":"3_CR61","doi-asserted-by":"crossref","unstructured":"Shuster, K., Smith, E.M., Ju, D., Weston, J.: Multi-modal open-domain dialogue. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 4863\u20134883 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.398"},{"key":"3_CR62","unstructured":"Shuster, K., et\u00a0al.: BlenderBot 3: a deployed conversational agent that continually learns to responsibly engage. arXiv preprint arXiv:2208.03188 (2022)"},{"key":"3_CR63","unstructured":"Sun, Q., et al.: Multimodal dialogue response generation. arXiv preprint arXiv:2110.08515 (2021)"},{"key":"3_CR64","doi-asserted-by":"crossref","unstructured":"Sun, Y., et al.: Ernie 2.0: a continual pre-training framework for language understanding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 8968\u20138975 (2020)","DOI":"10.1609\/aaai.v34i05.6428"},{"key":"3_CR65","doi-asserted-by":"crossref","unstructured":"Sundar, A., Heck, L.: Multimodal conversational AI: a survey of datasets and approaches. In: Proceedings of the 4th Workshop on NLP for Conversational AI, pp. 131\u2013147 (2022)","DOI":"10.18653\/v1\/2022.nlp4convai-1.12"},{"key":"3_CR66","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Info. Process. Syst.30 (2017)"},{"key":"3_CR67","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"3_CR68","unstructured":"Wang, S., Meng, Y., Li, X., Sun, X., Ouyang, R., Li, J.: OpenViDial 2.0: a larger-scale, open-domain dialogue generation dataset with visual contexts. arXiv preprint arXiv:2109.12761 (2021)"},{"key":"3_CR69","doi-asserted-by":"crossref","unstructured":"Wang, W., Chen, J., Jin, Q.: VideoIC: a video interactive comments dataset and multimodal multitask learning for comments generation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 2599\u20132607 (2020)","DOI":"10.1145\/3394171.3413890"},{"key":"3_CR70","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1492\u20131500 (2017)","DOI":"10.1109\/CVPR.2017.634"},{"key":"3_CR71","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"3_CR72","doi-asserted-by":"crossref","unstructured":"Zang, X., Liu, L., Wang, M., Song, Y., Zhang, H., Chen, J.: PhotoChat: a human-human dialogue dataset with photo sharing behavior for joint image-text modeling. arXiv preprint arXiv:2108.01453 (2021)","DOI":"10.18653\/v1\/2021.acl-long.479"},{"key":"3_CR73","unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: BERTScore: evaluating text generation with BERT. In: International Conference on Learning Representations (2019)"},{"key":"3_CR74","doi-asserted-by":"crossref","unstructured":"Zhao, J., et al.: M3ED: multi-modal multi-scene multi-label emotional dialogue database. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5699\u20135710 (2022)","DOI":"10.18653\/v1\/2022.acl-long.391"},{"key":"3_CR75","unstructured":"Zhao, W.X., et\u00a0al.: A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)"},{"key":"3_CR76","unstructured":"Zheng, Y., Chen, G., Liu, X., Sun, J.: MMChat: multi-modal chat dataset on social media. In: Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 5778\u20135786 (2022)"}],"container-title":["Lecture Notes in Computer Science","Advanced Data Mining and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0847-8_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,13]],"date-time":"2024-12-13T05:03:06Z","timestamp":1734066186000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0847-8_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,14]]},"ISBN":["9789819608461","9789819608478"],"references-count":76,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0847-8_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,14]]},"assertion":[{"value":"14 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ADMA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Advanced Data Mining and Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sydney, NSW","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Australia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"adma2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/adma2024.github.io\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}