{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T05:16:29Z","timestamp":1743138989765,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":54,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819620708"},{"type":"electronic","value":"9789819620715"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2071-5_20","type":"book-chapter","created":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T16:01:18Z","timestamp":1735747278000},"page":"268-282","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Understanding the\u00a0Roles of\u00a0Visual Modality in\u00a0Multimodal Dialogue: An Empirical Study"],"prefix":"10.1007","author":[{"given":"Qian","family":"Cao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruihua","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xu","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,2]]},"reference":[{"unstructured":"Adiwardana, D., et\u00a0al.: Towards a human-like open-domain chatbot. arXiv preprint arXiv:2001.09977 (2020)","key":"20_CR1"},{"doi-asserted-by":"crossref","unstructured":"Alamri, H., et\u00a0al.: Audio visual scene-aware dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7558\u20137567 (2019)","key":"20_CR2","DOI":"10.1109\/CVPR.2019.00774"},{"unstructured":"Alayrac, J., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022 (2022)","key":"20_CR3"},{"doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., et\u00a0al.: VQA: visual question answering. In: International Conference on Computer Vision (ICCV) (2015)","key":"20_CR4","DOI":"10.1109\/ICCV.2015.279"},{"doi-asserted-by":"crossref","unstructured":"Bao, S., He, H., Wang, F., Wu, H., Wang, H.: Plato: pre-trained dialogue generation model with discrete latent variable. arXiv preprint arXiv:1910.07931 (2019)","key":"20_CR5","DOI":"10.18653\/v1\/2020.acl-main.9"},{"unstructured":"Brown, T.B., et\u00a0al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020 (2020)","key":"20_CR6"},{"doi-asserted-by":"crossref","unstructured":"Cao, Q., Chen, X., Song, R., Wang, X., Huang, X., Ren, Y.: See or guess: counterfactually regularized image captioning. arXiv preprint arXiv:2408.16809 (2024)","key":"20_CR7","DOI":"10.1145\/3664647.3681458"},{"doi-asserted-by":"crossref","unstructured":"Carion, N., et\u00a0al.: End-to-end object detection with transformers. In: European Conference on Computer Vision, pp. 213\u2013229. Springer (2020)","key":"20_CR8","DOI":"10.1007\/978-3-030-58452-8_13"},{"issue":"2","key":"20_CR9","doi-asserted-by":"publisher","first-page":"53:1","DOI":"10.1145\/3606368","volume":"42","author":"X Chen","year":"2024","unstructured":"Chen, X., et al.: Multimodal dialog systems with dual knowledge-enhanced generative pretrained language model. ACM Trans. Inf. Syst. 42(2), 53:1-53:25 (2024)","journal-title":"ACM Trans. Inf. Syst."},{"unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., et\u00a0al.: Microsoft coco captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)","key":"20_CR10"},{"unstructured":"Chowdhery, A., et\u00a0al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24, 240:1\u2013240:113 (2023). http:\/\/jmlr.org\/papers\/v24\/22-1144.html","key":"20_CR11"},{"doi-asserted-by":"crossref","unstructured":"Das, A., et\u00a0al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","key":"20_CR12","DOI":"10.1109\/CVPR.2017.121"},{"unstructured":"Devlin, J., et\u00a0al.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)","key":"20_CR13"},{"unstructured":"Dong, L., et al.: Unified language model pre-training for natural language understanding and generation. Adv. Neural Inf. Process. Syst. (2019)","key":"20_CR14"},{"unstructured":"Du, Z., Zeng, A., Dong, Y., Tang, J.: Understanding emergent abilities of language models from the loss perspective. arXiv preprint arXiv:2403.15796 (2024)","key":"20_CR15"},{"doi-asserted-by":"crossref","unstructured":"Dziri, N., Kamalloo, E., et\u00a0al.: Augmenting neural response generation with context-aware topical attention. arXiv preprint arXiv:1811.01063 (2018)","key":"20_CR16","DOI":"10.18653\/v1\/W19-4103"},{"doi-asserted-by":"crossref","unstructured":"Elliott, D.: Adversarial evaluation of multimodal machine translation. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 2974\u20132978 (2018)","key":"20_CR17","DOI":"10.18653\/v1\/D18-1329"},{"doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A.: Unit: multimodal multitask learning with a unified transformer. In: Proceedings of the International Conference on Computer Vision (2021)","key":"20_CR18","DOI":"10.1109\/ICCV48922.2021.00147"},{"doi-asserted-by":"crossref","unstructured":"Huang, L., et\u00a0al.: A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions. arXiv preprint arXiv:2311.05232 (2023)","key":"20_CR19","DOI":"10.1145\/3703155"},{"key":"20_CR20","first-page":"10944","volume":"34","author":"Y Huang","year":"2021","unstructured":"Huang, Y., et al.: What makes multi-modal learning better than single (provably). Adv. Neural. Inf. Process. Syst. 34, 10944\u201310956 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"doi-asserted-by":"publisher","unstructured":"Ji, Z., et\u00a0al.: Survey of hallucination in natural language generation. ACM Comput. Surv. 55(12), 248:1\u2013248:38 (2023). https:\/\/doi.org\/10.1145\/3571730","key":"20_CR21","DOI":"10.1145\/3571730"},{"issue":"3","key":"20_CR22","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1109\/TBDATA.2019.2921572","volume":"7","author":"J Johnson","year":"2019","unstructured":"Johnson, J., Douze, M., J\u00e9gou, H.: Billion-scale similarity search with GPUs. IEEE Trans. Big Data 7(3), 535\u2013547 (2019)","journal-title":"IEEE Trans. Big Data"},{"doi-asserted-by":"crossref","unstructured":"Kuznetsova, A., et\u00a0al.: The open images dataset V4. Int. J. Comput. Vis., pp. 1956\u20131981 (2020)","key":"20_CR23","DOI":"10.1007\/s11263-020-01316-z"},{"doi-asserted-by":"crossref","unstructured":"Lee, N., et\u00a0al.: Constructing multi-modal dialogue dataset by replacing text with semantically relevant images. Association for Computational Linguistics (2021)","key":"20_CR24","DOI":"10.18653\/v1\/2021.acl-short.113"},{"doi-asserted-by":"crossref","unstructured":"Li, J., Galley, M., Brockett, C., Gao, J., Dolan, B.: A diversity-promoting objective function for neural conversation models. arXiv preprint arXiv:1510.03055 (2015)","key":"20_CR25","DOI":"10.18653\/v1\/N16-1014"},{"unstructured":"Li, J., et\u00a0al.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)","key":"20_CR26"},{"unstructured":"Liang, Z., Hu, H., Xu, C., Tao, C., Geng, X., et\u00a0al.: Maria: a visual experience powered conversational agent. arXiv preprint arXiv:2105.13073 (2021)","key":"20_CR27"},{"unstructured":"Lin, C.Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)","key":"20_CR28"},{"unstructured":"Lin, H., Ruan, L., Xia, W., Liu, P., Wen, J., et\u00a0al.: Tiktalk: a multi-modal dialogue dataset for real-world chitchat (2023)","key":"20_CR29"},{"doi-asserted-by":"crossref","unstructured":"Liu, G., Wang, S., Yu, J., Yin, J.: A survey on multimodal dialogue systems: recent advances and new frontiers. In: 2022 5th International Conference on Advanced Electronic Materials, Computers and Software Engineering (AEMCSE) (2022)","key":"20_CR30","DOI":"10.1109\/AEMCSE55572.2022.00170"},{"unstructured":"Liu, H., et\u00a0al.: A survey on hallucination in large vision-language models. arXiv preprint arXiv:2402.00253 (2024)","key":"20_CR31"},{"unstructured":"Liu, H., et\u00a0al.: Visual instruction tuning. In: Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023 (2023)","key":"20_CR32"},{"unstructured":"Meng, Y., Wang, S., Han, Q., et\u00a0al.: OpenViDial: a large-scale, open-domain dialogue dataset with visual contexts. arXiv preprint arXiv:2012.15015 (2020)","key":"20_CR33"},{"unstructured":"Mostafazadeh, N., et\u00a0al.: Image-grounded conversations: multimodal context for natural question and response generation. In: Proceedings of the Eighth International Joint Conference on Natural Language Processing (2017)","key":"20_CR34"},{"doi-asserted-by":"crossref","unstructured":"Nie, L., Wang, W., et\u00a0al.: Multimodal dialog system: generating responses via adaptive decoders. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 1098\u20131106 (2019)","key":"20_CR35","DOI":"10.1145\/3343031.3350923"},{"unstructured":"OpenAI: GPT-4 technical report (2023)","key":"20_CR36"},{"doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","key":"20_CR37","DOI":"10.3115\/1073083.1073135"},{"doi-asserted-by":"crossref","unstructured":"Poria, S., et\u00a0al.: MELD: a multimodal multi-party dataset for emotion recognition in conversations, pp. 527\u2013536. Association for Computational Linguistics (2019)","key":"20_CR38","DOI":"10.18653\/v1\/P19-1050"},{"unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning. PMLR (2021)","key":"20_CR39"},{"unstructured":"Schaeffer, R., et\u00a0al.: Are emergent abilities of large language models a mirage? In: NeurIPS 2023 (2023)","key":"20_CR40"},{"doi-asserted-by":"crossref","unstructured":"Serban, I., et\u00a0al.: Building end-to-end dialogue systems using generative hierarchical neural network models. In: Proceedings of the AAAI Conference (2016)","key":"20_CR41","DOI":"10.1609\/aaai.v30i1.9883"},{"unstructured":"Shuster, K., Humeau, S., Bordes, A., Weston, J.: Image chat: engaging grounded conversations. arXiv preprint arXiv:1811.00945 (2018)","key":"20_CR42"},{"doi-asserted-by":"crossref","unstructured":"Shuster, K., Smith, E.M., Ju, D., Weston, J.: Multi-modal open-domain dialogue. arXiv preprint arXiv:2010.01082 (2020)","key":"20_CR43","DOI":"10.18653\/v1\/2021.emnlp-main.398"},{"doi-asserted-by":"crossref","unstructured":"Specia, L., Frank, S., Sima\u2019An, K., et\u00a0al.: A shared task on multimodal machine translation and crosslingual image description. In: Proceedings of the First Conference on Machine Translation: Volume 2, pp. 543\u2013553 (2016)","key":"20_CR44","DOI":"10.18653\/v1\/W16-2346"},{"doi-asserted-by":"crossref","unstructured":"Sundar, A., Heck, L.: Multimodal conversational AI: a survey of datasets and approaches. arXiv preprint arXiv:2205.06907 (2022)","key":"20_CR45","DOI":"10.18653\/v1\/2022.nlp4convai-1.12"},{"unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)","key":"20_CR46"},{"unstructured":"Wang, S., Meng, Y., et\u00a0al.: Modeling text-visual mutual dependency for multi-modal dialog generation. arXiv preprint arXiv:2105.14445 (2021)","key":"20_CR47"},{"unstructured":"Wei, J., et\u00a0al.: Emergent abilities of large language models. Trans. Mach. Learn. Res. (2022)","key":"20_CR48"},{"doi-asserted-by":"crossref","unstructured":"Wu, Z., et\u00a0al.: Good for misconceived reasons: an empirical revisiting on the need for visual context in multimodal machine translation. arXiv:2105.14462 (2021)","key":"20_CR49","DOI":"10.18653\/v1\/2021.acl-long.480"},{"doi-asserted-by":"crossref","unstructured":"Yang, Z., et\u00a0al.: Open domain dialogue generation with latent images. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 14239\u201314247 (2021)","key":"20_CR50","DOI":"10.1609\/aaai.v35i16.17675"},{"unstructured":"Zhang, T., Kishore, V., Wu, F., Weinberger, K.Q., Artzi, Y.: BERTScore: evaluating text generation with BERT. arXiv preprint arXiv:1904.09675 (2019)","key":"20_CR51"},{"doi-asserted-by":"crossref","unstructured":"Zhang, Y., et\u00a0al.: DialoGPT: large-scale generative pre-training for conversational response generation. arXiv preprint arXiv:1911.00536 (2019)","key":"20_CR52","DOI":"10.18653\/v1\/2020.acl-demos.30"},{"unstructured":"Zheng, Y., Chen, G., Liu, X., Lin, K.: MMChat: multi-modal chat dataset on social media. arXiv preprint arXiv:2108.07154 (2021)","key":"20_CR53"},{"unstructured":"Zhu, D., et\u00a0al.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. In: ICLR 2024 (2024)","key":"20_CR54"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2071-5_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T16:03:06Z","timestamp":1735747386000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2071-5_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819620708","9789819620715"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2071-5_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"2 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}