{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,28]],"date-time":"2025-03-28T06:35:41Z","timestamp":1743143741097,"version":"3.40.3"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031727504"},{"type":"electronic","value":"9783031727511"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72751-1_22","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T09:52:13Z","timestamp":1729849933000},"page":"378-396","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["BI-MDRG: Bridging Image History in\u00a0Multimodal Dialogue Response Generation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2115-8459","authenticated-orcid":false,"given":"Hee Suk","family":"Yoon","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5580-5354","authenticated-orcid":false,"given":"Eunseop","family":"Yoon","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5119-2802","authenticated-orcid":false,"given":"Joshua Tian Jin","family":"Tee","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2761-9383","authenticated-orcid":false,"given":"Kang","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5725-9545","authenticated-orcid":false,"given":"Yu-Jung","family":"Heo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6692-962X","authenticated-orcid":false,"given":"Du-Seong","family":"Chang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0756-7179","authenticated-orcid":false,"given":"Chang D.","family":"Yoo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Alamri, H., et al.: Audio-visual scene-aware dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"22_CR2","unstructured":"Alayrac, J.B., et al.: Flamingo: a Visual Language Model for Few-Shot Learning. In: Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) Advances in Neural Information Processing Systems (2022). https:\/\/openreview.net\/forum?id=EbMuimAbPbs"},{"key":"22_CR3","doi-asserted-by":"crossref","unstructured":"Zijia Zhao, et al.: ChatSearch: a Dataset and a Generative Retrieval Model for General Conversational Image Retrieval. In: ICLR (2023). https:\/\/openreview.net\/forum?id=0unbjYPmbC","DOI":"10.2139\/ssrn.5034805"},{"key":"22_CR4","unstructured":"Awadalla, A., et al.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"22_CR5","unstructured":"Balaji, Y., et al.: eDIFF-I: text-to-image diffusion models with ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Huang, L., Liu, Y., Shen, Y., Zhao, D., Zhao, H.: AnyDoor: zero-shot object-level image customization. arXiv preprint arXiv:2307.09481 (2023)","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"22_CR7","doi-asserted-by":"publisher","first-page":"558","DOI":"10.1007\/978-3-031-20044-1_32","volume-title":"Computer Vision \u2013 ECCV 2022","author":"N Cohen","year":"2022","unstructured":"Cohen, N., Gal, R., Meirom, E.A., Chechik, G., Atzmon, Y.: This is my unicorn, fluffy: personalizing frozen vision-language representations. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, pp. 558\u2013577. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20044-1_32"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"22_CR9","doi-asserted-by":"publisher","unstructured":"Feng, J., et al.: MMDialog: a large-scale multi-turn dialogue dataset towards multi-modal open-domain conversation. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 7348\u20137363. Association for Computational Linguistics, Toronto, Canada (2023). https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.405","DOI":"10.18653\/v1\/2023.acl-long.405"},{"key":"22_CR10","first-page":"01618","volume":"2208","author":"R Gal","year":"2022","unstructured":"Gal, R., et al.: An image is worth one word: Personalizing text-to-image generation using textual inversion 2208, 01618 (2022)","journal-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion"},{"key":"22_CR11","doi-asserted-by":"publisher","unstructured":"Gan, Z., Cheng, Y., Kholy, A., Li, L., Liu, J., Gao, J.: Multi-step reasoning via recurrent dual attention for visual dialog. In: Korhonen, A., Traum, D., M\u00e0rquez, L. (eds.) Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 6463\u20136474. Association for Computational Linguistics, Florence, Italy (2019). https:\/\/doi.org\/10.18653\/v1\/P19-1648","DOI":"10.18653\/v1\/P19-1648"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Han, S., Hessel, J., Dziri, N., Choi, Y., Yu, Y.: CHAMPAGNE: learning real-world conversation from large-scale web videos. arXiv preprint arXiv:2303.09713 (2023)","DOI":"10.1109\/ICCV51070.2023.01421"},{"key":"22_CR13","unstructured":"Honnibal, M., Montani, I.: spaCy 2: natural language understanding with Bloom embeddings, convolutional neural networks and incremental parsing (2017)"},{"key":"22_CR14","first-page":"13361","volume":"2103","author":"J Kim","year":"2021","unstructured":"Kim, J., Yoon, S., Kim, D., Yoo, C.D.: Structured co-reference graph attention for video-grounded dialogue 2103, 13361 (2021)","journal-title":"Structured co-reference graph attention for video-grounded dialogue"},{"key":"22_CR15","unstructured":"Kirillov, A., et al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"22_CR16","unstructured":"Koh, J.Y., Salakhutdinov, R., Fried, D.: Grounding language models to images for multimodal inputs and outputs. In: ICML (2023)"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"Koo, G., Yoon, S., Yoo, C.D.: Wavelet-guided acceleration of text inversion in diffusion-based image editing. In: ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4380\u20134384. IEEE (2024)","DOI":"10.1109\/ICASSP48485.2024.10446603"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Lee, N., Shin, S., Choo, J., Choi, H.J., Myaeng, S.H.: Constructing multi-modal dialogue dataset by replacing text with semantically relevant images. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pp. 897\u2013906. Association for Computational Linguistics (2021). https:\/\/aclanthology.org\/2021.acl-short.113","DOI":"10.18653\/v1\/2021.acl-short.113"},{"key":"22_CR21","unstructured":"Lee, Y.J., Ko, B., Kim, H.G., Choi, H.J.: DialogCC: large-scale multi-modal dialogue dataset. arXiv preprint arXiv:2212.04119 (2022)"},{"key":"22_CR22","unstructured":"Levy, M., Ben-Ari, R., Darshan, N., Lischinski, D.: Chatting makes perfect: chat-based image retrieval. In: Oh, A., Neumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems, vol.\u00a036, pp. 61437\u201361449. Curran Associates, Inc. (2023). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/c1b3d1e2cf53bb28cabd801bd58b3521-Paper-Conference.pdf"},{"key":"22_CR23","unstructured":"Li, D., Li, J., Hoi, S.C.H.: BLIP-diffusion: pre-trained subject representation for controllable text-to-image generation and editing (2023)"},{"key":"22_CR24","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"22_CR25","doi-asserted-by":"publisher","unstructured":"Li, Y., Hui, B., Yin, Z., Yang, M., Huang, F., Li, Y.: PaCE: unified multi-modal dialogue pre-training with progressive and compositional experts. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 13402\u201313416. Association for Computational Linguistics, Toronto, Canada (2023). https:\/\/doi.org\/10.18653\/v1\/2023.acl-long.749","DOI":"10.18653\/v1\/2023.acl-long.749"},{"key":"22_CR26","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381. Association for Computational Linguistics, Barcelona, Spain (2004). https:\/\/aclanthology.org\/W04-1013"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Lin, H., et al.: Tiktalk: a video-based dialogue dataset for multi-modal chitchat in real world. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1303\u20131313 (2023)","DOI":"10.1145\/3581783.3612425"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Liu, S., et al.: Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"22_CR29","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2017)"},{"key":"22_CR30","doi-asserted-by":"crossref","unstructured":"Ma, J., Liang, J., Chen, C., Lu, H.: Subject-diffusion: open domain personalized text-to-image generation without test-time fine-tuning. arXiv preprint arXiv:2307.11410 (2023)","DOI":"10.1145\/3641519.3657469"},{"key":"22_CR31","unstructured":"Meng, Y., et al.: OpenViDial: a large-scale, open-domain dialogue dataset with visual contexts. arXiv preprint arXiv:2012.15015 (2020)"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Niu, Y., Zhang, H., Zhang, M., Zhang, J., Lu, Z., Wen, J.R.: Recursive visual attention in visual dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6679\u20136688 (2019)","DOI":"10.1109\/CVPR.2019.00684"},{"key":"22_CR33","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision (2023)"},{"key":"22_CR34","doi-asserted-by":"publisher","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting on Association for Computational Linguistics, pp. 311\u2013318. ACL \u201902, Association for Computational Linguistics, USA (2002). https:\/\/doi.org\/10.3115\/1073083.1073135","DOI":"10.3115\/1073083.1073135"},{"key":"22_CR35","doi-asserted-by":"publisher","unstructured":"Poria, S., Hazarika, D., Majumder, N., Naik, G., Cambria, E., Mihalcea, R.: MELD: a multimodal multi-party dataset for emotion recognition in conversations. In: Korhonen, A., Traum, D., M\u00e0rquez, L. (eds.) Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 527\u2013536. Association for Computational Linguistics, Florence, Italy (2019). https:\/\/doi.org\/10.18653\/v1\/P19-1050","DOI":"10.18653\/v1\/P19-1050"},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Qi, J., Niu, Y., Huang, J., Zhang, H.: Two causal principles for improving visual dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10860\u201310869 (2020)","DOI":"10.1109\/CVPR42600.2020.01087"},{"key":"22_CR37","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I.: Language models are unsupervised multitask learners (2019)"},{"key":"22_CR38","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"22_CR39","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"22_CR40","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) Advances in Neural Information Processing Systems (2022). https:\/\/openreview.net\/forum?id=08Yk-n5l2Al"},{"key":"22_CR41","unstructured":"Salimans, T., et al.: Improved techniques for training GANs. In: Lee, D., Sugiyama, M., Luxburg, U., Guyon, I., Garnett, R. (eds.) Advances in Neural Information Processing Systems, vol.\u00a029. Curran Associates, Inc. (2016). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2016\/file\/8a3363abe792db2d8761d6403605aeb7-Paper.pdf"},{"key":"22_CR42","doi-asserted-by":"publisher","unstructured":"Shuster, K., Humeau, S., Bordes, A., Weston, J.: Image-chat: engaging grounded conversations. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 2414\u20132429. Association for Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.219","DOI":"10.18653\/v1\/2020.acl-main.219"},{"key":"22_CR43","doi-asserted-by":"publisher","unstructured":"Sun, Q., et al.: Multimodal dialogue response generation. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2854\u20132866. Association for Computational Linguistics, Dublin, Ireland (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.204","DOI":"10.18653\/v1\/2022.acl-long.204"},{"key":"22_CR44","unstructured":"Wang, S., Meng, Y., Li, X., Sun, X., Ouyang, R., Li, J.: OpenViDial 2.0: a larger-scale, open-domain dialogue generation dataset with visual contexts. arXiv preprint arXiv:2109.12761 (2021)"},{"key":"22_CR45","doi-asserted-by":"crossref","unstructured":"Yoon, S., Kim, D., Yoon, E., Yoon, H.S., Kim, J., Yoo, C.D.: HEAR: hearing enhanced audio response for video-grounded dialogue. arXiv preprint arXiv:2312.09736 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.797"},{"key":"22_CR46","doi-asserted-by":"publisher","unstructured":"Yoon, S., Yoon, E., Yoon, H.S., Kim, J., Yoo, C.: Information-theoretic text hallucination reduction for video-grounded dialogue. In: Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.) Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 4182\u20134193. Association for Computational Linguistics, Abu Dhabi, United Arab Emirates (2022). https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.280","DOI":"10.18653\/v1\/2022.emnlp-main.280"},{"key":"22_CR47","doi-asserted-by":"publisher","unstructured":"Zang, X., Liu, L., Wang, M., Song, Y., Zhang, H., Chen, J.: PhotoChat: a human-human dialogue dataset with photo sharing behavior for joint image-text modeling. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 6142\u20136152. Association for Computational Linguistics (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.479","DOI":"10.18653\/v1\/2021.acl-long.479"},{"key":"22_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: DIALOGPT: large-scale generative pre-training for conversational response generation. In: ACL: System Demonstration (2020)","DOI":"10.18653\/v1\/2020.acl-demos.30"},{"key":"22_CR49","doi-asserted-by":"publisher","unstructured":"Zhao, J., et al.: M3ED: multi-modal multi-scene multi-label emotional dialogue database. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 5699\u20135710. Association for Computational Linguistics, Dublin, Ireland (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.391","DOI":"10.18653\/v1\/2022.acl-long.391"},{"key":"22_CR50","unstructured":"Zheng, K., He, X., Wang, X.E.: MiniGPT-5: interleaved vision-and-language generation via generative Vokens. arXiv preprint arXiv:2310.02239 (2023)"},{"key":"22_CR51","unstructured":"Zheng, Y., Chen, G., Liu, X., Sun, J.: MMChat: multi-modal chat dataset on social media. In: Calzolari, N., et al. (eds.) Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 5778\u20135786. European Language Resources Association, Marseille, France (2022). https:\/\/aclanthology.org\/2022.lrec-1.621"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72751-1_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T07:49:06Z","timestamp":1732952946000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72751-1_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031727504","9783031727511"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72751-1_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}