{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T14:52:28Z","timestamp":1774536748095,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475568","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T05:04:15Z","timestamp":1634533455000},"page":"4287-4296","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Text is NOT Enough"],"prefix":"10.1145","author":[{"given":"Lei","family":"Shen","sequence":"first","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences & University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Haolan","family":"Zhan","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Xin","family":"Shen","sequence":"additional","affiliation":[{"name":"Australian National University, Canberra, ACT, Australia"}]},{"given":"Yonghao","family":"Song","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]},{"given":"Xiaofang","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Fusion of detected objects in text for visual 
question answering. arXiv preprint arXiv:1908.05054","author":"Alberti Chris","year":"2019"},{"key":"e_1_3_2_2_2_1","volume-title":"Neural machine translation by jointly learning to align and translate. ICLR","author":"Bahdanau Dzmitry","year":"2015"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1540"},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the 12th Language Resources and Evaluation Conference. European Language Resources Association","author":"Chen Meng","year":"2020"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331226"},{"key":"e_1_3_2_2_6_1","unstructured":"Abhishek Das Satwik Kottur Khushi Gupta Avi Singh Deshraj Yadav Stefan Lee Jose Moura Devi Parikh and Dhruv Batra. [n.d.]. Visual Dialog. IEEE transactions on pattern analysis and machine intelligence ([n. d.]).  Abhishek Das Satwik Kottur Khushi Gupta Avi Singh Deshraj Yadav Stefan Lee Jose Moura Devi Parikh and Dhruv Batra. [n.d.]. Visual Dialog. IEEE transactions on pattern analysis and machine intelligence ([n. d.])."},{"key":"e_1_3_2_2_7_1","volume-title":"Devi Parikh, and Dhruv Batra.","author":"Das Abhishek","year":"2017"},{"key":"e_1_3_2_2_8_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT (1).","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_2_9_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT.","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"crossref","unstructured":"Zhe Gan Yu Cheng Ahmed Kholy Linjie Li Jingjing Liu and Jianfeng Gao. 2019. Multi-step Reasoning via Recurrent Dual Attention for Visual Dialog. In ACL.  Zhe Gan Yu Cheng Ahmed Kholy Linjie Li Jingjing Liu and Jianfeng Gao. 2019. Multi-step Reasoning via Recurrent Dual Attention for Visual Dialog. 
In ACL.","DOI":"10.18653\/v1\/P19-1648"},{"key":"e_1_3_2_2_11_1","volume-title":"Dialogwae: Multimodal response generation with conditional wasserstein auto-encoder. ICLR","author":"Gu Xiaodong","year":"2018"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01068"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350943"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413679"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4101"},{"key":"e_1_3_2_2_16_1","unstructured":"Archibald A Hill. 1952. The Structure of English an Introduction to the Construction of English Sentences.  Archibald A Hill. 1952. The Structure of English an Introduction to the Construction of English Sentences."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383123"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173574.3173851"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6769"},{"key":"e_1_3_2_2_20_1","volume-title":"DAM: Deliberation, Abandon and Memory Networks for Generating Detailed and Non-repetitive Responses in Visual Dialogue. In IJCAI.","author":"Jiang Xiaoze","year":"2020"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1209"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1139"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.192"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_10"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_2_26_1","unstructured":"Jiwei Li Michel Galley Chris Brockett Jianfeng Gao and Bill Dolan. 2016. A Diversity-Promoting Objective Function for Neural Conversation Models. In NAACL.  
Jiwei Li Michel Galley Chris Brockett Jianfeng Gao and Bill Dolan. 2016. A Diversity-Promoting Objective Function for Neural Conversation Models. In NAACL."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1094"},{"key":"e_1_3_2_2_28_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian Harold","year":"2019"},{"key":"e_1_3_2_2_29_1","unstructured":"Yanran Li Hui Su Xiaoyu Shen Wenjie Li Ziqiang Cao and Shuzi Niu. 2017. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. In IJCNLP.  Yanran Li Hui Su Xiaoyu Shen Wenjie Li Ziqiang Cao and Shuzi Niu. 2017. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. In IJCNLP."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/3367722.3367752"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Junwei Liang Lu Jiang Liangliang Cao Li-Jia Li and Alexander G Hauptmann. 2018. Focal visual-text attention for visual question answering. In CVPR.  Junwei Liang Lu Jiang Liangliang Cao Li-Jia Li and Alexander G Hauptmann. 2018. Focal visual-text attention for visual question answering. In CVPR.","DOI":"10.1109\/CVPR.2018.00642"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240605"},{"key":"e_1_3_2_2_33_1","volume-title":"Peng Xu, Zihan Liu, and Pascale Fung.","author":"Lin Zhaojiang","year":"2020"},{"key":"e_1_3_2_2_34_1","volume-title":"Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16)","author":"Lison Pierre","year":"2016"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1230"},{"key":"e_1_3_2_2_36_1","volume-title":"Enhancing Multi-turn Dialogue Modeling with Intent Information for E-Commerce Customer Service. In CCF International Conference on Natural Language Processing and Chinese Computing. 
Springer, 65--77","author":"Liu Ruixue","year":"2020"},{"key":"e_1_3_2_2_37_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265","author":"Lu Jiasen","year":"2019"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1075"},{"key":"e_1_3_2_2_39_1","volume-title":"RefNet: A reference-aware network for background based conversation. arXiv preprint arXiv:1908.06449","author":"Meng Chuan","year":"2019"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350923"},{"key":"e_1_3_2_2_41_1","volume-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 1782--1792","author":"Niu Zheng-Yu","year":"2019"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11331"},{"key":"e_1_3_2_2_44_1","volume-title":"Hierarchical neural network generative models for movie dialogues. arXiv preprint arXiv:1507.04808 7, 8","author":"Serban Iulian V","year":"2015"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298047"},{"key":"e_1_3_2_2_46_1","volume-title":"CDL: Curriculum Dual Learning for Emotion- Controllable Response Generation. In ACL.","author":"Shen Lei","year":"2020"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Lei Shen Yang Feng and Haolan Zhan. 2019. Modeling Semantic Relationship in Multi-turn Conversations with Hierarchical Latent Variables. In ACL.  Lei Shen Yang Feng and Haolan Zhan. 2019. Modeling Semantic Relationship in Multi-turn Conversations with Hierarchical Latent Variables. 
In ACL.","DOI":"10.18653\/v1\/P19-1549"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.271"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414730"},{"key":"e_1_3_2_2_50_1","volume-title":"Image- Chat: Engaging Grounded Conversations. In ACL.","author":"Shuster Kurt","year":"2020"},{"key":"e_1_3_2_2_51_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations.","author":"Su Weijie","year":"2019"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_2_53_1","volume-title":"Vokenization: Improving Language Understanding with Contextualized, Visual-Grounded Supervision. In EMNLP.","author":"Tan Hao","year":"2020"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"crossref","unstructured":"Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. In CVPR.  Oriol Vinyals Alexander Toshev Samy Bengio and Dumitru Erhan. 2015. Show and tell: A neural image caption generator. In CVPR.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_2_56_1","volume-title":"Vd-bert: A unified vision and dialog transformer with bert. EMNLP","author":"Wang Yue","year":"2020"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"crossref","unstructured":"Wei Wu Xu Sun and Houfeng Wang. 2018. Question condensing networks for answer selection in community question answering. In ACL.  Wei Wu Xu Sun and Houfeng Wang. 2018. Question condensing networks for answer selection in community question answering. 
In ACL.","DOI":"10.18653\/v1\/P18-1162"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298055"},{"key":"e_1_3_2_2_60_1","volume-title":"Attngan: Fine-grained text to image generation with attentional generative adversarial networks. In CVPR.","author":"Xu Tao","year":"2018"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00265"},{"key":"e_1_3_2_2_62_1","volume-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. TACL 2","author":"Young Peter","year":"2014"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.446"},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"crossref","unstructured":"Saizheng Zhang Emily Dinan Jack Urbanek Arthur Szlam Douwe Kiela and Jason Weston. 2018. Personalizing Dialogue Agents: I have a dog do you have pets too?. In ACL.  Saizheng Zhang Emily Dinan Jack Urbanek Arthur Szlam Douwe Kiela and Jason Weston. 2018. Personalizing Dialogue Agents: I have a dog do you have pets too?. In ACL.","DOI":"10.18653\/v1\/P18-1205"},{"key":"e_1_3_2_2_65_1","unstructured":"Zhuosheng Zhang Kehai Chen Rui Wang Masao Utiyama Eiichiro Sumita Zuchao Li and Hai Zhao. 2020. Neural Machine Translation with Universal Visual Representation. In ICLR.  Zhuosheng Zhang Kehai Chen Rui Wang Masao Utiyama Eiichiro Sumita Zuchao Li and Hai Zhao. 2020. Neural Machine Translation with Universal Visual Representation. In ICLR."},{"key":"e_1_3_2_2_66_1","unstructured":"Tiancheng Zhao Ran Zhao and Maxine Eskenazi. 2017. Learning Discourse-level Diversity for Neural Dialog Models using Conditional Variational Autoencoders. In ACL. 654--664.  Tiancheng Zhao Ran Zhao and Maxine Eskenazi. 2017. Learning Discourse-level Diversity for Neural Dialog Models using Conditional Variational Autoencoders. In ACL. 
654--664."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11325"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.5555\/3304222.3304413"},{"key":"e_1_3_2_2_69_1","volume-title":"MSMO: Multimodal Summarization with Multimodal Output. In EMNLP.","author":"Zhu Junnan","year":"2018"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475568","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475568","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:11Z","timestamp":1750193351000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475568"}},"subtitle":["Integrating Visual Impressions into Open-domain Dialogue Generation"],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":69,"alternative-id":["10.1145\/3474085.3475568","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475568","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}