{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,18]],"date-time":"2026-06-18T16:01:50Z","timestamp":1781798510247,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T00:00:00Z","timestamp":1602460800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["61703386"],"award-info":[{"award-number":["61703386"]}]},{"name":"National Natural Science Foundation of China","award":["U1605251"],"award-info":[{"award-number":["U1605251"]}]},{"name":"National Natural Science Foundation of China","award":["61727809"],"award-info":[{"award-number":["61727809"]}]},{"name":"National Key Research and Development Program of China","award":["2018YFB1402600"],"award-info":[{"award-number":["2018YFB1402600"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,10,12]]},"DOI":"10.1145\/3394171.3413679","type":"proceedings-article","created":{"date-parts":[[2020,10,12]],"date-time":"2020-10-12T12:27:38Z","timestamp":1602505658000},"page":"2755-2764","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":25,"title":["Multimodal Dialogue Systems via Capturing Context-aware Dependencies of Semantic Elements"],"prefix":"10.1145","author":[{"given":"Weidong","family":"He","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhi","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dongcai","family":"Lu","sequence":"additional","affiliation":[{"name":"HUAWEI Technologies, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Enhong","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tong","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Baoxing","family":"Huai","sequence":"additional","affiliation":[{"name":"HUAWEI Technologies, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jing","family":"Yuan","sequence":"additional","affiliation":[{"name":"HUAWEI Technologies, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2020,10,12]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_2_2_2_1","unstructured":"Jimmy Lei Ba Jamie Ryan Kiros and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016).  Jimmy Lei Ba Jamie Ryan Kiros and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_2_3_1","volume-title":"Adaptive Input Representations for Neural Language Modeling. In International Conference on Learning Representations.","author":"Baevski Alexei","year":"2019"},{"key":"e_1_3_2_2_4_1","volume-title":"Learning End-to-End Goal-Oriented Dialog. In International Conference on Learning Representations.","author":"Bordes Antoine","year":"2017"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1540"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1360\/SSI-2019-0292"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331226"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1045"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.3115\/1289189.1289273"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"crossref","unstructured":"Joseph L Fleiss. 1971. Measuring nominal scale agreement among many raters. Psychological bulletin Vol. 76 5 (1971) 378.  Joseph L Fleiss. 1971. Measuring nominal scale agreement among many raters. Psychological bulletin Vol. 76 5 (1971) 378.","DOI":"10.1037\/h0031619"},{"key":"e_1_3_2_2_12_1","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics. 249--256","author":"Glorot Xavier","year":"2010"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/650"},{"key":"e_1_3_2_2_15_1","volume-title":"International Conference on Learning Representations.","author":"Kingma Diederik P","year":"2015"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1133"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1014"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1127"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1002"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/489"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240646"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240605"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1230"},{"key":"e_1_3_2_2_24_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Advances in Neural Information Processing Systems. 13--23.","author":"Lu Jiasen","year":"2019"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"e_1_3_2_2_26_1","volume-title":"Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Association for Computational Linguistics, 1777--1788","author":"Mrkv\u0161i\u0107 Nikola","year":"2017"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350923"},{"key":"e_1_3_2_2_28_1","volume-title":"Proceedings of the 40th annual meeting on association for computational linguistics. Association for Computational Linguistics, 311--318","author":"Papineni Kishore","year":"2002"},{"key":"e_1_3_2_2_29_1","volume-title":"Thirty-Second AAAI Conference on Artificial Intelligence.","author":"Saha Amrita","year":"2018"},{"key":"e_1_3_2_2_30_1","unstructured":"Iulian V Serban Alessandro Sordoni Yoshua Bengio Aaron Courville and Joelle Pineau. 2015. Hierarchical neural network generative models for movie dialogues. arXiv preprint arXiv:1507.04808 Vol. 7 8 (2015).  Iulian V Serban Alessandro Sordoni Yoshua Bengio Aaron Courville and Joelle Pineau. 2015. Hierarchical neural network generative models for movie dialogues. arXiv preprint arXiv:1507.04808 Vol. 7 8 (2015)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/3016387.3016435"},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)","author":"Shang Lifeng"},{"key":"e_1_3_2_2_33_1","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014).  Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_2_34_1","unstructured":"Nitish Srivastava Geoffrey Hinton Alex Krizhevsky Ilya Sutskever and Ruslan Salakhutdinov. 2014. Dropout: a simple way to prevent neural networks from overfitting. The journal of machine learning research Vol. 15 1 (2014) 1929--1958.  Nitish Srivastava Geoffrey Hinton Alex Krizhevsky Ilya Sutskever and Ruslan Salakhutdinov. 2014. Dropout: a simple way to prevent neural networks from overfitting. The journal of machine learning research Vol. 15 1 (2014) 1929--1958."},{"key":"e_1_3_2_2_35_1","volume-title":"International Conference on Learning Representations.","author":"Su Weijie","year":"2020"},{"key":"e_1_3_2_2_36_1","unstructured":"Ilya Sutskever Oriol Vinyals and Quoc V Le. 2014. Sequence to sequence learning with neural networks. In Advances in neural information processing systems. 3104--3112.  Ilya Sutskever Oriol Vinyals and Quoc V Le. 2014. Sequence to sequence learning with neural networks. In Advances in neural information processing systems. 3104--3112."},{"key":"e_1_3_2_2_37_1","volume-title":"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics. Association for Computational Linguistics, 6558--6569","author":"Hubert Tsai Yao-Hung","year":"2019"},{"key":"e_1_3_2_2_38_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008.  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008."},{"key":"e_1_3_2_2_39_1","unstructured":"Oriol Vinyals and Quoc Le. 2015. A neural conversational model. arXiv preprint arXiv:1506.05869 (2015).  Oriol Vinyals and Quoc Le. 2015. A neural conversational model. arXiv preprint arXiv:1506.05869 (2015)."},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-1042"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Ronald J Williams and David Zipser. 1989. A learning algorithm for continually running fully recurrent neural networks. Neural computation Vol. 1 2 (1989) 270--280.  Ronald J Williams and David Zipser. 1989. A learning algorithm for continually running fully recurrent neural networks. Neural computation Vol. 1 2 (1989) 270--280.","DOI":"10.1162\/neco.1989.1.2.270"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1046"},{"key":"e_1_3_2_2_43_1","unstructured":"Yingce Xia Fei Tian Lijun Wu Jianxin Lin Tao Qin Nenghai Yu and Tie-Yan Liu. 2017. Deliberation networks: Sequence generation beyond one-pass decoding. In Advances in Neural Information Processing Systems. 1784--1794.  Yingce Xia Fei Tian Lijun Wu Jianxin Lin Tao Qin Nenghai Yu and Tie-Yan Liu. 2017. Deliberation networks: Sequence generation beyond one-pass decoding. In Advances in Neural Information Processing Systems. 1784--1794."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33017338"},{"key":"e_1_3_2_2_45_1","volume-title":"International conference on machine learning. 2048--2057","author":"Xu Kelvin","year":"2015"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911451.2911542"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298238"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3209978.3210011"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2012.2225812"},{"key":"e_1_3_2_2_50_1","unstructured":"Peilun Zhou Tong Xu Zhizhuo Yin Dong Liu Enhong Chen Guangyi Lv and Changliang Li. 2019. Character-oriented Video Summarization with Visual and Textual Cues. IEEE Transactions on Multimedia (2019).  Peilun Zhou Tong Xu Zhizhuo Yin Dong Liu Enhong Chen Guangyi Lv and Changliang Li. 2019. Character-oriented Video Summarization with Visual and Textual Cues. IEEE Transactions on Multimedia (2019)."}],"event":{"name":"MM '20: The 28th ACM International Conference on Multimedia","location":"Seattle WA USA","acronym":"MM '20","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 28th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413679","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3394171.3413679","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:47:16Z","timestamp":1750193236000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3394171.3413679"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,12]]},"references-count":50,"alternative-id":["10.1145\/3394171.3413679","10.1145\/3394171"],"URL":"https:\/\/doi.org\/10.1145\/3394171.3413679","relation":{},"subject":[],"published":{"date-parts":[[2020,10,12]]},"assertion":[{"value":"2020-10-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}