{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,24]],"date-time":"2026-04-24T10:37:59Z","timestamp":1777027079568,"version":"3.51.4"},"reference-count":79,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/taslp.2023.3293030","type":"journal-article","created":{"date-parts":[[2023,7,6]],"date-time":"2023-07-06T17:24:32Z","timestamp":1688664272000},"page":"765-778","source":"Crossref","is-referenced-by-count":2,"title":["Overview of the Tenth Dialog System Technology Challenge: DSTC10"],"prefix":"10.1109","volume":"32","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6579-361X","authenticated-orcid":false,"given":"Koichiro","family":"Yoshino","sequence":"first","affiliation":[{"name":"Guardian Robot Project, R-IH, RIKEN, 2-2-2 Hikaridai, Seika, Shoraku, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1777-3942","authenticated-orcid":false,"given":"Yun-Nung","family":"Chen","sequence":"additional","affiliation":[{"name":"Computer Science and Information Engineering, National Taiwan University, Taipei, Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0252-0769","authenticated-orcid":false,"given":"Paul","family":"Crook","sequence":"additional","affiliation":[{"name":"Meta Inc., Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satwik","family":"Kottur","sequence":"additional","affiliation":[{"name":"Meta Inc., Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinchao","family":"Li","sequence":"additional","affiliation":[{"name":"Meta Inc., Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Behnam","family":"Hedayatnia","sequence":"additional","affiliation":[{"name":"Alexa AI, Amazon.com Inc., Sunnyvale, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Seungwhan","family":"Moon","sequence":"additional","affiliation":[{"name":"Meta Seattle, Redmond, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengcong","family":"Fei","sequence":"additional","affiliation":[{"name":"Institute of Computing Technology Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3318-5975","authenticated-orcid":false,"given":"Zekang","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Information Processing, Institute of Computing Technology Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jinchao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent AI Lab Beijing, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2579-1366","authenticated-orcid":false,"given":"Yang","family":"Feng","sequence":"additional","affiliation":[{"name":"Kexueyuan South Road Zhongguancun, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5899-5165","authenticated-orcid":false,"given":"Jie","family":"Zhou","sequence":"additional","affiliation":[{"name":"Kexueyuan South Road Zhongguancun, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7443-1212","authenticated-orcid":false,"given":"Seokhwan","family":"Kim","sequence":"additional","affiliation":[{"name":"Alexa AI, Amazon.com Inc., Sunnyvale, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yang","family":"Liu","sequence":"additional","affiliation":[{"name":"1120 Enterprise way, Sunnyvale, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9587-1698","authenticated-orcid":false,"given":"Di","family":"Jin","sequence":"additional","affiliation":[{"name":"Amazon.com Inc., Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexandros","family":"Papangelis","sequence":"additional","affiliation":[{"name":"Amazon.com Inc., Seattle, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Karthik","family":"Gopalakrishnan","sequence":"additional","affiliation":[{"name":"Alexa AI, Amazon.com Inc., Sunnyvale, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dilek","family":"Hakkani-Tur","sequence":"additional","affiliation":[{"name":"Alexa AI, Amazon.com Inc., Sunnyvale, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Babak","family":"Damavandi","sequence":"additional","affiliation":[{"name":"Meta Inc., Palo Alto, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alborz","family":"Geramifard","sequence":"additional","affiliation":[{"name":"Amazon.com Inc., Menlo Park, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4201-7578","authenticated-orcid":false,"given":"Chiori","family":"Hori","sequence":"additional","affiliation":[{"name":"Audio and Speech Group, Mitsubishi Electric Research Laboratories, Cambridge, MA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ankit","family":"Shah","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Department of Language and Information Technologies or just Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Zhang","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jo\u00e3o","family":"Sedoc","sequence":"additional","affiliation":[{"name":"New York University, New York, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3411-7384","authenticated-orcid":false,"given":"Luis F.","family":"D'Haro","sequence":"additional","affiliation":[{"name":"ETSI de Telecomunicaci&#x00F3;n - Speech Technology and Machine Learning Group, Universidad Politecnica de Madrid Ciudad Universitaria, Madrid, Spain"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rafael","family":"Banchs","sequence":"additional","affiliation":[{"name":"Nanyang Technological University, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alexander","family":"Rudnicky","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"404","article-title":"The dialog state tracking challenge","volume-title":"Proc. Meeting Special Int. Group Discourse Dialogue","author":"Williams","year":"2013"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4337"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2014.7078595"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-10-2585-3_36"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846311"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2018.09.004"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101068"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W19-4107"},{"key":"ref9","article-title":"Grounded response generation task at DSTC7","volume-title":"Proc. AAAI Dialog Syst. Technol. Challenges Workshop.","author":"Galley","year":"2019"},{"key":"ref10","article-title":"Audio visual scene-aware dialog (AVSD) challenge at DSTC7","author":"Alamri","year":"2018"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3078368"},{"key":"ref12","article-title":"Overview of the ninth dialog system technology challenge: DSTC9","author":"Gunasekara","year":"2020"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3359170"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2019.102170"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-49788-0_2"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1057\/ejis.2010.15"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00774"},{"key":"ref19","article-title":"Towards expressive communication with internet memes: A new multimodal conversation dataset and benchmark","author":"Fei","year":"2021"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60450-9_8"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/w17-5526"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-1042"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1547"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6394"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2002-374"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2005.07.005"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2012.6424218"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-580"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462030"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2016-1583"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846295"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003825"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.347"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.sigdial-1.35"},{"key":"ref35","first-page":"12449","article-title":"wav2vec 2.0: A. framework for self-supervised learning of speech representations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Baevski","year":"2020"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.sigdial-1.35"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.sigdial-1.4"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3301222"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.96"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.401"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1564"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00774"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682583"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"ref46","article-title":"A better use of audio-visual cues: Dense video captioning with bi-modal transformer","volume-title":"Proc. 31st Brit. Mach. Vis. Virtual Conf.","author":"Iashin","year":"2020"},{"key":"ref47","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2014"},{"key":"ref48","first-page":"577","article-title":"Attention-based models for speech recognition","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Chorowski","year":"2015"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref51","article-title":"DSTC10-AVSD submission system with reasoning using audio-visual transformers with joint student-teacher learning","volume-title":"Proc. AAAI-DSTC10","author":"Shah","year":"2022"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3143"},{"key":"ref53","article-title":"Interpretable multimodal dialogue system with natural language-based multimodal integration","volume-title":"Proc. AAAI-DSTC10","author":"Heo","year":"2022"},{"key":"ref54","article-title":"Audio visual scene-aware dialog generation withtransformer-based video representations","volume-title":"Proc. AAAI-DSTC10","author":"Yamazaki","year":"2022"},{"key":"ref55","article-title":"Investigation on transformer-based multi-modal fusionfor audio-visual scene-aware dialog","volume-title":"Proc. DSTC10 Workshop AAAI-2022","author":"Huang","year":"2022"},{"key":"ref56","first-page":"986","article-title":"DailyDialog: A manually labelled multi-turn dialogue dataset","volume-title":"Proc. Int. Joint Conf. Natural Lang. Process.","author":"Li","year":"2017"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3079"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1205"},{"key":"ref59","article-title":"Automatic evaluation and moderation of open-domain dialogue systems","author":"Zhang","year":"2021"},{"key":"ref60","first-page":"203","article-title":"Movie-DIC: A movie dialogue corpus for research and development","volume-title":"Proc. Assoc. Comput. Linguistics","author":"Banchs","year":"2012"},{"key":"ref61","first-page":"76","article-title":"Chameleons in imagined conversations: A new approach to understanding coordination of linguistic style in dialogs","volume-title":"Proc. 2nd Workshop Cogn. Model. Comput. Linguistics","author":"Danescu-Niculescu-Mizil","year":"2011"},{"key":"ref62","article-title":"Results of the multi-domain task-completion dialog challenge","volume-title":"Proc. AAAI-DSTC8","author":"Li","year":"2020"},{"key":"ref63","article-title":"Colbert: Using bert sentence embedding for humor detection","author":"Annamoradnejad","year":"2020"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1404"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1534"},{"key":"ref66","first-page":"1597","article-title":"Emotionlines: An emotion corpus of multi-party conversations","volume-title":"Proc. 11th Int. Conf. Lang. Resour. Eval.","author":"Chen","year":"2018"},{"key":"ref67","article-title":"Genuine2: An open domain chatbot based on generative models","volume-title":"Proc. Alexa Socialbot Grand Challenge SGC4","author":"Rodrguez-Cantelar","year":"2021"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-15-8395-7_5"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.333"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-demos.30"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref72","first-page":"74","article-title":"ROUGE: A. package for automatic evaluation of summaries","volume-title":"Text Summarization Branches Out.","author":"Lin","year":"2004"},{"key":"ref73","article-title":"Bertscore: Evaluating text generation with bert","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhang","year":"2020"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.704"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eancs-1.3"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.356"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.579"},{"key":"ref78","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown","year":"2020"},{"key":"ref79","article-title":"Report from the NSF future directions workshop on automatic evaluation of dialog: Research directions and challenges","author":"Mehri","year":"2022"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/10304349\/10174647.pdf?arnumber=10174647","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,12]],"date-time":"2024-01-12T00:22:17Z","timestamp":1705018937000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10174647\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":79,"URL":"https:\/\/doi.org\/10.1109\/taslp.2023.3293030","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}