{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:06:47Z","timestamp":1776888407905,"version":"3.51.2"},"reference-count":65,"publisher":"MIT Press","license":[{"start":{"date-parts":[[2023,3,16]],"date-time":"2023-03-16T00:00:00Z","timestamp":1678924800000},"content-version":"vor","delay-in-days":74,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["direct.mit.edu"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,3,14]]},"abstract":"<jats:title>Abstract<\/jats:title><jats:p>We introduce dGSLM, the first \u201ctextless\u201d model able to generate audio samples of naturalistic spoken dialogues. It uses recent work on unsupervised spoken unit discovery coupled with a dual-tower transformer architecture with cross-attention trained on 2000 hours of two-channel raw conversational audio (Fisher dataset) without any text or labels. We show that our model is able to generate speech, laughter, and other paralinguistic signals in the two channels simultaneously and reproduces more naturalistic and fluid turn taking compared to a text-based cascaded model.1,2<\/jats:p>","DOI":"10.1162\/tacl_a_00545","type":"journal-article","created":{"date-parts":[[2023,3,16]],"date-time":"2023-03-16T14:38:44Z","timestamp":1678977524000},"page":"250-266","update-policy":"https:\/\/doi.org\/10.1162\/mitpressjournals.corrections.policy","source":"Crossref","is-referenced-by-count":39,"title":["Generative Spoken Dialogue Language Modeling"],"prefix":"10.1162","volume":"11","author":[{"given":"Tu Anh","family":"Nguyen","sequence":"first","affiliation":[{"name":"Meta AI Research, France. ntuanh@meta.com"},{"name":"Inria, Paris, France"}]},{"given":"Eugene","family":"Kharitonov","sequence":"additional","affiliation":[{"name":"Meta AI Research, France"}]},{"given":"Jade","family":"Copet","sequence":"additional","affiliation":[{"name":"Meta AI Research, France"}]},{"given":"Yossi","family":"Adi","sequence":"additional","affiliation":[{"name":"Meta AI Research, Isra\u00ebl"}]},{"given":"Wei-Ning","family":"Hsu","sequence":"additional","affiliation":[{"name":"Meta AI Research, United States"}]},{"given":"Ali","family":"Elkahky","sequence":"additional","affiliation":[{"name":"Meta AI Research, United States"}]},{"given":"Paden","family":"Tomasello","sequence":"additional","affiliation":[{"name":"Meta AI Research, United States"}]},{"given":"Robin","family":"Algayres","sequence":"additional","affiliation":[{"name":"Meta AI Research, France"}]},{"given":"Beno\u00eet","family":"Sagot","sequence":"additional","affiliation":[{"name":"Inria, Paris, France"}]},{"given":"Abdelrahman","family":"Mohamed","sequence":"additional","affiliation":[{"name":"Meta AI Research, France. abdo@meta.com"}]},{"given":"Emmanuel","family":"Dupoux","sequence":"additional","affiliation":[{"name":"Meta AI Research, France. dpx@meta.com"},{"name":"EHESS, ENS-PSL, CNRS, Paris, France"}]}],"member":"281","published-online":{"date-parts":[[2023,3,14]]},"reference":[{"key":"2023031614352694600_","article-title":"Towards a human-like open-domain chatbot","author":"Adiwardana","year":"2020","journal-title":"CoRR"},{"key":"2023031614352694600_","article-title":"Prosody-based automatic detection of annoyance and frustration in human-computer dialog","volume-title":"INTERSPEECH","author":"Ang","year":"2002"},{"key":"2023031614352694600_","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"arXiv preprint arXiv:2006.11477"},{"key":"2023031614352694600_","article-title":"A brief overview of unsupervised neural speech representation learning","author":"Borgholt","year":"2022","journal-title":"arXiv preprint arXiv:2203.01829"},{"key":"2023031614352694600_","article-title":"AudioLM: A language modeling approach to audio generation","author":"Borsos","year":"2022"},{"issue":"1","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"73","DOI":"10.1002\/j.1538-7305.1968.tb00031.x","article-title":"A statistical analysis of on-off patterns in 16 conversations","volume":"47","author":"Brady","year":"1968","journal-title":"Bell System Technical Journal"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052974","article-title":"pyannote.audio: Neural building blocks for speaker diarization","volume-title":"ICASSP 2020, IEEE International Conference on Acoustics, Speech, and Signal Processing","author":"Bredin","year":"2020"},{"key":"2023031614352694600_","first-page":"29","volume-title":"Human Conversation as a System Framework: Designing Embodied Conversational Agents","author":"Cassell","year":"2001"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"3497","DOI":"10.1109\/ICASSP40776.2020.9054438","article-title":"Generative pre-training for speech with autoregressive predictive coding","volume-title":"ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Chung","year":"2020"},{"key":"2023031614352694600_","article-title":"The Fisher corpus: A resource for the next generations of speech-to-text","volume-title":"LREC","author":"Cieri","year":"2004"},{"key":"2023031614352694600_","article-title":"Variable-rate discrete representation learning","author":"Dieleman","year":"2021","journal-title":"arXiv preprint arXiv:2103.06089"},{"issue":"2","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"283","DOI":"10.1037\/h0033031","article-title":"Some signals and rules for taking speaking turns in conversations.","volume":"23","author":"Duncan","year":"1972","journal-title":"Journal of Personality and Social Psychology"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.268","article-title":"Turngpt: A transformer-based language model for predicting turn-taking in spoken dialog","author":"Ekstedt","year":"2020","journal-title":"arXiv preprint arXiv:2010.10874"},{"key":"2023031614352694600_","article-title":"Revisiting the boundary between ASR and NLU in the age of conversational dialog systems","author":"Faruqui","year":"2021","journal-title":"CoRR"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"2481","DOI":"10.21437\/Interspeech.2021-353","article-title":"Robust laughter detection in noisy environments","volume-title":"Proceedings of Interspeech 2021","author":"Gillick","year":"2021"},{"issue":"3","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"601","DOI":"10.1016\/j.csl.2010.10.003","article-title":"Turn-taking cues in task-oriented dialogue","volume":"25","author":"Gravano","year":"2011","journal-title":"Computer Speech & Language"},{"issue":"4","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"555","DOI":"10.1016\/j.wocn.2010.08.002","article-title":"Pauses, gaps and overlaps in conversations","volume":"38","author":"Heldner","year":"2010","journal-title":"Journal of Phonetics"},{"key":"2023031614352694600_","article-title":"Hubert: Self-supervised speech representation learning by masked prediction of hidden units","author":"Hsu","year":"2021","journal-title":"CoRR"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"721","DOI":"10.21437\/Interspeech.2021-236","article-title":"Robust wav2vec 2.0: Analyzing domain shift in self-supervised pre-training","volume-title":"Proceedings of Interspeech 2021","author":"Hsu","year":"2021"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"7669","DOI":"10.1109\/ICASSP40776.2020.9052942","article-title":"Libri-light: A benchmark for asr with limited or no supervision","volume-title":"ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Kahn","year":"2020"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.593","article-title":"Text-free prosody-aware generative spoken language modeling","volume-title":"arXiv preprint arXiv:2109.03264","author":"Kharitonov","year":"2021"},{"key":"2023031614352694600_","article-title":"Adam: A method for stochastic optimization","volume-title":"3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings","author":"Kingma","year":"2015"},{"key":"2023031614352694600_","article-title":"Internet-augmented dialogue generation","author":"Komeili","year":"2021","journal-title":"CoRR"},{"key":"2023031614352694600_","article-title":"HiFi-GAN: Generative adversarial networks for efficient and high fidelity speech synthesis","volume-title":"Proceedings of NeurIPS","author":"Kong","year":"2020"},{"key":"2023031614352694600_","article-title":"Textless speech emotion conversion using decomposed and discrete representations","author":"Kreuk","year":"2021","journal-title":"arXiv preprint arXiv:2111.07402"},{"key":"2023031614352694600_","article-title":"Nemo: A toolkit for building ai applications using neural modules","author":"Kuchaiev","year":"2019"},{"key":"2023031614352694600_","article-title":"On Generative Spoken Language Modeling from Raw Audio","author":"Lakhotia","year":"2021","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"731","DOI":"10.3389\/fpsyg.2015.00731","article-title":"Timing in turn-taking and its implications for processing models of language","volume":"6","author":"Levinson","year":"2015","journal-title":"Frontiers in Psychology"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.703","article-title":"BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"Lewis","year":"2019","journal-title":"CoRR"},{"key":"2023031614352694600_","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","author":"Lewis","year":"2020","journal-title":"CoRR"},{"key":"2023031614352694600_","article-title":"A diversity-promoting objective function for neural conversation models","author":"Li","year":"2015","journal-title":"CoRR"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"6419","DOI":"10.1109\/ICASSP40776.2020.9054458","article-title":"Mockingjay: Unsupervised speech representation learning with deep bidirectional transformer encoders","volume-title":"ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Liu","year":"2020"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"224","DOI":"10.18653\/v1\/W18-5024","article-title":"Neural dialogue context online end-of-turn detection","volume-title":"Proceedings of the 19th Annual SIGdial Meeting on Discourse and Dialogue","author":"Masumura","year":"2018"},{"issue":"4","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"903","DOI":"10.1016\/j.csl.2014.02.002","article-title":"Data-driven models for timing feedback responses in a map task dialogue system","volume":"28","author":"Meena","year":"2014","journal-title":"Computer Speech & Language"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","DOI":"10.1111\/cdev.13754","article-title":"A systematic review and Bayesian meta-analysis of the development of turn taking in adult-child vocal interactions","author":"Nguyen","year":"2022"},{"key":"2023031614352694600_","article-title":"Recent advances in deep learning based dialogue systems: A systematic survey","author":"Ni","year":"2021","journal-title":"arXiv preprint arXiv:2105.04387"},{"key":"2023031614352694600_","first-page":"80","article-title":"Variational inference for acoustic unit discovery","volume-title":"SLTU","author":"Ondel","year":"2016"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"48","DOI":"10.18653\/v1\/N19-4009","article-title":"fairseq: A fast, extensible toolkit for sequence modeling","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (Demonstrations)","author":"Ott","year":"2019"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-475","article-title":"Speech resynthesis from discrete disentangled self-supervised representations","volume-title":"Proceedings of INTERSPEECH","author":"Polyak","year":"2021"},{"key":"2023031614352694600_","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"629","DOI":"10.3115\/1620754.1620846","article-title":"A finite-state turn-taking model for spoken dialog systems","volume-title":"Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics","author":"Raux","year":"2009"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"2416","DOI":"10.1109\/ICASSP.2011.5946971","article-title":"Crowdmos: An approach for crowdsourcing mean opinion score studies","volume-title":"2011 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Ribeiro","year":"2011"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","first-page":"156","DOI":"10.1109\/SLT48900.2021.9383461","article-title":"Towards unsupervised learning of speech features in the wild","volume-title":"2021 IEEE Spoken Language Technology Workshop (SLT)","author":"Rivi\u00e8re","year":"2021"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2124","article-title":"Investigating speech features for continuous turn-taking prediction using LSTMs","author":"Roddy","year":"2018","journal-title":"arXiv preprint arXiv:1806.11461"},{"key":"2023031614352694600_","article-title":"Recipes for building an open-domain chatbot","author":"Roller","year":"2020","journal-title":"CoRR"},{"issue":"4","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"696","DOI":"10.1353\/lan.1974.0010","article-title":"A simplest systematics for the organization of turn-taking for conversation","volume":"50","author":"Sacks","year":"1974","journal-title":"Language"},{"key":"2023031614352694600_","first-page":"71","article-title":"Discourse as an interactional achievement: Some uses of \u2019uh huh\u2019and other things that come between sentences","volume":"71","author":"Schegloff","year":"1982","journal-title":"Analyzing Discourse: Text and Talk"},{"issue":"1","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1017\/S0047404500001019","article-title":"Overlapping talk and the organization of turn-taking for conversation","volume":"29","author":"Schegloff","year":"2000","journal-title":"Language in Society"},{"issue":"1","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1016\/j.csl.2012.02.005","article-title":"Paralinguistics in speech and language\u2014state-of-the-art and the challenge","volume":"27","author":"Schuller","year":"2013","journal-title":"Computer Speech & Language"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"1715","DOI":"10.18653\/v1\/P16-1162","article-title":"Neural machine translation of rare words with subword units","volume-title":"Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Sennrich","year":"2016"},{"key":"2023031614352694600_","article-title":"Generative deep neural networks for dialogue: A short review","author":"Serban","year":"2016","journal-title":"CoRR"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2022.findings-emnlp.27","article-title":"Language models that seek for knowledge: Modular search & generation for dialogue and prompt completion","author":"Shuster","year":"2022"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/W17-5527","article-title":"Towards a general, continuous model of turn-taking in spoken dialogue using LSTM recurrent neural networks","volume-title":"SIGdial","author":"Skantze","year":"2017"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"101178","DOI":"10.1016\/j.csl.2020.101178","article-title":"Turn-taking in conversational systems and human-robot interaction: A review","volume":"67","author":"Skantze","year":"2021","journal-title":"Computer Speech & Language"},{"issue":"26","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"10587","DOI":"10.1073\/pnas.0903616106","article-title":"Universals and cultural variation in turn-taking in conversation","volume":"106","author":"Stivers","year":"2009","journal-title":"Proceedings of the National Academy of Sciences"},{"issue":"1-2","key":"2023031614352694600_","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1016\/j.specom.2005.05.009","article-title":"On temporal aspects of turn taking in conversational dialogues","volume":"47","author":"Bosch","year":"2005","journal-title":"Speech Communication"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-017-2367-1_8","article-title":"Natural turn-taking needs no manual: Computational theory and model, from perception to action","author":"Th\u00f3risson","year":"2002"},{"key":"2023031614352694600_","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"CoRR"},{"key":"2023031614352694600_","article-title":"Neural discrete representation learning","author":"van den Oord","year":"2017","journal-title":"CoRR"},{"key":"2023031614352694600_","article-title":"A neural conversational model","author":"Vinyals","year":"2015","journal-title":"CoRR"},{"key":"2023031614352694600_","doi-asserted-by":"crossref","DOI":"10.1017\/9781316848265","volume-title":"Prosodic Patterns in English Conversation","author":"Ward","year":"2019"},{"key":"2023031614352694600_","article-title":"Beyond goldfish memory: Long-term open-domain conversation","author":"Jing","year":"2021","journal-title":"CoRR"},{"key":"2023031614352694600_","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1775","article-title":"Superb: Speech processing universal performance benchmark","author":"Yang","year":"2021","journal-title":"arXiv preprint arXiv:2105.01051"},{"key":"2023031614352694600_","first-page":"567","article-title":"On getting a word in edgewise","volume-title":"Chicago Linguistics Society, 6th Meeting, 1970","author":"Yngve","year":"1970"},{"key":"2023031614352694600_","article-title":"Dialogpt: Large-scale generative pre-training for conversational response generation","author":"Zhang","year":"2019","journal-title":"CoRR"}],"container-title":["Transactions of the Association for Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/direct.mit.edu\/tacl\/article-pdf\/doi\/10.1162\/tacl_a_00545\/2074878\/tacl_a_00545.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/direct.mit.edu\/tacl\/article-pdf\/doi\/10.1162\/tacl_a_00545\/2074878\/tacl_a_00545.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,9]],"date-time":"2023-12-09T02:16:28Z","timestamp":1702088188000},"score":1,"resource":{"primary":{"URL":"https:\/\/direct.mit.edu\/tacl\/article\/doi\/10.1162\/tacl_a_00545\/115240\/Generative-Spoken-Dialogue-Language-Modeling"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":65,"URL":"https:\/\/doi.org\/10.1162\/tacl_a_00545","relation":{},"ISSN":["2307-387X"],"issn-type":[{"value":"2307-387X","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2023]]},"published":{"date-parts":[[2023]]}}}