{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T14:30:12Z","timestamp":1774449012413,"version":"3.50.1"},"publisher-location":"Cham","reference-count":23,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031223204","type":"print"},{"value":"9783031223211","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-22321-1_16","type":"book-chapter","created":{"date-parts":[[2022,11,30]],"date-time":"2022-11-30T09:06:15Z","timestamp":1669799175000},"page":"231-245","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["TransFusion: Transcribing Speech with\u00a0Multinomial Diffusion"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3001-6292","authenticated-orcid":false,"given":"Matthew","family":"Baas","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1355-8743","authenticated-orcid":false,"given":"Kevin","family":"Eloff","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2980-3475","authenticated-orcid":false,"given":"Herman","family":"Kamper","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,11,28]]},"reference":[{"key":"16_CR1","unstructured":"Ba, J.L., Kiros, J.R., Hinton, G.E.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"16_CR2","unstructured":"Baevski, A., Zhou, Y., Mohamed, A., Auli, M.: wav2vec 2.0: a framework for self-supervised learning of speech representations. In: NeurIPS (2020)"},{"key":"16_CR3","doi-asserted-by":"publisher","first-page":"1505","DOI":"10.1109\/JSTSP.2022.3188113","volume":"16","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: WavLM: large-scale self-supervised pre-training for full stack speech processing. IEEE J. Sel. Topics Signal Process. 16, 1505\u20131518 (2022)","journal-title":"IEEE J. Sel. Topics Signal Process."},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Chung, Y.A., Zhang, Y., Han, W., Chiu, C.C., Qin, J., Pang, R., Wu, Y.: W2v-BERT: combining contrastive learning and masked language modeling for self-supervised speech pre-training. In: ASRU (2021)","DOI":"10.1109\/ASRU51503.2021.9688253"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Graves, A., Fern\u00e1ndez, S., Gomez, F., Schmidhuber, J.: Connectionist temporal classification: labelling unsegmented sequence data with recurrent neural networks. In: ICML (2006)","DOI":"10.1145\/1143844.1143891"},{"key":"16_CR6","doi-asserted-by":"publisher","unstructured":"Hannun, A.: Sequence modeling with CTC. Distill (2017). https:\/\/doi.org\/10.23915\/distill.00008. https:\/\/distill.pub\/2017\/ctc","DOI":"10.23915\/distill.00008"},{"key":"16_CR7","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: NeurIPS Workshop on Deep Generative Models and Downstream Applications (2021)"},{"key":"16_CR8","unstructured":"Hoogeboom, E., Nielsen, D., Jaini, P., Forr\u00e9, P., Welling, M.: Argmax flows and multinomial diffusion: Learning categorical distributions. In: NeurIPS (2021)"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Hsu, W.N., Bolte, B., Tsai, Y.H.H., Lakhotia, K., Salakhutdinov, R., et al.: HuBERT: self-supervised speech representation learning by masked prediction of hidden units. arXiv preprint arXiv:2106.07447 (2021)","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Kahn, J., Rivi\u00e8re, M., Zheng, W., Kharitonov, E., Xu, Q., et al.: Libri-light: a benchmark for ASR with limited or no supervision. In: ICASSP (2020)","DOI":"10.1109\/ICASSP40776.2020.9052942"},{"key":"16_CR11","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"key":"16_CR12","unstructured":"Kong, Z., Ping, W., Huang, J., Zhao, K., Catanzaro, B.: DiffWave: a versatile diffusion model for audio synthesis. arXiv preprint arXiv:2009.09761 (2020)"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Lugmayr, A., Danelljan, M., Romero, A., Yu, F., Timofte, R., Gool, L.V.: RePaint: inpainting using denoising diffusion probabilistic models. CoRR (2022)","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"16_CR14","unstructured":"Movellan, J.R., Mineiro, P.: A diffusion network approach to visual speech recognition. In: AVSP (1999)"},{"key":"16_CR15","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: PMLR (2021)"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: Librispeech: an ASR corpus based on public domain audio books. In: ICASSP (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Park, D.S., et al.: Improved noisy student training for automatic speech recognition. In: Interspeech (2020)","DOI":"10.21437\/Interspeech.2020-1470"},{"key":"16_CR18","unstructured":"Ramachandran, P., Zoph, B., Le, Q.V.: Searching for activation functions. arXiv preprint arXiv:1710.05941 (2017)"},{"key":"16_CR19","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"16_CR20","doi-asserted-by":"crossref","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., et al.: Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487 (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"16_CR21","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: ICML (2015)"},{"key":"16_CR22","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"16_CR23","unstructured":"Zhang, Y., et al.: Pushing the limits of semi-supervised learning for automatic speech recognition. arXiv preprint arXiv:2010.10504 (2020)"}],"container-title":["Communications in Computer and Information Science","Artificial Intelligence Research"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-22321-1_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,30]],"date-time":"2022-11-30T09:10:01Z","timestamp":1669799401000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-22321-1_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031223204","9783031223211"],"references-count":23,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-22321-1_16","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"28 November 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"SACAIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Southern African Conference for Artificial Intelligence Research","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Stellenbosch","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"South Africa","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 December 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"3","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"sacair2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/sacair.org.za\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"73","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"26","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"36% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}