{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:57:56Z","timestamp":1776887876400,"version":"3.51.2"},"reference-count":30,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1109\/icasspw62465.2024.10669905","type":"proceedings-article","created":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T18:23:22Z","timestamp":1725992602000},"page":"823-827","source":"Crossref","is-referenced-by-count":8,"title":["High-Fidelity Neural Phonetic Posteriorgrams"],"prefix":"10.1109","author":[{"given":"Cameron","family":"Churchwell","sequence":"first","affiliation":[{"name":"Northwestern University,Evanston,IL,USA"}]},{"given":"Max","family":"Morrison","sequence":"additional","affiliation":[{"name":"Northwestern University,Evanston,IL,USA"}]},{"given":"Bryan","family":"Pardo","sequence":"additional","affiliation":[{"name":"Northwestern University,Evanston,IL,USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2009.5372889"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3076867"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096220"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.21437\/VCC_BC.2020-14"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ISCSLP57327.2022.10037585"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1778"},{"key":"ref8","article-title":"Conditional variational autoencoder with adversarial learning for end-to-end text-to-speech","volume-title":"International Conference on Machine Learning","author":"Kim"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/325165.325242"},{"key":"ref10","volume-title":"A course in phonetics","author":"Ladefoged","year":"2014"},{"key":"ref11","article-title":"Neural codec language models are zero-shot text to speech synthesizers","author":"Wang","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746112"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/ICSLP.2000-537"},{"key":"ref14","article-title":"Attention is all you need","volume-title":"Neural Information Processing Systems","author":"Vaswani"},{"key":"ref15","article-title":"Common Voice: A massively-multilingual speech corpus","volume-title":"International Conference on Language Resources and Evaluation","author":"Ardila"},{"key":"ref16","article-title":"The impact of neural network overparameterization on gradient confusion and stochastic gradient descent","volume-title":"International Conference on Machine Learning","author":"Sankararaman"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097075"},{"key":"ref19","article-title":"Cross-domain neural pitch and periodicity estimation","author":"Morrison","year":"2023"},{"key":"ref20","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"Baevski","year":"2020","journal-title":"Neural Information Processing Systems"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"ref22","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2022"},{"key":"ref23","article-title":"The CMU Arctic speech databases","volume-title":"ISCA Workshop on Speech Synthesis","author":"Kominek"},{"key":"ref24","article-title":"DARPA TIMIT acoustic-phonetic continous speech corpus CD-ROM. NIST speech disc 1-1.1","volume-title":"NASA STI\/Recon Technical Report","author":"Garofolo","year":"1993"},{"key":"ref25","article-title":"CSTR VCTK Corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92)","author":"Yamagishi","year":"2019"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"ref27","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International Conference on Machine Learning","author":"Radford"},{"key":"ref28","article-title":"Reproducible subjective evaluation","volume-title":"ICLR Workshop on ML Evaluation Standards","author":"Morrison"},{"key":"ref29","article-title":"Method for the subjective assessment of intermediate sound quality","year":"2001"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.wocn.2022.101137"}],"event":{"name":"2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)","location":"Seoul, Korea, Republic of","start":{"date-parts":[[2024,4,14]]},"end":{"date-parts":[[2024,4,19]]}},"container-title":["2024 IEEE International Conference on Acoustics, Speech, and Signal Processing Workshops (ICASSPW)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10625769\/10625780\/10669905.pdf?arnumber=10669905","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,11]],"date-time":"2024-09-11T17:29:40Z","timestamp":1726075780000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10669905\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":30,"URL":"https:\/\/doi.org\/10.1109\/icasspw62465.2024.10669905","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]}}}