{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T06:47:19Z","timestamp":1756190839824,"version":"3.37.3"},"reference-count":63,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Ministry of Science and Technology, R.O.C","award":["110-2628-E002-001","110-2223-E-002-007-MY3"],"award-info":[{"award-number":["110-2628-E002-001","110-2223-E-002-007-MY3"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2022]]},"DOI":"10.1109\/taslp.2021.3138720","type":"journal-article","created":{"date-parts":[[2021,12,28]],"date-time":"2021-12-28T21:22:55Z","timestamp":1640726575000},"page":"230-243","source":"Crossref","is-referenced-by-count":7,"title":["Learning Phone Recognition From Unpaired Audio and Phone Sequences Based on Generative Adversarial Network"],"prefix":"10.1109","volume":"30","author":[{"given":"Da-rong","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Po-chun","family":"Hsu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi-chen","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sung-feng","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shun-po","family":"Chuang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Da-yi","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9654-5747","authenticated-orcid":false,"given":"Hung-yi","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"577","article-title":"Attention-based models for speech recognition","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Chorowski","year":"2015"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462105"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2059"},{"key":"ref5","article-title":"Unsupervised neural machine translation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Artetxe","year":"2018"},{"key":"ref6","article-title":"Word translation without parallel data","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Conneau","year":"2018"},{"key":"ref7","article-title":"Unsupervised machine translation using monolingual corpora only","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lample","year":"2018"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.3156\/jsoft.29.5_177_2"},{"key":"ref9","first-page":"214","article-title":"Wasserstein GAN","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Arjovsky","year":"2017"},{"key":"ref10","first-page":"5769","article-title":"Improved training of wasserstein GANs","volume-title":"Proc. 31st Int. Conf. Neural Inf. Process. Syst.","author":"Gulrajani","year":"2017"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-877"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-3011"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2008.4518528"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2398"},{"key":"ref15","first-page":"1","article-title":"Phoneme boundary detection using deep bidirectional LSTMs","volume-title":"Proc. Speech Commun., 12. ITG Symp.","author":"Franke","year":"2016"},{"key":"ref16","first-page":"2817","article-title":"Basic cuts revisited: Temporal segmentation of speech into phone-like units with statistical learning at a pre-linguistic level","volume-title":"Proc. Annu. Meeting Cogn. Sci. Soc.","volume":"36","author":"Rasanen","year":"2014"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-50"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1660044"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2007-429"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.3115\/1609179.1609180"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2013.6707765"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7179089"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2014-273"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178970"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472619"},{"key":"ref26","article-title":"Multi-view recurrent neural acoustic word embeddings","volume-title":"5th Int. Conf. Learn. Representations, Conf. Track Proc.","author":"He","year":"2017"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1592"},{"key":"ref28","article-title":"Word-level acoustic modeling with convolutional vector regression","volume-title":"Proc. ICML Workshop Representation Learn.","author":"Maas","year":"2012"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2341"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-2364"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683639"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2016.7846310"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-82"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2904"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2743"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1785"},{"key":"ref37","article-title":"vq-wav2vec: Self-supervised learning of discrete speech representations","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Baevski","year":"2019"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1518"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2938863"},{"key":"ref40","first-page":"40","article-title":"A nonparametric Bayesian approach to acoustic model discovery","volume-title":"Proc. 50th Annu. Meeting Assoc. Comput. Linguistics","author":"Lee","year":"2012"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/j.procs.2016.04.033"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8269009"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2224"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414899"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2076"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1800"},{"key":"ref47","first-page":"3550","article-title":"Unsupervised sequence classification using sequential output statistics","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Liu","year":"2017"},{"key":"ref48","article-title":"Unsupervised speech recognition via segmental empirical output distribution matching","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yeh","year":"2018"},{"key":"ref49","first-page":"1856","article-title":"Completely unsupervised speech recognition by a generative adversarial network harmonized with iteratively refined hidden Markov models","volume-title":"Proc. Conf. Int. Speech Commun. Assoc.","author":"Chen","year":"2019"},{"key":"ref50","first-page":"7354","article-title":"Unsupervised cross-modal alignment of speech and text embedding spaces","volume":"31","author":"Chung","year":"2018","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683550"},{"key":"ref52","article-title":"Categorical reparameterization with gumbel-softmax","volume-title":"5th Int. Conf. Learn. Representations, Conf. Track Proc.","author":"Jang","year":"2017"},{"key":"ref53","article-title":"The concrete distribution: A continuous relaxation of discrete random variables","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Maddison","year":"2017"},{"volume-title":"Statistical Theory of Extreme Values and Some Practical Applications: A Series of Lectures","year":"1954","author":"Gumbel","key":"ref54"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00067"},{"key":"ref57","article-title":"On the variance of the adaptive learning rate and beyond","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Liu","year":"2019"},{"key":"ref58","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani","year":"2017"},{"key":"ref59","first-page":"1","article-title":"The kaldi speech recognition toolkit","volume-title":"Proc. IEEE Workshop Autom. Speech Recognit. Understanding","author":"Povey","year":"2011"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1064"},{"key":"ref61","first-page":"1851","article-title":"An improved speech segmentation quality measure: The R-value","volume-title":"Proc. 10th Annu. Conf. Int. Speech Commun. Assoc.","author":"Rsnen","year":"2009"},{"article-title":"Unsupervised speech recognition","year":"2021","author":"Baevski","key":"ref62"},{"key":"ref63","first-page":"12449","article-title":"Wav2Vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Proc. Adv. Neural Inf. Process. Syst."}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9657755\/09664381.pdf?arnumber=9664381","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,13]],"date-time":"2024-01-13T21:55:30Z","timestamp":1705182930000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9664381\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"references-count":63,"URL":"https:\/\/doi.org\/10.1109\/taslp.2021.3138720","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"type":"print","value":"2329-9290"},{"type":"electronic","value":"2329-9304"}],"subject":[],"published":{"date-parts":[[2022]]}}}