{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:27:22Z","timestamp":1775230042389,"version":"3.50.1"},"reference-count":65,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JSPS KAKENHI","award":["JP17H06101"],"award-info":[{"award-number":["JP17H06101"]}]},{"name":"JSPS KAKENHI","award":["JP17K00237"],"award-info":[{"award-number":["JP17K00237"]}]},{"name":"NII CRIS"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/taslp.2020.2977776","type":"journal-article","created":{"date-parts":[[2020,3,2]],"date-time":"2020-03-02T21:03:14Z","timestamp":1583182994000},"page":"976-989","source":"Crossref","is-referenced-by-count":23,"title":["Machine Speech Chain"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1246-5908","authenticated-orcid":false,"given":"Andros","family":"Tjandra","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5509-8963","authenticated-orcid":false,"given":"Sakriani","family":"Sakti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6956-3803","authenticated-orcid":false,"given":"Satoshi","family":"Nakamura","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"ref38","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"chung","year":"0","journal-title":"Proc Neural Inf Process Syst 2014 Workshop on Deep Learn"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1710"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003990"},{"key":"ref31","first-page":"5410","article-title":"Almost unsupervised text to speech and automatic speech recognition","author":"ren","year":"2019","journal-title":"Proceedings 36th Int Conf Mach Learn"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1746"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683307"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-3167"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682816"},{"key":"ref60","article-title":"Learning from labeled and unlabeled data with label propagation","author":"zhu","year":"2002"},{"key":"ref62","article-title":"Estimating or propagating gradients through stochastic neurons for conditional computation","author":"bengio","year":"2013","journal-title":"arXiv 1308 3432"},{"key":"ref61","article-title":"Distilling the knowledge in a neural network","author":"hinton","year":"0","journal-title":"Proc Neural Inf Process Syst Deep Learn Representation Learn Workshop"},{"key":"ref63","article-title":"Neural networks for machine learning, Coursera video lectures","author":"hinton","year":"2012"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1185"},{"key":"ref64","article-title":"Categorical reparameterization with gumbel-softmax","author":"jang","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref27","first-page":"820","article-title":"Dual learning for machine translation","author":"he","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref65","article-title":"The concrete distribution: A continuous relaxation of discrete random variables","author":"maddison","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1009"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1121\/1.1906946"},{"key":"ref1","author":"denes","year":"1993","journal-title":"The Speech Chain"},{"key":"ref20","author":"council","year":"2004","journal-title":"Hearing Loss Determining Eligibility for Social Security Benefits"},{"key":"ref22","first-page":"3104","article-title":"Sequence-to-Sequence learning with neural networks","author":"sutskever","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref21","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc 3rd Int Conf Learn Representations"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"ref23","article-title":"End-to-end continuous speech recognition using attention-based recurrent NN: First results","author":"chorowski","year":"0","journal-title":"Proc Neural Inf Process Syst 2014 Workshop Deep Learn"},{"key":"ref26","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc 32nd Int Conf Mach Learn"},{"key":"ref25","first-page":"4006","article-title":"Tacotron: Towards End-to-End Speech Synthesis","author":"wang","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1158"},{"key":"ref51","first-page":"207","article-title":"Distance metric learning for large margin nearest neighbor classification","volume":"10","author":"weinberger","year":"2009","journal-title":"J Mach Learn Res"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU.2017.8268951"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953075"},{"key":"ref57","first-page":"8024","article-title":"PyTorch: An imperative style, high-performance deep learning library","author":"paszke","year":"2019","journal-title":"Advances in Neural IInformation Processing Systems"},{"key":"ref56","article-title":"The Kaldi speech recognition toolkit","author":"povey","year":"0","journal-title":"Proc IEEE Workshop Autom Speech Recognit Understanding"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.3115\/1075527.1075614"},{"key":"ref54","first-page":"694","article-title":"Perceptual losses for real-time style transfer and super-resolution","author":"johnson","year":"0","journal-title":"Proc Eur Conf Comput Vision"},{"key":"ref53","first-page":"1558","article-title":"Autoencoding beyond pixels using a learned similarity metric","author":"larsen","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref10","first-page":"483","article-title":"Atr $\\upsilon$-talk speech","author":"sagisaka","year":"0","journal-title":"Proc 4th Int Conf Spoken Lang Process"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1996.541110"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/SLT.2018.8639528"},{"key":"ref12","first-page":"2347","article-title":"Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis","author":"yoshimura","year":"0","journal-title":"Proc EUROSPEECH"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1995.479684"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178781"},{"key":"ref16","first-page":"1","article-title":"Learning the speech front-end with raw waveform cldnns","author":"sainath","year":"2015","journal-title":"InterSpeech"},{"key":"ref17","first-page":"7962","article-title":"Statistical parametric speech synthesis using deep neural networks","author":"zen","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref18","article-title":"Wavenet: A generative model for raw audio","author":"oord","year":"2016","journal-title":"arXiv 1609 03499"},{"key":"ref19","first-page":"195","article-title":"Deep voice: Real-time neural text-to-speech","author":"arik","year":"0","journal-title":"Proc 34th Int Conf Mach Learn"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1978.1163055"},{"key":"ref3","first-page":"81","article-title":"Speech discrimination by dynamic programming","volume":"4","author":"vintsyuk","year":"1968","journal-title":"Kibernetika"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/29.103088"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/PROC.1976.10159"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1977.1170350"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1177\/002383096400700301"},{"key":"ref49","article-title":"Deep speaker: An end-to-end neural speaker embedding system","author":"li","year":"2017","journal-title":"arXiv 1705 02304"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1988.196677"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7472618"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref48","first-page":"4828","article-title":"Full-covariance UBM and heavy-tailed PLDA in i-vector speaker verification","author":"mat?jka","year":"0","journal-title":"Proc IEEE Int Conf Acoust Speech Signal Process"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.21236\/ADA613971"},{"key":"ref42","article-title":"The LJ speech dataset","author":"ito","year":"2017"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref44","article-title":"Empirical evaluation of rectified activations in convolutional network","author":"xu","year":"2015","journal-title":"arXiv 1505 00853"},{"key":"ref43","article-title":"librosa 0.5.0","author":"mcfee","year":"2017"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/8938144\/09020132.pdf?arnumber=9020132","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,12]],"date-time":"2022-01-12T01:07:34Z","timestamp":1641949654000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9020132\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":65,"URL":"https:\/\/doi.org\/10.1109\/taslp.2020.2977776","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}