{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:43:13Z","timestamp":1779385393519,"version":"3.53.1"},"reference-count":61,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JSPS KAKENHI","award":["19H01116"],"award-info":[{"award-number":["19H01116"]}]},{"name":"JSPS KAKENHI","award":["18K18100"],"award-info":[{"award-number":["18K18100"]}]},{"name":"JSPS KAKENHI","award":["18J22090"],"award-info":[{"award-number":["18J22090"]}]},{"name":"JSPS KAKENHI","award":["17H06101"],"award-info":[{"award-number":["17H06101"]}]},{"name":"MIC\/SCOPE","award":["#182103104"],"award-info":[{"award-number":["#182103104"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2021.3059114","type":"journal-article","created":{"date-parts":[[2021,2,17]],"date-time":"2021-02-17T01:59:09Z","timestamp":1613527149000},"page":"1033-1048","source":"Crossref","is-referenced-by-count":7,"title":["Perceptual-Similarity-Aware Deep Speaker Representation Learning for Multi-Speaker Generative Modeling"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7967-2613","authenticated-orcid":false,"given":"Yuki","family":"Saito","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0520-7847","authenticated-orcid":false,"given":"Shinnosuke","family":"Takamichi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0876-5617","authenticated-orcid":false,"given":"Hiroshi","family":"Saruwatari","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1789"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1464"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1587\/transinf.2017EDP7165"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7953089"},{"key":"ref31","article-title":"Active learning literature survey","author":"settles","year":"2009"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2018.03.022"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2018.03.002"},{"key":"ref36","first-page":"101","article-title":"Regression approaches to voice quality control based on one-to-many eigenvoice conversion","author":"ohta","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref35","first-page":"2438","article-title":"A technique for controlling voice quality of synthetic speech using multiple regression HSMM","author":"tachibana","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/215"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"ref61","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"dauphin","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref27","first-page":"879","article-title":"A study of speaker adaptation for DNN-based speech synthesis","author":"wu","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-10"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.08.009"},{"key":"ref1","article-title":"Deep representation learning in speech processing: challenges, recent advances, and future trends","author":"latif","year":"2020"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461384"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/SSW.2019-28"},{"key":"ref21","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref24","first-page":"1282","article-title":"Preferential bayesian optimization","author":"gonz\u00e1lez","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682816"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2008.2006647"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3386569.3392444"},{"key":"ref50","first-page":"315","article-title":"Deep sparse rectifier neural networks","author":"glorot","year":"0","journal-title":"Proc Int Conf Artif Intell Statist"},{"key":"ref51","article-title":"Auto-encoding variational bayes","author":"kingma","year":"2013"},{"key":"ref59","first-page":"10019","article-title":"Neural voice cloning with a few samples","author":"arik","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1113"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1016\/0167-6393(94)00051-B"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1148\/radiology.143.1.7063747"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.chemolab.2005.05.004"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1037\/h0031619"},{"key":"ref53","first-page":"2266","article-title":"Maximum likelihood voice conversion based on GMM with STRAIGHT mixed excitation","author":"ohtani","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref52","first-page":"1","article-title":"Aperiodicity extraction and control using mixed mode excitation and group delay manipulation for a high quality speech analysis, modification and synthesis system STRAIGHT","author":"kawahara","year":"0","journal-title":"Proc Second Int Workshop Models Anal Vocal Emissions Biomed Appl"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref11","first-page":"125","article-title":"WaveNet: A generative model for raw audio","author":"oord","year":"0","journal-title":"Proc SSW"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2016.7471631"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-314"},{"key":"ref13","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang rj","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref14","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"ren","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref15","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"0","journal-title":"Proc Int Conf Learn Representations (ICLR)"},{"key":"ref16","article-title":"MelNet: A. generative model for audio in the frequency domain","author":"vasquez","year":"0"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2761547"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1038"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2014.6854363"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1155\/S1110865704310024"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2010.2064307"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461375"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2009.04.004"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1988.196677"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2016.7552917"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/89.661472"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2000.861820"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1250\/ast.20.199"},{"key":"ref48","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"duchi","year":"2011","journal-title":"J Mach Learn Res"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-6393(98)00085-5"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2978386"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2235192"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053844"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.11345"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09354556.pdf?arnumber=9354556","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,12,17]],"date-time":"2021-12-17T20:00:23Z","timestamp":1639771223000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9354556\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":61,"URL":"https:\/\/doi.org\/10.1109\/taslp.2021.3059114","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"value":"2329-9290","type":"print"},{"value":"2329-9304","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}