{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,27]],"date-time":"2025-10-27T16:20:59Z","timestamp":1761582059298,"version":"3.37.3"},"reference-count":61,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"JSPS KAKENHI","award":["19H01116","18K18100","18J22090","17H06101"],"award-info":[{"award-number":["19H01116","18K18100","18J22090","17H06101"]}]},{"name":"MIC\/SCOPE","award":["#182103104"],"award-info":[{"award-number":["#182103104"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE\/ACM Trans. Audio Speech Lang. Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/taslp.2021.3059114","type":"journal-article","created":{"date-parts":[[2021,2,17]],"date-time":"2021-02-17T01:59:09Z","timestamp":1613527149000},"page":"1033-1048","source":"Crossref","is-referenced-by-count":6,"title":["Perceptual-Similarity-Aware Deep Speaker Representation Learning for Multi-Speaker Generative Modeling"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7967-2613","authenticated-orcid":false,"given":"Yuki","family":"Saito","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0520-7847","authenticated-orcid":false,"given":"Shinnosuke","family":"Takamichi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0876-5617","authenticated-orcid":false,"given":"Hiroshi","family":"Saruwatari","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.21437\/Interspeech.2020-1789"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.21437\/Interspeech.2020-1464"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1587\/transinf.2017EDP7165"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/ICASSP.2017.7953089"},{"year":"2009","author":"settles","article-title":"Active learning literature survey","key":"ref31"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.1016\/j.knosys.2018.03.022"},{"doi-asserted-by":"publisher","key":"ref37","DOI":"10.1016\/j.specom.2018.03.002"},{"key":"ref36","first-page":"101","article-title":"Regression approaches to voice quality control based on one-to-many eigenvoice conversion","author":"ohta","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"key":"ref35","first-page":"2438","article-title":"A technique for controlling voice quality of synthetic speech using multiple regression HSMM","author":"tachibana","year":"0","journal-title":"Proc Int Conf Spoken Lang Process"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.24963\/ijcai.2018\/215"},{"doi-asserted-by":"publisher","key":"ref60","DOI":"10.1109\/ICASSP.2018.8462628"},{"key":"ref61","first-page":"933","article-title":"Language modeling with gated convolutional networks","author":"dauphin","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"ref27","first-page":"879","article-title":"A study of speaker adaptation for DNN-based speech synthesis","author":"wu","year":"0","journal-title":"Proc INTERSPEECH"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.21437\/SSW.2019-10"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1016\/j.specom.2009.08.009"},{"year":"2020","author":"latif","article-title":"Deep representation learning in speech processing: challenges, recent advances, and future trends","key":"ref1"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/ICASSP.2018.8461384"},{"doi-asserted-by":"publisher","key":"ref22","DOI":"10.21437\/SSW.2019-28"},{"key":"ref21","first-page":"4480","article-title":"Transfer learning from speaker verification to multispeaker text-to-speech synthesis","author":"jia","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref24","first-page":"1282","article-title":"Preferential bayesian optimization","author":"gonz\u00e1lez","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"doi-asserted-by":"publisher","key":"ref23","DOI":"10.1109\/ICASSP.2019.8682816"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1109\/TASL.2008.2006647"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.1145\/3386569.3392444"},{"key":"ref50","first-page":"315","article-title":"Deep sparse rectifier neural networks","author":"glorot","year":"0","journal-title":"Proc Int Conf Artif Intell Statist"},{"year":"2013","author":"kingma","article-title":"Auto-encoding variational bayes","key":"ref51"},{"key":"ref59","first-page":"10019","article-title":"Neural voice cloning with a few samples","author":"arik","year":"0","journal-title":"Proc NeurIPS"},{"doi-asserted-by":"publisher","key":"ref58","DOI":"10.21437\/Interspeech.2018-1113"},{"doi-asserted-by":"publisher","key":"ref57","DOI":"10.1016\/0167-6393(94)00051-B"},{"doi-asserted-by":"publisher","key":"ref56","DOI":"10.1148\/radiology.143.1.7063747"},{"doi-asserted-by":"publisher","key":"ref55","DOI":"10.1016\/j.chemolab.2005.05.004"},{"doi-asserted-by":"publisher","key":"ref54","DOI":"10.1037\/h0031619"},{"key":"ref53","first-page":"2266","article-title":"Maximum likelihood voice conversion based on GMM with STRAIGHT mixed excitation","author":"ohtani","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref52","first-page":"1","article-title":"Aperiodicity extraction and control using mixed mode excitation and group delay manipulation for a high quality speech analysis, modification and synthesis system STRAIGHT","author":"kawahara","year":"0","journal-title":"Proc Second Int Workshop Models Anal Vocal Emissions Biomed Appl"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1109\/TASL.2007.907344"},{"key":"ref11","first-page":"125","article-title":"WaveNet: A generative model for raw audio","author":"oord","year":"0","journal-title":"Proc SSW"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.1109\/ICASSP.2016.7471631"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.21437\/Interspeech.2017-314"},{"key":"ref13","first-page":"4006","article-title":"Tacotron: Towards end-to-end speech synthesis","author":"wang rj","year":"0","journal-title":"Proc INTERSPEECH"},{"key":"ref14","first-page":"3171","article-title":"FastSpeech: Fast, robust and controllable text to speech","author":"ren","year":"0","journal-title":"Proc NeurIPS"},{"key":"ref15","article-title":"FastSpeech 2: Fast and high-quality end-to-end text to speech","author":"ren","year":"0","journal-title":"Proc Int Conf Learn Representations (ICLR)"},{"year":"0","author":"vasquez","article-title":"MelNet: A. generative model for audio in the frequency domain","key":"ref16"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/TASLP.2017.2761547"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.21437\/Interspeech.2017-1038"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/ICASSP.2014.6854363"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1155\/S1110865704310024"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/TASL.2010.2064307"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1109\/ICASSP.2018.8461375"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1016\/j.specom.2009.04.004"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/ICASSP.1988.196677"},{"doi-asserted-by":"publisher","key":"ref49","DOI":"10.1109\/ICME.2016.7552917"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.1109\/89.661472"},{"doi-asserted-by":"publisher","key":"ref46","DOI":"10.1109\/ICASSP.2000.861820"},{"doi-asserted-by":"publisher","key":"ref45","DOI":"10.1250\/ast.20.199"},{"key":"ref48","first-page":"2121","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"duchi","year":"2011","journal-title":"J Mach Learn Res"},{"doi-asserted-by":"publisher","key":"ref47","DOI":"10.1016\/S0167-6393(98)00085-5"},{"doi-asserted-by":"publisher","key":"ref42","DOI":"10.1109\/TNNLS.2020.2978386"},{"doi-asserted-by":"publisher","key":"ref41","DOI":"10.1109\/MSP.2012.2235192"},{"doi-asserted-by":"publisher","key":"ref44","DOI":"10.1109\/ICASSP40776.2020.9053844"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.1613\/jair.1.11345"}],"container-title":["IEEE\/ACM Transactions on Audio, Speech, and Language Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6570655\/9289074\/09354556.pdf?arnumber=9354556","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,12,17]],"date-time":"2021-12-17T20:00:23Z","timestamp":1639771223000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9354556\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":61,"URL":"https:\/\/doi.org\/10.1109\/taslp.2021.3059114","relation":{},"ISSN":["2329-9290","2329-9304"],"issn-type":[{"type":"print","value":"2329-9290"},{"type":"electronic","value":"2329-9304"}],"subject":[],"published":{"date-parts":[[2021]]}}}