{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,23]],"date-time":"2025-06-23T14:44:39Z","timestamp":1750689879796,"version":"3.40.3"},"publisher-location":"Cham","reference-count":26,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030923099"},{"type":"electronic","value":"9783030923105"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-92310-5_13","type":"book-chapter","created":{"date-parts":[[2021,12,6]],"date-time":"2021-12-06T14:04:20Z","timestamp":1638799460000},"page":"110-118","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Exploring Effective Speech Representation via\u00a0ASR for\u00a0High-Quality End-to-End Multispeaker TTS"],"prefix":"10.1007","author":[{"given":"Dawei","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4005-5036","authenticated-orcid":false,"given":"Longbiao","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7636-3797","authenticated-orcid":false,"given":"Sheng","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoyu","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenchen","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ju","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9237-4821","authenticated-orcid":false,"given":"Jianwu","family":"Dang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2021,12,2]]},"reference":[{"key":"13_CR1","unstructured":"Arik, S., et al.: Deep voice: real-time neural text-to-speech. In: Proceedings of ICML, pp. 264\u2013273 (2017)"},{"key":"13_CR2","unstructured":"Ren, Y., et al.: Fastspeech: fast, robust and controllable text to speech. In: Advances in Neural Information Processing Systems (2019)"},{"key":"13_CR3","doi-asserted-by":"crossref","unstructured":"Shen, J., et al.: Natural TTS synthesis by conditioning WaveNet on Mel spectrogram predictions. In: Proceedings of ICASSP, pp. 4779\u20134783 (2018)","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"13_CR4","unstructured":"Chen, Y., et al.: Sample efficient adaptive text-to-speech. In: Proceedings of ICLR (2019)"},{"key":"13_CR5","doi-asserted-by":"crossref","unstructured":"Kons, Z., et al.: High quality, lightweight and adaptable TTS using LPCNet. In: Proceedings of INTERSPEECH, pp. 176\u2013180 (2019)","DOI":"10.21437\/Interspeech.2019-1705"},{"key":"13_CR6","unstructured":"Nachmani, E., et al.: Fitting new speakers based on a short untranscribed sample. In: Proceedings of ICML, pp. 5932\u20135940 (2018)"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Cooper, E., et al.: Zero-shot multi-speaker text-to-speech with state-of-the-art neural speaker embeddings. In: Proceedings of ICASSP, pp. 6184\u20136188 (2020)","DOI":"10.1109\/ICASSP40776.2020.9054535"},{"key":"13_CR8","unstructured":"Jia, Y., et al.: Transfer learning from speaker verification to multispeaker text-to-speech synthesis. In: Advances in Neural Information Processing Systems, pp. 4480\u20134490 (2018)"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"Chen, M., et al.: Cross-lingual, multi-speaker text-to-speech synthesis using neural speaker embedding. In: Proceedings of INTERSPEECH, pp. 2105\u20132109 (2019)","DOI":"10.21437\/Interspeech.2019-1632"},{"key":"13_CR10","doi-asserted-by":"crossref","unstructured":"Li, C., et al.: What does a network layer hear? Analyzing hidden representations of end-to-end ASR through speech synthesis. In: Proceedings of ICASSP, pp. 6434\u20136438 (2020)","DOI":"10.1109\/ICASSP40776.2020.9054675"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Li, S., et al.: Improving transformer-based speech recognition systems with compressed structure and speech attributes augmentation. In: Proceedings of INTERSPEECH, pp. 1408\u20131412 (2019)","DOI":"10.21437\/Interspeech.2019-2112"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Hori, T., et al.: Cycle-consistency training for end-to-end speech recognition. In: Proceedings of ICASSP, pp. 6271\u20136275 (2019)","DOI":"10.1109\/ICASSP.2019.8683307"},{"key":"13_CR13","doi-asserted-by":"crossref","unstructured":"Karita, S., et al.: Semi-supervised end-to-end speech recognition using text-to-speech and autoencoders. In: Proceedings of ICASSP, pp. 6166\u20136170 (2019)","DOI":"10.1109\/ICASSP.2019.8682890"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Tjandra, A., et al.: Listening while speaking: speech chain by deep learning. In: Proceedings of ASRU, pp. 301\u2013308 (2017)","DOI":"10.1109\/ASRU.2017.8268950"},{"key":"13_CR15","doi-asserted-by":"crossref","unstructured":"Tjandra, A., et al.: Machine speech chain with one-shot speaker adaptation. In: Proceedings of INTERSPEECH, pp. 887\u2013891 (2018)","DOI":"10.21437\/Interspeech.2018-1558"},{"key":"13_CR16","unstructured":"Vaswani, A., et al.: Attention is all you need. CoRR abs\/1706.03762 (2017)"},{"key":"13_CR17","unstructured":"Kalchbrenner, N., et al.: Efficient neural audio synthesis. In: Proceedings of ICML, pp. 3775\u20133784 (2018)"},{"key":"13_CR18","doi-asserted-by":"crossref","unstructured":"Panayotov, V., et al.: Librispeech: an ASR corpus based on public domain audio books. In: Proceedings of ICASSP, pp. 5206\u20135210 (2015)","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"13_CR19","unstructured":"Yamagishi, J., et al.: CSTR VCTK Corpus: English multi-speaker corpus for CSTR voice cloning toolkit (version 0.92) (2019). https:\/\/doi.org\/10.7488\/ds\/2645"},{"key":"13_CR20","doi-asserted-by":"crossref","unstructured":"Wan, L., et al.: Generalized end-to-end loss for speaker verification. In: Proceedings of ICASSP, pp. 4879\u20134883 (2018)","DOI":"10.1109\/ICASSP.2018.8462665"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Paul, D., et al.: Speaker conditional WaveRNN: towards universal neural vocoder for unseen speaker and recording conditions. In: Proceedings of INTERSPEECH (2020)","DOI":"10.21437\/Interspeech.2020-2786"},{"key":"13_CR22","doi-asserted-by":"crossref","unstructured":"Lorenzo-Trueba, J., et al.: The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods. In: Odyssey 2018 The Speaker and Language Recognition Workshop (2018)","DOI":"10.21437\/Odyssey.2018-28"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Zhou, D., et al.: Dynamic margin softmax loss for speaker verification. In: Proceedings of INTERSPEECH (2020)","DOI":"10.21437\/Interspeech.2020-1106"},{"key":"13_CR24","volume-title":"The Speech Chain","author":"P Denes","year":"1993","unstructured":"Denes, P., Pinson, E.: The Speech Chain, 2nd edn. Worth Publisher, New York (1993)","edition":"2"},{"issue":"5","key":"13_CR25","first-page":"391","volume":"62","author":"M Kashino","year":"2006","unstructured":"Kashino, M.: The motor theory of speech perception: its history, progress and perspective (Japanese). Acoust. Sci. Tech. 62(5), 391\u2013396 (2006)","journal-title":"Acoust. Sci. Tech."},{"key":"13_CR26","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/0010-0277(85)90021-6","volume":"21","author":"A Liberman","year":"1985","unstructured":"Liberman, A., Mattingly, I.: The motor theory of speech perception revised. Cognition 21, 1\u201336 (1985)","journal-title":"Cognition"}],"container-title":["Communications in Computer and Information Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-92310-5_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,21]],"date-time":"2022-06-21T08:08:22Z","timestamp":1655798902000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-92310-5_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030923099","9783030923105"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-92310-5_13","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"2 December 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Sanur, Bali","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Indonesia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iconip2021.apnns.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1093","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"226","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"177","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"21% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.57","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"6","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Due to the COVID-19 pandemic the conference was held online.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}