{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T06:13:36Z","timestamp":1774419216294,"version":"3.50.1"},"reference-count":44,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10888480","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T13:52:43Z","timestamp":1741787563000},"page":"1-5","source":"Crossref","is-referenced-by-count":2,"title":["DistillW2N: A Lightweight One-Shot Whisper to Normal Voice Conversion Model Using Distillation of Self-Supervised Features"],"prefix":"10.1109","author":[{"given":"Tianyi","family":"Tan","sequence":"first","affiliation":[{"name":"Key Laboratory of Modern Acoustic, Nanjing University,Nanjing,China,210093"}]},{"given":"Haoxin","family":"Ruan","sequence":"additional","affiliation":[{"name":"Key Laboratory of Modern Acoustic, Nanjing University,Nanjing,China,210093"}]},{"given":"Xinan","family":"Chen","sequence":"additional","affiliation":[{"name":"Key Laboratory of Modern Acoustic, Nanjing University,Nanjing,China,210093"}]},{"given":"Kai","family":"Chen","sequence":"additional","affiliation":[{"name":"Key Laboratory of Modern Acoustic, Nanjing University,Nanjing,China,210093"}]},{"given":"Zhibin","family":"Lin","sequence":"additional","affiliation":[{"name":"Key Laboratory of Modern Acoustic, Nanjing University,Nanjing,China,210093"}]},{"given":"Jing","family":"Lu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Modern Acoustic, Nanjing University,Nanjing,China,210093"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-250"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389801"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2003.10.005"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178927"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682897"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2236"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/s12559-023-10108-9"},{"key":"ref8","article-title":"CinC-GAN for Effective F0 prediction for Whisper-to-Normal Speech Conversion","author":"Patel","journal-title":"EUSIPCO 2020."},{"key":"ref9","article-title":"MaskCycleGAN-based Whisper to Normal Speech Conversion","author":"Gupta","journal-title":"arXiv:2408.14797; 2024\/08\/27."},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2019.2917232"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052966"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3526113.3545685"},{"key":"ref13","author":"Baevski","journal-title":"wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3580706"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746484"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447197"},{"key":"ref18","article-title":"Distilling the knowledge in a neural network","author":"Hinton","journal-title":"arXiv:1503.02531; 2015\/05\/09."},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11112"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-51"},{"key":"ref22","article-title":"Attention is all you need","author":"Vaswani","journal-title":"NeurIPS 2017."},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446456"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref25","article-title":"Meta-stylespeech: Multi-speaker adaptive text-to-speech generation","author":"Min","journal-title":"ICML 2021."},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3019084"},{"key":"ref27","article-title":"Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier","author":"Team"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746806"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446863"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"ref31","article-title":"The LJ Speech Dataset","author":"Ito","year":"2017"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.35111\/17gk-bn40"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095191"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389621"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414878"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-439"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097255"},{"key":"ref39","article-title":"SpeechBrain: A General-Purpose Speech Toolkit","author":"Ravanelli","journal-title":"arXiv:2106.04624; 2021\/06\/08"},{"key":"ref40","article-title":"ptflops: a flops counting tool for neural networks in pytorch framework","author":"Sovrasov"},{"key":"ref41","article-title":"HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis","author":"Kong","journal-title":"NeurIPS 2020"},{"key":"ref42","article-title":"FastSpeech 2: Fast and High-Quality End-to-End Text to Speech","author":"Ren","journal-title":"ICLR 2021."},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref44","article-title":"Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech","author":"Kim","journal-title":"ICML 2021."}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10888480.pdf?arnumber=10888480","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:21:37Z","timestamp":1774416097000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10888480\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":44,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10888480","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}