{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:37:34Z","timestamp":1776883054003,"version":"3.51.2"},"reference-count":70,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T00:00:00Z","timestamp":1764979200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,6]]},"DOI":"10.1109\/asru65441.2025.11434709","type":"proceedings-article","created":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T19:48:04Z","timestamp":1775159284000},"page":"1-8","source":"Crossref","is-referenced-by-count":4,"title":["Post-training for Deepfake Speech Detection"],"prefix":"10.1109","author":[{"given":"Wanying","family":"Ge","sequence":"first","affiliation":[{"name":"National Institute of Informatics,Tokyo,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Wang","sequence":"additional","affiliation":[{"name":"National Institute of Informatics,Tokyo,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuechen","family":"Liu","sequence":"additional","affiliation":[{"name":"National Institute of Informatics,Tokyo,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junichi","family":"Yamagishi","sequence":"additional","affiliation":[{"name":"National Institute of Informatics,Tokyo,Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2022-16"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.acl-long.493"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2025.3626963"},{"key":"ref4","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume-title":"Proc. NeurIPS 2020","volume":"33","author":"Baevski"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3389631"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1087"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2022-14"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/ASVspoof.2024-25"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3233236"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888070"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1283"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-254"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1242"},{"key":"ref16","article-title":"A survey on post-training of large language models","volume-title":"arXiv:2503.06072","author":"Tie"},{"key":"ref17","article-title":"Qwen2.5 technical report","volume-title":"arXiv:2412.15115","author":"Yang"},{"issue":"97","key":"ref18","first-page":"1","article-title":"Scaling speech technology to 1,000+ languages","volume-title":"Journal of Machine Learning Research","volume":"25","author":"Pratap"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-143"},{"key":"ref20","article-title":"Deepfake-Eval-2024: A multi-modal in-the-wild benchmark of deepfakes circulated in 2024","volume-title":"arXiv:2503.02857","author":"Chandra"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446331"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101114"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-8"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/ASVspoof.2024-1"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2024.103122"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3543507.3583222"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832250"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/BIOSIG61931.2024.10786752"},{"key":"ref30","doi-asserted-by":"crossref","DOI":"10.1109\/ICASSP49660.2025.10889450","article-title":"DiffSSD: A diffusion-based dataset for speech forensics","volume-title":"arXiv:2409.13049","author":"Bhagtani"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3658644.3690311"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-2272"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN60899.2024.10650962"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/OJSP.2025.3529377"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-970"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3658644.3670285"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2441"},{"key":"ref38","article-title":"The LJ Speech dataset","author":"Ito"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"ref40","article-title":"WaveFake: A data set to facilitate audio deepfake detection","volume-title":"Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track.","author":"Frank"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023141"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1356"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1584"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TASLPRO.2025.3525966"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-2093"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-755"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2022.01.002"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2826"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2017-950"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094779"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10022496"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/lsp.2025.3547861"},{"key":"ref53","article-title":"Speech Arena: Speech deepfake leaderboard","author":"Arena"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747077"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3664647.3681345"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-659"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICME59968.2025.11210096"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446331"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747766"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414234"},{"issue":"333","key":"ref61","first-page":"1","article-title":"Open-source conversational AI with SpeechBrain 1.0","volume-title":"Journal of Machine Learning Research","volume":"25","author":"Ravanelli"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746213"},{"key":"ref63","article-title":"Decoupled Weight Decay Regularization","volume-title":"Proc. ICLR","author":"Loshchilov"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/SPED.2019.8906599"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-108"},{"key":"ref66","article-title":"Real-time detection of AI-generated speech for deepFake voice conversion","volume-title":"arXiv:2308.12734","author":"Bird"},{"key":"ref67","first-page":"125","article-title":"ADD 2023: The second audio deepfake detection challenge","volume-title":"IJCAI 2023 Workshop on Deepfake Audio Detection and Analysis","author":"Yi"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-329"},{"key":"ref69","article-title":"MUSAN: A music, speech, and noise corpus","volume-title":"arXiv:1510.08484","author":"Snyder"},{"key":"ref70","first-page":"18661","article-title":"Supervised contrastive learning","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Khosla"}],"event":{"name":"2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,12,6]]},"end":{"date-parts":[[2025,12,10]]}},"container-title":["2025 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11434577\/11433836\/11434709.pdf?arnumber=11434709","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T04:58:36Z","timestamp":1775192316000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11434709\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":70,"URL":"https:\/\/doi.org\/10.1109\/asru65441.2025.11434709","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]}}}