{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,17]],"date-time":"2026-06-17T16:21:31Z","timestamp":1781713291392,"version":"3.54.5"},"reference-count":39,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"4","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2025,4,1]]},"DOI":"10.1587\/transinf.2024mui0001","type":"journal-article","created":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T22:12:08Z","timestamp":1728252728000},"page":"300-310","source":"Crossref","is-referenced-by-count":12,"title":["Deepfake Speech Detection: Approaches from Acoustic Features to Deep Neural Networks"],"prefix":"10.1587","volume":"E108.D","author":[{"given":"Masashi","family":"UNOKI","sequence":"first","affiliation":[{"name":"School of Information Science, JAIST"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kai","family":"LI","sequence":"additional","affiliation":[{"name":"School of Information Science, JAIST"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Anuwat","family":"CHAIWONGYEN","sequence":"additional","affiliation":[{"name":"School of Information Science, JAIST"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Quoc-Huy","family":"NGUYEN","sequence":"additional","affiliation":[{"name":"School of Information Science, JAIST"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Khalid","family":"ZAMAN","sequence":"additional","affiliation":[{"name":"School of Information Science, JAIST"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"publisher","unstructured":"[1] M. Westerlund, \u201cThe emergence of deepfake technology: A review,\u201d Technology Innovation Management Review, vol.9, no.11, pp.39-52, 2019. 10.22215\/timreview\/1282","DOI":"10.22215\/timreview\/1282"},{"key":"2","doi-asserted-by":"crossref","unstructured":"[2] J. Yi, R. Fu, J. Tao, S. Nie, H. Ma, C. Wang, T. Wang, Z. Tian, Y. Bai, C. Fan, S. Liang, S. Wang, S. Zhang, X. Yan, L. Xu, Z. Wen, and H. Li, \u201cADD 2022: the first audio deep synthesis detection challenge,\u201d Proc. ICASSP2022, pp.9216-9220, 2022. 10.1109\/icassp43922.2022.9746939","DOI":"10.1109\/ICASSP43922.2022.9746939"},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] J. Yamagishi, X. Wang, M. Todisco, M. Sahidullah, J. Patino, A. Nautsch, X. Liu, K.A. Lee, T. Kinnunen, N. Evans, and H. Delgado, \u201cASVspoof 2021: accelerating progress in spoofed and deepfake speech detection,\u201d Proc. ASVspoof 2021 WorkshopAutomatic Speaker Verification and Spoofing Coutermeasures Challenge, 2021. 10.21437\/asvspoof.2021-8","DOI":"10.21437\/ASVSPOOF.2021-8"},{"key":"4","unstructured":"[4] J. Yi, J. Tao, R. Fu, X. Yan, C. Wang, T. Wang, C.Y. Zhang, X. Zhang, Y. Zhao, Y. Ren, et al., \u201cADD 2023: the second audio deepfake detection challenge,\u201d ArXiv, vol.abs\/2305.13774, 2023."},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] T. Chen, A. Kumar, P. Nagarsheth, G. Sivaraman, and E. Khoury, \u201cGeneralization of Audio Deepfake Detection,\u201d Proc. Odyssey, pp.132-137, 2020. 10.21437\/odyssey.2020-19","DOI":"10.21437\/Odyssey.2020-19"},{"key":"6","doi-asserted-by":"crossref","unstructured":"[6] R. Wang, F. Juefei-Xu, Y. Huang, Q. Guo, X. Xie, L. Ma, and Y. Liu, \u201cDeepsonar: Towards effective and robust detection of ai-synthesized fake voices,\u201d Proc. the 28th ACM international conference on multimedia, pp.1207-1216, 2020. 10.1145\/3394171.3413716","DOI":"10.1145\/3394171.3413716"},{"key":"7","unstructured":"[7] J. Yi, et al., \u201cAudio Deepfake Detection: A Survey,\u201d ArXiv Preprint, ArXiv:2308.14970, 2023."},{"key":"8","doi-asserted-by":"crossref","unstructured":"[8] M. Todisco, X. Wang, V. Vestman, M. Sahidullah, H. Delgado, A. Nautsch, J. Yamagishi, N. Evans, T.H. Kinnunen, and K.A. Lee, \u201cASVspoof 2019: Future horizons in spoofed and fake audio detection,\u201d arXiv preprint arXiv:1904.05441, 2019.","DOI":"10.21437\/Interspeech.2019-2249"},{"key":"9","doi-asserted-by":"publisher","unstructured":"[9] X. Wang, J. Yamagishi, M. Todisco, H. Delgado, A. Nautsch, N. Evans, M. Sahidullah, V. Vestman, T. Kinnunen, K.A. Lee, L. Juvela, P. Alku, Y.-H. Peng, H.-T. Hwang, Y. Tsao, H.-M. Wang, S.L. Maguer, M. Becker, F. Henderson, R. Clark, Y. Zhang, Q. Wang, Y. Jia, K. Onuma, K. Mushika, T. Kaneda, Y. Jiang, L.-J. Liu, Y.-C. Wu, W.-C. Huang, T. Toda, K. Tanaka, H. Kameoka, I. Steiner, D. Matrouf, J.-F. Bonastre, A. Govender, S. Ronanki, J.-X. Zhang, and Z.-H. Ling, \u201cASVspoof 2019: A large-scale public database of synthesized, converted and replayed speech,\u201d Computer Speech &amp; Language, vol.64, p.101114, 2020. 10.1016\/j.csl.2020.101114","DOI":"10.1016\/j.csl.2020.101114"},{"key":"10","doi-asserted-by":"publisher","unstructured":"[10] K. Zaman, M. Sah, C. Direkoglu, and M. Unoki, \u201cA Survey of Audio Classification using Deep Learning,\u201d IEEE Access, vol.11, pp.106620-106649, Sept. 2023. 10.1109\/access.2023.3318015","DOI":"10.1109\/ACCESS.2023.3318015"},{"key":"11","doi-asserted-by":"publisher","unstructured":"[11] M. Unoki, T. Irino, B. Glasberg, B.C.J. Moore, and R.D. Patterson, \u201cComparison of the roex and gammachirp filters as representations of the auditory filter,\u201d J. Acoust. Soc. Am., vol.120, no.3, pp.1474-1492, 2006. 10.1121\/1.2228539","DOI":"10.1121\/1.2228539"},{"key":"12","doi-asserted-by":"publisher","unstructured":"[12] S. Davis and P. Mermelstein, \u201cComparison of parametric representations for monosyllabic word recognition in continuously spoken sentences,\u201d IEEE Transactions On Acoustics, Speech, And Signal Processing, vol.28, no.4, pp.357-366, 1980. 10.1109\/tassp.1980.1163420","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"13","doi-asserted-by":"publisher","unstructured":"[13] T. Irino and M. Unoki, \u201cAn Analysis\/Synthesis Auditory Filterbank based on an IIR Implementation of the Gammachirp,\u201d J. Acoust. Soc. Jpn. (E), vol.20, no.6, pp.397-406, Nov. 1999. 10.1250\/ast.20.397","DOI":"10.1250\/ast.20.397"},{"key":"14","doi-asserted-by":"publisher","unstructured":"[14] M. Unoki and M. Akagi, \u201cA Method of Signal Extraction from Noisy Signal based on Auditory Scene Analysis,\u201d Speech Communication, vol.27, no.3, pp.261-279, April 1999. 10.1016\/s0167-6393(98)00077-6","DOI":"10.1016\/S0167-6393(98)00077-6"},{"key":"15","doi-asserted-by":"publisher","unstructured":"[15] K. Jensen, \u201cThe timbre model,\u201d J. Acoust. Soc. Am., vol.112, no.5, pp.2238-2238, 2002. 10.1121\/1.4778881","DOI":"10.1121\/1.4778881"},{"key":"16","unstructured":"[16] A. Pearce, T. Brookes, and R. Mason, \u201cTimbral attributes for sound effect library searching,\u201d Proc. AES Int. Conf. Semantic Audio, pp.1-8, June 2017. Accessed: March 16, 2023. [Online]. Available: https:\/\/www.aes.org\/e-lib\/browse.cfm?elib=18754"},{"key":"17","doi-asserted-by":"publisher","unstructured":"[17] A. Pearce, T. Brookes, and R. Mason, \u201cModelling timbral hardness,\u201d Applied Sciences, vol.9, no.3, p.466, 2019. 10.3390\/app9030466","DOI":"10.3390\/app9030466"},{"key":"18","unstructured":"[18] A. Pearce, S. Safavi, B. Tim, M. Russell, W. Wang, and M. Plumbley, \u201cEvaluation report on the second prototypes of the timbral characterisation tools,\u201d 2018. [Online]. Available: http:\/\/www.audiocommons.org\/materials\/"},{"key":"19","unstructured":"[19] A. Pearce, B. Tim, and M. Russell, \u201cRelease of timbral characterisation tools for semantically annotating non-musical content,\u201d 2020. [Online]. Available: https:\/\/www.audiocommons.org\/materials\/"},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] S. Ystad, M. Aramaki, and R. Kronland-Martinet, \u201cTimbre from Sound Synthesis and High-Level Control Perspectives,\u201d Timbre: Acoustics, Perception, and Cognition, pp.361-389, 2019. 10.1007\/978-3-030-14832-4_13","DOI":"10.1007\/978-3-030-14832-4_13"},{"key":"21","unstructured":"[21] S. Hatano and T. Hashimoto, \u201cBooming index as a measure for evaluating booming sensation,\u201d Proc. Inter-Noise2000, no.233, pp.1-6, 2000."},{"key":"22","doi-asserted-by":"publisher","unstructured":"[22] T. Isoyama, S. Kidani, and M. Unoki, \u201cComputational models of sound-quality metrics using method for calculating loudness with gammatone\/gammachirp auditory filterbank,\u201d Applied Acoustics, vol.218, March 2024. DOI: https:\/\/doi.org\/10.1016\/j.apacoust.2024.109914 10.1016\/j.apacoust.2024.109914","DOI":"10.1016\/j.apacoust.2024.109914"},{"key":"23","doi-asserted-by":"crossref","unstructured":"[23] M. Vashkevich, A. Petrovsky, and Y. Rushkevich, \u201cBulbar als detection based on analysis of voice perturbation and vibrato,\u201d 2019 Signal Processing: Algorithms, Architectures, Arrangements, and Applications (SPA), pp.267-272, 2019. 10.23919\/spa.2019.8936657","DOI":"10.23919\/SPA.2019.8936657"},{"key":"24","unstructured":"[24] E. Azarov, M. Vashkevich, and A. Petrovsky, \u201cInstantaneous pitch estimation based on rapt framework,\u201d Proc. EUSIPCO2012, pp.2787-2791, 2012."},{"key":"25","doi-asserted-by":"publisher","unstructured":"[25] A. de Cheveign\u00e9 and H. Kawahara, \u201cYin, a fundamental frequency estimator for speech and music,\u201d J. Acoust. Soc. Am., vol.111, no.4, pp.1917-1930, 2002. 10.1121\/1.1458024","DOI":"10.1121\/1.1458024"},{"key":"26","doi-asserted-by":"crossref","unstructured":"[26] M. Mauch and S. Dixon, \u201cpyin: A fundamental frequency estimator using probabilistic threshold distributions,\u201d Proc. ICASSP2014, pp.659-663, 2014. 10.1109\/icassp.2014.6853678","DOI":"10.1109\/ICASSP.2014.6853678"},{"key":"27","doi-asserted-by":"publisher","unstructured":"[27] A. Camacho and J.G. Harris, \u201cA sawtooth waveform inspired pitch estimator for speech and music,\u201d J. Acoust. Soc. Am., vol.124, no.3, pp.1638-1652, 2008. 10.1121\/1.2951592","DOI":"10.1121\/1.2951592"},{"key":"28","doi-asserted-by":"crossref","unstructured":"[28] R.J. Baken and R.F. Orlikoff, Clinical measurement of speech and voice, 2nd ed. San Diego, CA: Singular Thomson Learning, 2000.","DOI":"10.3109\/14417040008996786"},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] H. Cheng, C.O. Mawalim, K. Li, L. Wang, and M. Unoki, \u201cAnalysis of Spectro-Temporal Modulation Representation for Deep-Fake Speech Detection,\u201d Proc. APSIPA ASC 2023, pp.1822-1829, Nov. 2023. 10.1109\/apsipaasc58517.2023.10317309","DOI":"10.1109\/APSIPAASC58517.2023.10317309"},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] A. Chaiwongyen, N. Songsriboonsit, S. Duangpummet, J. Karnjana, W. Kongprawechnon, and M. Unoki, \u201cContribution of Timbre and Shimmer Features to Deepfake Speech Detection,\u201d Proc. APSIPA2022, pp.97-103, Chiang Mai, Thailand, Nov. 2022. 10.23919\/apsipaasc55919.2022.9980281","DOI":"10.23919\/APSIPAASC55919.2022.9980281"},{"key":"31","doi-asserted-by":"crossref","unstructured":"[31] A. Chaiwongyen, S. Duangpummet, J. Karnjana, W. Kongprawechnon, and M. Unoki, \u201cDeepfake-speech Detection with Pathological Features and Multilayer Perceptron Neural Network,\u201d Proc. APSIPA ASC 2023, pp.2182-2188, Nov. 2023. 10.1109\/apsipaasc58517.2023.10317331","DOI":"10.1109\/APSIPAASC58517.2023.10317331"},{"key":"32","doi-asserted-by":"publisher","unstructured":"[32] K. Li, X. Lu, M. Akagi, and M. Unoki, \u201cContributions of Jitter and Shimmer in the Voice for Fake Audio Detection,\u201d IEEE Access, vol.11, pp.2169-3536, 2023. DOI: 10.1109\/ACCESS.2023.3301616 10.1109\/access.2023.3301616","DOI":"10.1109\/ACCESS.2023.3301616"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] K. Li, S. Li, X. Lu, M. Akagi, M. Liu, L. Zhang, C. Zeng, L. Wang, J. Dang, and M. Unoki, \u201cData Augmentation Using McAdams-Coefficient-Based Speaker Anonymization for Fake Audio Detection,\u201d Proc. Interspeech2022, pp.664-668, Incheon, Korea, 2022. 10.21437\/interspeech.2022-10088","DOI":"10.21437\/Interspeech.2022-10088"},{"key":"34","unstructured":"[34] B. Alexei, Z. Yuhao, M. Abdelrahman, and A. Michael, \u201cwav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations,\u201d Advances in Neural Information Processing Systems, vol.33, pp.12449-12460, 2020."},{"key":"35","doi-asserted-by":"crossref","unstructured":"[35] G. Lavrentyeva, S. Novoselov, E. Malykh, A. Kozlov, O. Kudashev, and V. Shchemelinin, \u201cAudio replay attack detection with deep learning frameworks,\u201d Proc. Interspeech, pp.82-86, 2017 10.21437\/interspeech.2017-360","DOI":"10.21437\/Interspeech.2017-360"},{"key":"36","doi-asserted-by":"crossref","unstructured":"[36] X. Wang and J. Yamagishi, \u201cInvestigating self-supervised front ends for speech spoofing countermeasures,\u201d arXiv preprint arXiv:2111.07725, 2021.","DOI":"10.21437\/Odyssey.2022-14"},{"key":"37","doi-asserted-by":"crossref","unstructured":"[37] X. Wang and J. Yamagishi, \u201cA comparative study on recent neural spoofing countermeasures for synthetic speech detection,\u201d arXiv preprint arXiv:2103.11326, 2021.","DOI":"10.21437\/Interspeech.2021-702"},{"key":"38","unstructured":"[38] M. Lin, Q. Chen, and S. Yan, \u201cNetwork in network,\u201d arXiv preprint arXiv:1312.4400, 2013."},{"key":"39","doi-asserted-by":"publisher","unstructured":"[39] J. Kim and M. Hahn, \u201cVoice activity detection using an adaptive context attention model,\u201d IEEE Signal Process. Lett., vol.25, no.8, pp.1181-1185, 2018. 10.1109\/lsp.2018.2811740","DOI":"10.1109\/LSP.2018.2811740"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E108.D\/4\/E108.D_2024MUI0001\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T03:26:56Z","timestamp":1743823616000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E108.D\/4\/E108.D_2024MUI0001\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,1]]},"references-count":39,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2024mui0001","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,1]]},"article-number":"2024MUI0001"}}