{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T16:36:18Z","timestamp":1775838978898,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,8]],"date-time":"2024-04-08T00:00:00Z","timestamp":1712534400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Brno University of Technology","award":["FIT-S-23-8151"],"award-info":[{"award-number":["FIT-S-23-8151"]}]},{"name":"Ministry of the Interior of the Czech Republic","award":["VB02000060"],"award-info":[{"award-number":["VB02000060"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,8]]},"DOI":"10.1145\/3605098.3635911","type":"proceedings-article","created":{"date-parts":[[2024,5,21]],"date-time":"2024-05-21T17:59:16Z","timestamp":1716314356000},"page":"1312-1320","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":16,"title":["Deepfake Speech Detection: A Spectrogram Analysis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4717-1910","authenticated-orcid":false,"given":"Anton","family":"Firc","sequence":"first","affiliation":[{"name":"Brno University of Technology, Brno, Czech Republic"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9009-2193","authenticated-orcid":false,"given":"Kamil","family":"Malinka","sequence":"additional","affiliation":[{"name":"Brno University of Technology, Brno, Czech Republic"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5507-0768","authenticated-orcid":false,"given":"Petr","family":"Han\u00e1\u010dek","sequence":"additional","affiliation":[{"name":"Brno University of Technology, Brno, Czech Republic"}]}],"member":"320","published-online":{"date-parts":[[2024,5,21]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Iljoo Kim, Taekkyung Oh, and Hyoungshick Kim.","author":"Ahmed Muhammad Ejaz","year":"2020","unstructured":"Muhammad Ejaz Ahmed, Il-Youp Kwak, Jun Ho Huh, Iljoo Kim, Taekkyung Oh, and Hyoungshick Kim. 2020. Void: A Fast and Light Voice Liveness Detection System. USENIX Association, USA."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2109.00281"},{"key":"e_1_3_2_1_4_1","unstructured":"Thomas Brewster. 2022. Fraudsters cloned company director's voice in $35 million bank heist police find. https:\/\/www.forbes.com\/sites\/thomasbrewster\/2021\/10\/14\/huge-bank-fraud-uses-deep-fake-voice-tech-to-steal-millions\/?sh=776258a75591"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1121\/1.400476"},{"key":"e_1_3_2_1_6_1","unstructured":"Tom B\u00e4ckstr\u00f6m. 2019. Spectrogram and the STFT. online. https:\/\/wiki.aalto.fi\/display\/ITSP\/Spectrogram+and+the+STFT"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-11"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-19"},{"key":"e_1_3_2_1_9_1","unstructured":"Jemine Corentin. 2019. Real-time Voice Cloning. Master thesis. Universit\u00e9 de Li\u00e8ge Li\u00e8ge Belgique. https:\/\/matheo.uliege.be\/handle\/2268.2\/6801?locale=en"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477314.3507013"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.heliyon.2023.e15090"},{"key":"e_1_3_2_1_12_1","unstructured":"Joel Frank and Lea Sch\u00f6nherr. 2021. WaveFake: A Data Set to Facilitate Audio Deepfake Detection. arXiv:2111.02813 [cs.LG]"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/IberSPEECH.2018-10"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2016.7820786"},{"key":"e_1_3_2_1_15_1","unstructured":"Laurie Iacono Josh Hickman and Caitlin Muniz. 2022. The rise of vishing and Smishing attacks - the monitor issue 21. https:\/\/www.kroll.com\/en\/insights\/publications\/cyber\/monitor\/vishing-smishing-attacks"},{"key":"e_1_3_2_1_16_1","unstructured":"Keith Ito and Linda Johnson. 2017. The LJ Speech Dataset. online. https:\/\/keithito.com\/LJ-Speech-Dataset\/"},{"key":"e_1_3_2_1_17_1","volume-title":"End-to-End Music Transcription Using Fine-Tuned Variable-Q Filterbanks. Thesis","author":"Cwitkowitz Frank C.","unstructured":"Frank C. Cwitkowitz Jr. 2019. End-to-End Music Transcription Using Fine-Tuned Variable-Q Filterbanks. Thesis. Rochester Institute of Technology."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-13"},{"key":"e_1_3_2_1_19_1","volume-title":"Woo","author":"Khalid Hasam","year":"2021","unstructured":"Hasam Khalid, Shahroz Tariq, Minha Kim, and Simon S. Woo. 2021. FakeAVCeleb: A Novel Audio-Video Multimodal Deepfake Dataset. In Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2). https:\/\/openreview.net\/forum?id=TAXFsg6ZaOl"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/s13369-021-06297-w"},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis. In Advances in Neural Information Processing Systems, H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin (Eds.), Vol. 33. Curran Associates, Inc., 17022--17033. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/c5d736809766d46260d816d8dbc9eb44-Paper.pdf"},{"key":"e_1_3_2_1_22_1","volume-title":"Jose Sotelo, Alexandre de Br\u00e9bisson, Yoshua Bengio, and Aaron C Courville.","author":"Kumar Kundan","year":"2019","unstructured":"Kundan Kumar, Rithesh Kumar, Thibault de Boissiere, Lucas Gestin, Wei Zhen Teoh, Jose Sotelo, Alexandre de Br\u00e9bisson, Yoshua Bengio, and Aaron C Courville. 2019. MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis. In Advances in Neural Information Processing Systems, H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00e9-Buc, E. Fox, and R. Garnett (Eds.), Vol. 32. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/6804c9bca0a615bdb9374d00a9fcba59-Paper.pdf"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.113"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1190"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2006.1660175"},{"key":"e_1_3_2_1_26_1","volume-title":"Information retrieval for music and motion","author":"M\u00fcller Meinard","unstructured":"Meinard M\u00fcller. 2007. Information retrieval for music and motion. Springer Berlin, Heidelberg, New York."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Ryan Prenger Rafael Valle and Bryan Catanzaro. 2018. WaveGlow: A Flow-based Generative Network for Speech Synthesis. arXiv:1811.00002 [cs.SD]","DOI":"10.1109\/ICASSP.2019.8683143"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIOSIG58226.2023.10346006"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1561\/2000000001"},{"key":"e_1_3_2_1_30_1","volume-title":"Synthetic Speech Detection Using Deep Neural Networks. Master's thesis","author":"Reimao Ricardo","unstructured":"Ricardo Reimao. 2019. Synthetic Speech Detection Using Deep Neural Networks. Master's thesis. York University, Toronto, Ontario."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/SPED.2019.8906599"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/SpeD53181.2021.9587406"},{"key":"e_1_3_2_1_33_1","unstructured":"Philippe Remy. 2020. Temporal Convolutional Networks for Keras. https:\/\/github.com\/philipperemy\/keras-tcn."},{"key":"e_1_3_2_1_34_1","unstructured":"Ryosuke Sonobe Shinnosuke Takamichi and Hiroshi Saruwatari. 2017. JSUT corpus: free large-scale Japanese speech corpus for end-to-end speech synthesis. arXiv:1711.00354 [cs.CL]"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2016-41"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2249"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101114"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.7488\/ds\/2555"},{"key":"e_1_3_2_1_39_1","volume-title":"Tomi Kinnunen, Nicholas Evans, and H\u00e9ctor Delgado.","author":"Yamagishi Junichi","year":"2021","unstructured":"Junichi Yamagishi, Xin Wang, Massimiliano Todisco, Md Sahidullah, Jose Patino, Andreas Nautsch, Xuechen Liu, Kong Aik Lee, Tomi Kinnunen, Nicholas Evans, and H\u00e9ctor Delgado. 2021. ASVspoof 2021: accelerating progress in spoofed and deepfake speech detection. arXiv:2109.00537 [eess.AS]"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-020-65070-5"},{"key":"e_1_3_2_1_41_1","volume-title":"Log-Frequency Spectrogram and Chromagram. Retrieved","author":"Zalkow Frank","year":"2022","unstructured":"Frank Zalkow and Meinard M\u00fcller. 2015. Log-Frequency Spectrogram and Chromagram. Retrieved March 1, 2022 from https:\/\/www.audiolabs-erlangen.de\/resources\/MIR\/FMP\/C3\/C3S1_SpecLogFreq-Chromagram.html"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-522"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6639215"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","unstructured":"Zhenyu Zhang Yewei Gu Xiaowei Yi and Xianfeng Zhao. 2020. SynSpeechDDB: a new synthetic speech detection database. 10.21227\/ta8z-mx73","DOI":"10.21227\/ta8z-mx73"},{"key":"e_1_3_2_1_45_1","volume-title":"FMFCC-A: A Challenging Mandarin Dataset for Synthetic Speech Detection","author":"Zhang Zhenyu","unstructured":"Zhenyu Zhang, Yewei Gu, Xiaowei Yi, and Xianfeng Zhao. 2022. FMFCC-A: A Challenging Mandarin Dataset for Synthetic Speech Detection. In Digital Forensics and Watermarking, Xianfeng Zhao, Alessandro Piva, and Pedro Comesa\u00f1a-Alfaro (Eds.). Springer International Publishing, Cham, 117--131."}],"event":{"name":"SAC '24: 39th ACM\/SIGAPP Symposium on Applied Computing","location":"Avila Spain","acronym":"SAC '24","sponsor":["SIGAPP ACM Special Interest Group on Applied Computing"]},"container-title":["Proceedings of the 39th ACM\/SIGAPP Symposium on Applied Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605098.3635911","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3605098.3635911","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:36:14Z","timestamp":1750178174000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605098.3635911"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,8]]},"references-count":44,"alternative-id":["10.1145\/3605098.3635911","10.1145\/3605098"],"URL":"https:\/\/doi.org\/10.1145\/3605098.3635911","relation":{},"subject":[],"published":{"date-parts":[[2024,4,8]]},"assertion":[{"value":"2024-05-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}