{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:59:40Z","timestamp":1776884380457,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":32,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658086","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"376-384","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Retrieval-Augmented Audio Deepfake Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9789-7798","authenticated-orcid":false,"given":"Zuheng","family":"Kang","sequence":"first","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-9722-8031","authenticated-orcid":false,"given":"Yayun","family":"He","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9114-1236","authenticated-orcid":false,"given":"Botao","family":"Zhao","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8353-4064","authenticated-orcid":false,"given":"Xiaoyang","family":"Qu","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2903-9615","authenticated-orcid":false,"given":"Junqing","family":"Peng","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd., Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9615-4749","authenticated-orcid":false,"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[{"name":"Ping An Insurance (Group) Company of China, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9237-4231","authenticated-orcid":false,"given":"Jianzong","family":"Wang","sequence":"additional","affiliation":[{"name":"Ping An Technology (Shenzhen) Co., Ltd., Shenzhen, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"12449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. Advances In Neural Information Processing Systems , Vol. 33 (2020), 12449--12460.","journal-title":"Advances In Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2023.101597"},{"key":"e_1_3_2_1_3_1","volume-title":"Advances in Neural Information Processing Systems (NIPS)","volume":"36","author":"Chen Jinggang","year":"2024","unstructured":"Jinggang Chen, Junjie Li, Xiaoyang Qu, Jianzong Wang, Jiguang Wan, and Jing Xiao. 2024. GAIA: Delving into Gradient-based Attribution Abnormality for Out-of-distribution Detection. Advances in Neural Information Processing Systems (NIPS) , Vol. 36 (2024)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"e_1_3_2_1_6_1","volume-title":"SAMO: Speaker Attractor Multi-Center One-Class Learning For Voice Anti-Spoofing. International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Ding Sivan","year":"2022","unstructured":"Sivan Ding, You Zhang, and Zhiyao Duan. 2022. SAMO: Speaker Attractor Multi-Center One-Class Learning For Voice Anti-Spoofing. International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2022), 1--5."},{"key":"e_1_3_2_1_7_1","volume-title":"Spatial reconstructed local attention Res2Net with F0 subband for fake speech detection. Neural Networks","author":"Fan Cunhang","year":"2024","unstructured":"Cunhang Fan, Jun Xue, Jianhua Tao, Jiangyan Yi, Chenglong Wang, Chengshi Zheng, and Zhao Lv. 2024. Spatial reconstructed local attention Res2Net with F0 subband for fake speech detection. Neural Networks (2024), 106320."},{"key":"e_1_3_2_1_8_1","unstructured":"Yunfan Gao Yun Xiong Xinyu Gao Kangxiang Jia Jinliu Pan Yuxi Bi Yi Dai Jiawei Sun Qianyu Guo Meng Wang and Haofen Wang. 2023. Retrieval-Augmented Generation for Large Language Models: A Survey."},{"key":"e_1_3_2_1_9_1","volume-title":"Audio Deepfake Detection With Self-Supervised Wavlm And Multi-Fusion Attentive Classifier. In International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 12702--12706","author":"Guo Yinlin","year":"2024","unstructured":"Yinlin Guo, Haofan Huang, Xi Chen, He Zhao, and Yuehai Wang. 2024. Audio Deepfake Detection With Self-Supervised Wavlm And Multi-Fusion Attentive Classifier. In International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 12702--12706."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3231480"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2021.3089437"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2023.3251895"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2023-1537"},{"key":"e_1_3_2_1_14_1","volume-title":"Reynolds","author":"Kinnunen Tomi H.","year":"2018","unstructured":"Tomi H. Kinnunen, Kong-Aik Lee, H\u00e9ctor Delgado, Nicholas W. D. Evans, Massimiliano Todisco, Md. Sahidullah, Junichi Yamagishi, and Douglas A. Reynolds. 2018. t-DCF: a Detection Cost Function for the Tandem Assessment of Spoofing Countermeasures and Automatic Speaker Verification. (2018)."},{"key":"e_1_3_2_1_15_1","first-page":"9459","article-title":"Retrieval-augmented generation for knowledge-intensive NLP tasks","volume":"33","author":"Lewis Patrick","year":"2020","unstructured":"Patrick Lewis, Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich K\u00fcttler, Mike Lewis, Wen-tau Yih, Tim Rockt\"aschel, et al. 2020. Retrieval-augmented generation for knowledge-intensive NLP tasks. Advances in Neural Information Processing Systems , Vol. 33 (2020), 9459--9474.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","volume-title":"Channel-wise gated res2net: Towards robust detection of synthetic speech attacks","author":"Li Xu","year":"2021","unstructured":"Xu Li, Xixin Wu, Hui Lu, Xunying Liu, and Helen Meng. 2021. Channel-wise gated res2net: Towards robust detection of synthetic speech attacks. International Speech Communication Association (Interspeech) (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414670"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747768"},{"key":"e_1_3_2_1_19_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2023","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2023. Robust speech recognition via large-scale weak supervision. (2023), 28492--28518."},{"key":"e_1_3_2_1_20_1","volume-title":"AI-Synthesized Voice Detection Using Neural Vocoder Artifacts. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW)","author":"Sun Chengzhe","year":"2023","unstructured":"Chengzhe Sun, Shan Jia, Shuwei Hou, and Siwei Lyu. 2023. AI-Synthesized Voice Detection Using Neural Vocoder Artifacts. 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (CVPRW) (2023), 904--912."},{"key":"e_1_3_2_1_21_1","volume-title":"Speaker Odyssey Workshop","author":"Tak Hemlata","year":"2022","unstructured":"Hemlata Tak, Massimiliano Todisco, Xin Wang, Jee weon Jung, Junichi Yamagishi, and Nicholas W. D. Evans. 2022. Automatic speaker verification spoofing and deepfake detection using wav2vec 2.0 and data augmentation. Speaker Odyssey Workshop (2022)."},{"key":"e_1_3_2_1_22_1","volume-title":"Jose Patino, Madhu R. Kamble, Massimiliano Todisco, and Nicholas W. D. Evans.","author":"Tak Hemlata","year":"2021","unstructured":"Hemlata Tak, Jee weon Jung, Jose Patino, Madhu R. Kamble, Massimiliano Todisco, and Nicholas W. D. Evans. 2021. End-to-End Spectro-Temporal Graph Attention Networks for Speaker Verification Anti-Spoofing and Speech Deepfake Detection. ASVspoof 2021 Workshop-Automatic Speaker Verification and Spoofing Coutermeasures Challenge (2021)."},{"key":"e_1_3_2_1_23_1","volume-title":"ASVspoof 2019: Future Horizons in Spoofed and Fake Audio Detection","author":"Todisco Massimiliano","unstructured":"Massimiliano Todisco, Xin Wang, Ville Vestman, Md. Sahidullah, H\u00e9ctor Delgado, Andreas Nautsch, Junichi Yamagishi, Nicholas W. D. Evans, Tomi H. Kinnunen, and Kong-Aik Lee. 2019. ASVspoof 2019: Future Horizons in Spoofed and Fake Audio Detection. In International Speech Communication Association (Interspeech)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-702"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2022-14"},{"key":"e_1_3_2_1_26_1","volume-title":"AASIST: Audio Anti-Spoofing Using Integrated Spectro-Temporal Graph Attention Networks. International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Jung Jee","year":"2021","unstructured":"Jee weon Jung, Hee-Soo Heo, Hemlata Tak, Hye jin Shim, Joon Son Chung, Bong-Jin Lee, Ha jin Yu, and Nicholas W. D. Evans. 2021. AASIST: Audio Anti-Spoofing Using Integrated Spectro-Temporal Graph Attention Networks. International Conference on Acoustics, Speech and Signal Processing (ICASSP) (2021), 6367--6371."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-8"},{"key":"e_1_3_2_1_28_1","volume-title":"ADD 2022: the first audio deep synthesis detection challenge. In International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 9216--9220","author":"Yi Jiangyan","year":"2022","unstructured":"Jiangyan Yi, Ruibo Fu, Jianhua Tao, Shuai Nie, Haoxin Ma, Chenglong Wang, Tao Wang, Zhengkun Tian, Ye Bai, Cunhang Fan, et al. 2022. ADD 2022: the first audio deep synthesis detection challenge. In International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 9216--9220."},{"key":"e_1_3_2_1_29_1","volume-title":"ADD 2023: the Second Audio Deepfake Detection Challenge. ArXiv","volume":"2305","author":"Yi Jiangyan","year":"2023","unstructured":"Jiangyan Yi, Jianhua Tao, Ruibo Fu, Xinrui Yan, Chenglong Wang, Tao Wang, Chu Yuan Zhang, Xiaohui Zhang, Yan Zhao, Yong Ren, Leling Xu, Jun Zhou, Hao Gu, Zhengqi Wen, Shan Liang, Zheng Lian, Shuai Nie, and Haizhou Li. 2023 a. ADD 2023: the Second Audio Deepfake Detection Challenge. ArXiv , Vol. abs\/2305.13774 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Chu Yuan Zhang, and Yan Zhao. 2023 b. Audio Deepfake Detection: A Survey. ArXiv","author":"Yi Jiangyan","year":"2023","unstructured":"Jiangyan Yi, Chenglong Wang, Jianhua Tao, Xiaohui Zhang, Chu Yuan Zhang, and Yan Zhao. 2023 b. Audio Deepfake Detection: A Survey. ArXiv , Vol. abs\/2308.14970 (2023)."},{"key":"e_1_3_2_1_31_1","volume-title":"The Effect of Silence and Dual-Band Fusion in Anti-Spoofing System","author":"Zhang Yuxiang","unstructured":"Yuxiang Zhang, Wenchao Wang, and Pengyuan Zhang. 2021. The Effect of Silence and Dual-Band Fusion in Anti-Spoofing System. In International Speech Communication Association (Interspeech)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSPIS54653.2021.9729387"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658086","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658086","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:54:49Z","timestamp":1755766489000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658086"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":32,"alternative-id":["10.1145\/3652583.3658086","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658086","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}