{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T17:44:25Z","timestamp":1778694265858,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746265.3759675","type":"proceedings-article","created":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T22:20:29Z","timestamp":1759962029000},"page":"93-100","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["SASDN: A Generalizable and Minimal-Intervention LLM-Integrated Framework for Continual Adaptation in Spoofed Speech Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2804-3195","authenticated-orcid":false,"given":"Utkarsh","family":"Venaik","sequence":"first","affiliation":[{"name":"IIIT Delhi, New Delhi, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3984-7957","authenticated-orcid":false,"given":"Akash","family":"Kushwaha","sequence":"additional","affiliation":[{"name":"IIIT Delhi, New Delhi, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9202-5865","authenticated-orcid":false,"given":"Nabeel Koya","family":"A","sequence":"additional","affiliation":[{"name":"CDAC, Thiruvananthapuram, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1028-9373","authenticated-orcid":false,"given":"Rajiv Ratn","family":"Shah","sequence":"additional","affiliation":[{"name":"IIIT Delhi, New Delhi, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Aculab. n.d.. 
Voisentry Use Cases. https:\/\/aculab.com\/biometric-technologies\/voisentry-use-cases\/"},{"key":"e_1_3_2_1_2_1","volume-title":"Adapting TTS models for new speakers using transfer learning. arXiv preprint arXiv:2110.05798","author":"Chen Mingjian","year":"2021","unstructured":"Mingjian Chen, Yu Wu, Yi Ren, Zhiying Zhao, Jinglin Liu, Zhijie Lian, and Ming Zhou. 2021. Adapting TTS models for new speakers using transfer learning. arXiv preprint arXiv:2110.05798 (2021)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2020-19"},{"key":"e_1_3_2_1_4_1","volume-title":"Information Re-Organization Improves Reasoning in Large Language Models. arXiv preprint arXiv:2404.13985","author":"Cheng Xiaoxia","year":"2024","unstructured":"Xiaoxia Cheng, Zeqi Tan, Wei Xue, and Weiming Lu. 2024. Information Re-Organization Improves Reasoning in Large Language Models. arXiv preprint arXiv:2404.13985 (2024)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Joon Son Chung Arsha Nagrani and Andrew Zisserman. 2018. VoxCeleb2: Deep Speaker Recognition. In Interspeech.","DOI":"10.21437\/Interspeech.2018-1929"},{"key":"e_1_3_2_1_7_1","volume-title":"Pattern Recognition","author":"Daugman John","year":"2003","unstructured":"John Daugman and Cathryn Downing. 2003. The Importance of Being Random: Statistical Principles of Iris Recognition. In Pattern Recognition. Elsevier, 279-291. https:\/\/www.sciencedirect.com\/science\/article\/pii\/S0031320303002862"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.21437\/Odyssey.2018-42"},{"key":"e_1_3_2_1_9_1","first-page":"3830","article-title":"ECAPA-TDNN: Emphasized Channel Attention, propagation and aggregation in TDNN based speaker verification","volume":"2020","author":"Desplanques Brecht","year":"2020","unstructured":"Brecht Desplanques, Jenthe Thienpondt, and Kris Demuynck. 2020. 
ECAPA-TDNN: Emphasized Channel Attention, propagation and aggregation in TDNN based speaker verification. In Interspeech 2020. 3830-3834.","journal-title":"Interspeech"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2013-288"},{"key":"e_1_3_2_1_11_1","volume-title":"Detection and Evaluation of Human and Machine Generated Speech in Spoofing Attacks on Automatic Speaker Verification Systems. arXiv preprint arXiv:2011.03689","author":"Gao Yang","year":"2020","unstructured":"Yang Gao, Jiachen Lian, Bhiksha Raj, and Rita Singh. 2020. Detection and Evaluation of Human and Machine Generated Speech in Spoofing Attacks on Automatic Speaker Verification Systems. arXiv preprint arXiv:2011.03689 (2020)."},{"key":"e_1_3_2_1_12_1","first-page":"5991","article-title":"Utterance-Level End-to-End Language Identification Using Attention-Based CNN-BLSTM. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","author":"Gao Zhiyuan","year":"2019","unstructured":"Zhiyuan Gao, Shiyu Wang, Zhenyu Zhao, Jian Li, Yanjie Li, and Yonghong Yan. 2019. Utterance-Level End-to-End Language Identification Using Attention-Based CNN-BLSTM. In ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 5991-5995.","journal-title":"IEEE"},{"key":"e_1_3_2_1_13_1","first-page":"1","volume-title":"Proceedings of the ACM on Human-Computer Interaction","volume":"7","author":"Gong Fengjiao","year":"2024","unstructured":"Fengjiao Gong, Yuzhou Nie, and Hongteng Xu. 2024. MultiSurf-GPT: Facilitating Context-Aware Reasoning with Large Language Models. Proceedings of the ACM on Human-Computer Interaction, Vol. 7, CSCW2 (2024), 1-24."},{"key":"e_1_3_2_1_14_1","unstructured":"Ian Goodfellow Yoshua Bengio and Aaron Courville. 2016. Deep Learning. (2016). 
https:\/\/www.deeplearningbook.org\/."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","volume-title":"Identity Mappings in Deep Residual Networks. arXiv preprint arXiv:1603.05027","author":"He Kaiming","year":"2016","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2016b. Identity Mappings in Deep Residual Networks. arXiv preprint arXiv:1603.05027 (2016)."},{"key":"e_1_3_2_1_17_1","volume-title":"A Comparative Study of Credibility Assessments by LLM and Human Judges. arXiv preprint arXiv:2311.15396","author":"Jiang Yifan","year":"2023","unstructured":"Yifan Jiang, John Smith, and Jane Doe. 2023. A Comparative Study of Credibility Assessments by LLM and Human Judges. arXiv preprint arXiv:2311.15396 (2023). https:\/\/arxiv.org\/pdf\/2311.15396"},{"key":"e_1_3_2_1_18_1","volume-title":"To What Extent Can ASV Systems Naturally Defend Against Spoofing Attacks? arXiv preprint arXiv:2406.05339","author":"Jung Jee-weon","year":"2024","unstructured":"Jee-weon Jung, Xin Wang, Nicholas Evans, Shinji Watanabe, Hye-jin Shim, Hemlata Tak, Siddhant Arora, Junichi Yamagishi, and Joon Son Chung. 2024. To What Extent Can ASV Systems Naturally Defend Against Spoofing Attacks? arXiv preprint arXiv:2406.05339 (2024)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1017\/ATSIP.2018.19"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning (ICML). PMLR, 5530-5540","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. In Proceedings of the 38th International Conference on Machine Learning (ICML). PMLR, 5530-5540. https:\/\/proceedings.mlr.press\/v139\/kim21k.html"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 29th International Conference on Neural Information Processing (ICONIP). 
Springer, 3-15","author":"Kim Taesu","year":"2022","unstructured":"Taesu Kim, Janghoon Cho, Minchan Jung, Minchan Kim, Soonyoung Jung, and Sungroh Yoon. 2022. XTTS: Multilingual Zero-Shot Text-to-Speech and Voice Conversion with Conditional Variational Autoencoders. In Proceedings of the 29th International Conference on Neural Information Processing (ICONIP). Springer, 3-15. https:\/\/arxiv.org\/pdf\/2406.04904"},{"key":"e_1_3_2_1_22_1","volume-title":"ASVspoof 2024 Workshop. https:\/\/www.isca-archive.org\/asvspoof_2024\/kulkarni24_asvspoof.pdf","author":"Kulkarni Prakash","year":"2024","unstructured":"Prakash Kulkarni, Vikram Kamble, and Hemant Patil. 2024. Exploring Generalization to Unseen Audio Data for Spoofing. In ASVspoof 2024 Workshop. https:\/\/www.isca-archive.org\/asvspoof_2024\/kulkarni24_asvspoof.pdf"},{"key":"e_1_3_2_1_23_1","volume-title":"ASSERT: Anti-Spoofing with Squeeze-Excitation and Residual neTworks. In Interspeech. https:\/\/api.semanticscholar.org\/CorpusID:91184432","author":"Lai I","year":"2019","unstructured":"Cheng-I Lai, Nanxin Chen, Jes\u00fas Villalba, and Najim Dehak. 2019. ASSERT: Anti-Spoofing with Squeeze-Excitation and Residual neTworks. In Interspeech. https:\/\/api.semanticscholar.org\/CorpusID:91184432"},{"key":"e_1_3_2_1_24_1","unstructured":"Patrick Lewis Ethan Perez Aleksandra Piktus Fabio Petroni Vladimir Karpukhin Naman Goyal Heinrich K\u00fcttler Mike Lewis Wen-tau Yih Tim Rockt\u00e4schel Sebastian Riedel and Douwe Kiela. 2020. Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/taslp.2023.3285283"},{"key":"e_1_3_2_1_26_1","volume-title":"Audio Scene Classification with Deep Recurrent Neural Networks. arXiv preprint arXiv:1703.04770","author":"Phan Huy","year":"2017","unstructured":"Huy Phan, Philipp Koch, Fabrice Katzberg, Marco Maass, Radoslaw Mazur, and Alfred Mertins. 2017. 
Audio Scene Classification with Deep Recurrent Neural Networks. arXiv preprint arXiv:1703.04770 (2017)."},{"key":"e_1_3_2_1_27_1","first-page":"10756","article-title":"Masked Audio Text Encoders are Effective Multi-Modal Rescorers","volume":"2023","author":"Qian Yao","year":"2023","unstructured":"Yao Qian, Ke Li, and Yifan Gong. 2023. Masked Audio Text Encoders are Effective Multi-Modal Rescorers. In Findings of the Association for Computational Linguistics: ACL 2023. 10756-10766.","journal-title":"Findings of the Association for Computational Linguistics: ACL"},{"key":"e_1_3_2_1_28_1","volume-title":"Tao Xu, Greg Brockman, Chris McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Chris McLeavey, and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. arXiv preprint arXiv:2212.04356 (2022)."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 2018 IEEE Spoken Language Technology Workshop (SLT). IEEE, 1-8.","author":"Ravanelli Mirco","year":"2018","unstructured":"Mirco Ravanelli and Yoshua Bengio. 2018. Interpretable Convolutional Filters with SincNet. In Proceedings of the 2018 IEEE Spoken Language Technology Workshop (SLT). IEEE, 1-8."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461368"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1844"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414234"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.21437\/ASVSPOOF.2021-1"},{"key":"e_1_3_2_1_34_1","volume-title":"Large Language Models are In-Context Semantic Reasoners rather than Symbolic Reasoners. arXiv preprint arXiv:2305.14825","author":"Tang Xiaojuan","year":"2023","unstructured":"Xiaojuan Tang, Zilong Zheng, Jiaqi Li, Fanxu Meng, Song-Chun Zhu, Yitao Liang, and Muhan Zhang. 2023. 
Large Language Models are In-Context Semantic Reasoners rather than Symbolic Reasoners. arXiv preprint arXiv:2305.14825 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2017.01.001"},{"key":"e_1_3_2_1_36_1","volume-title":"Hye jin Shim, Massimiliano Todisco, Ivan Kukanov, Xuechen Liu, Md Sahidullah, Tomi Kinnunen, Nicholas Evans, Kong Aik Lee, and Junichi Yamagishi.","author":"Wang Xin","year":"2024","unstructured":"Xin Wang, Hector Delgado, Hemlata Tak, Jee weon Jung, Hye jin Shim, Massimiliano Todisco, Ivan Kukanov, Xuechen Liu, Md Sahidullah, Tomi Kinnunen, Nicholas Evans, Kong Aik Lee, and Junichi Yamagishi. 2024. ASVspoof 5: Crowdsourced Speech Data, Deepfakes, and Adversarial Attacks at Scale. arXiv:2408.08739 [eess.AS] https:\/\/arxiv.org\/abs\/2408.08739"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2020.101114"},{"key":"e_1_3_2_1_38_1","first-page":"3583","article-title":"Improved RawNet with Feature Map Scaling for Text-independent Speaker Verification using Raw Waveforms","author":"Jung Jee","year":"2020","unstructured":"Jee weon Jung, Seung bin Kim, Hye jin Shim, Ju ho Kim, and Ha-Jin Yu. 2020. Improved RawNet with Feature Map Scaling for Text-independent Speaker Verification using Raw Waveforms. In Proceedings of Interspeech. ISCA, 3583-3587. https:\/\/www.isca-speech.org\/archive\/Interspeech_2020\/pdfs\/2585.pdf","journal-title":"Proceedings of Interspeech. ISCA"},{"key":"e_1_3_2_1_39_1","first-page":"1268","article-title":"RawNet: Advanced End-to-End Deep Neural Network Using Raw Waveforms for Text-Independent Speaker Verification","author":"Jung Jee","year":"2019","unstructured":"Jee weon Jung, Hee-Soo Heo, Han-Gyu Kim, Hye jin Shim, Joon Son Chung, and Ha-Jin Yu. 2019. RawNet: Advanced End-to-End Deep Neural Network Using Raw Waveforms for Text-Independent Speaker Verification. In Proceedings of Interspeech. ISCA, 1268-1272. 
https:\/\/www.isca-archive.org\/interspeech_2019\/papers\/1982.pdf","journal-title":"Proceedings of Interspeech. ISCA"},{"key":"e_1_3_2_1_40_1","volume-title":"Joon Son Chung, Bong-Jin Lee, Ha-Jin Yu, and Nicholas Evans.","author":"Jung Jee","year":"2021","unstructured":"Jee weon Jung, Hee-Soo Heo, Hemlata Tak, Hye jin Shim, Joon Son Chung, Bong-Jin Lee, Ha-Jin Yu, and Nicholas Evans. 2021. AASIST: Audio Anti-Spoofing using Integrated Spectro-Temporal Graph Attention Networks. arXiv:2110.01200 [eess.AS] https:\/\/arxiv.org\/abs\/2110.01200"},{"key":"e_1_3_2_1_41_1","first-page":"808","article-title":"Spoofing and countermeasures for speaker verification: A review","volume":"10","author":"Wu Zhizheng","year":"2015","unstructured":"Zhizheng Wu, Nicholas Evans, Tomi Kinnunen, Junichi Yamagishi, Florian Alegre, and Haizhou Li. 2015a. Spoofing and countermeasures for speaker verification: A review. IEEE Transactions on Information Forensics and Security, Vol. 10, 4 (2015), 808-824.","journal-title":"IEEE Transactions on Information Forensics and Security"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2014.10.005"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2017.2671435"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-462"},{"key":"e_1_3_2_1_45_1","volume-title":"ASVspoof 2021: accelerating progress in spoofed and deepfake speech detection. arXiv preprint arXiv:2109.00537","author":"Yamagishi Junichi","year":"2021","unstructured":"Junichi Yamagishi, Xin Wang, Massimiliano Todisco, Md Sahidullah, Jose Patino, H\u00e9ctor Delgado, Tomi Kinnunen, Nicholas Evans, Andreas Nautsch, and Kong Aik Lee. 2021. ASVspoof 2021: accelerating progress in spoofed and deepfake speech detection. arXiv preprint arXiv:2109.00537 (2021)."},{"key":"e_1_3_2_1_46_1","volume-title":"A Survey on Speech Large Language Models. 
arXiv preprint arXiv:2410.18908","author":"Zhang Lei","year":"2024","unstructured":"Lei Zhang, Xin Wang, and Junichi Yamagishi. 2024. A Survey on Speech Large Language Models. arXiv preprint arXiv:2410.18908 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Multimodal learning with transformers: A survey. arXiv preprint arXiv:2206.06488","author":"Zhang Qiang","year":"2022","unstructured":"Qiang Zhang, Yin Yang, Huan Ma, Richang Hong, and Liqiang Nie. 2022. Multimodal learning with transformers: A survey. arXiv preprint arXiv:2206.06488 (2022)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3066309"},{"key":"e_1_3_2_1_49_1","volume-title":"Handbook of Biometric Anti-Spoofing: Presentation Attack Detection and Vulnerability Assessment","author":"Zhang You","unstructured":"You Zhang, Fei Jiang, Ge Zhu, Xinhui Chen, and Zhiyao Duan. 2023. Generalizing Voice Presentation Attack Detection to Unseen Synthetic Attacks and Channel Variation. In Handbook of Biometric Anti-Spoofing: Presentation Attack Detection and Vulnerability Assessment. 
Springer Nature Singapore, 421-443."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 1st on Deepfake Forensics Workshop: Detection, Attribution, Recognition, and Adversarial Challenges in the Era of AI-Generated Media"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746265.3759675","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:51:13Z","timestamp":1764550273000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746265.3759675"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":49,"alternative-id":["10.1145\/3746265.3759675","10.1145\/3746265"],"URL":"https:\/\/doi.org\/10.1145\/3746265.3759675","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}