{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,7]],"date-time":"2026-07-07T15:49:57Z","timestamp":1783439397376,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":41,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Guangdong Provincial Key Laboratory of Information Security Technology","award":["2023B1212060026"],"award-info":[{"award-number":["2023B1212060026"]}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U2001202, 62072480, U23A20305, 62172435"],"award-info":[{"award-number":["U2001202, 62072480, U23A20305, 62172435"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Open Research Project of the State Key Laboratory Media Convergence and Communication (Communication University of China)","award":["SKLMCC2022KF003"],"award-info":[{"award-number":["SKLMCC2022KF003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680585","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"7395-7403","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Coarse-to-Fine Proposal Refinement Framework for Audio Temporal Forgery Detection and Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2692-5928","authenticated-orcid":false,"given":"Junyan","family":"Wu","sequence":"first","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4068-1766","authenticated-orcid":false,"given":"Wei","family":"Lu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Sun Yat-sen University, Guangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3225-4649","authenticated-orcid":false,"given":"Xiangyang","family":"Luo","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Mathematical Engineering and Advanced Computing, Zhengzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7446-7216","authenticated-orcid":false,"given":"Rui","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8967-8525","authenticated-orcid":false,"given":"Qian","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Engineering, Wuhan University, Wuhan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7141-708X","authenticated-orcid":false,"given":"Xiaochun","family":"Cao","sequence":"additional","affiliation":[{"name":"School of Cyber Science and Technology, Shenzhen Campus of Sun Yat-sen University, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2020.3039045"},{"key":"e_1_3_2_1_2_1","volume-title":"Proceeding of the International Conference on Learning Representations.","author":"Binkowski Mikolaj","year":"2020","unstructured":"Mikolaj Binkowski, Jeff Donahue, Sander Dieleman, Aidan Clark, Erich Elsen, Norman Casagrande, Luis C. Cobo, and Karen Simonyan. 2020. High Fidelity Speech Synthesis with Adversarial Networks. In Proceeding of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the IEEE International Conference on Computer Vision. 5562--5570","author":"Bodla Navaneeth","unstructured":"Navaneeth Bodla, Bharat Singh, Rama Chellappa, and Larry S. Davis. 2017. Soft-NMS - Improving Object Detection with One Line of Code. In Proceedings of the IEEE International Conference on Computer Vision. 5562--5570."},{"key":"e_1_3_2_1_4_1","volume-title":"Munawar Hayat, Abhinav Dhall, and Kalin Stefanov.","author":"Cai Zhixi","year":"2023","unstructured":"Zhixi Cai, Shreya Ghosh, Aman Pankaj Adatia, Munawar Hayat, Abhinav Dhall, and Kalin Stefanov. 2023. AV-Deepfake1M: A Large-Scale LLM-Driven Audio-Visual Deepfake Dataset. arXiv preprint arXiv:2311.15308 (2023)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2023.103818"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.csl.2023.101597"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/DICTA56598.2022.10034605"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094774"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceeding of the International Conference on Learning Representations.","author":"Dinh Laurent","year":"2015","unstructured":"Laurent Dinh, David Krueger, and Yoshua Bengio. 2015. NICE: Non-linear Independent Components Estimation. In Proceeding of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_10_1","volume-title":"Proceeding of the International Conference on Learning Representations.","author":"Dinh Laurent","year":"2017","unstructured":"Laurent Dinh, Jascha Sohl-Dickstein, and Samy Bengio. 2017. Density estimation using Real NVP. In Proceeding of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_11_1","unstructured":"Ian J. Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron C. Courville and Yoshua Bengio. 2014. Generative Adversarial Nets. In Advances in Neural Information Processing Systems. 2672--2680."},{"key":"e_1_3_2_1_12_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In Advances in Neural Information Processing Systems, Vol. 33. 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceeding of the IEEE International Conference on Acoustics, Speech and Signal Processing. 6367--6371","author":"Heo Hee-Soo","unstructured":"Jee-weon Jung, Hee-Soo Heo, Hemlata Tak, Hye-jin Shim, Joon Son Chung, Bong-Jin Lee, Ha-Jin Yu, and Nicholas W. D. Evans. 2022. AASIST: Audio Anti-Spoofing Using Integrated Spectro-Temporal Graph Attention Networks. In Proceeding of the IEEE International Conference on Acoustics, Speech and Signal Processing. 6367--6371."},{"key":"e_1_3_2_1_14_1","volume-title":"Proceeding of the International Conference on Machine Learning","volume":"162","author":"Kim Heeseung","year":"2022","unstructured":"Heeseung Kim, Sungwon Kim, and Sungroh Yoon. 2022. Guided-TTS: A Diffusion Model for Text-to-Speech via Classifier Guidance. In Proceeding of the International Conference on Machine Learning, Vol. 162. 11119--11133."},{"key":"e_1_3_2_1_15_1","first-page":"8067","article-title":"Glow-TTS: A Generative Flow for Text-to-Speech via Monotonic Alignment Search","volume":"33","author":"Kim Jaehyeon","year":"2020","unstructured":"Jaehyeon Kim, Sungwon Kim, Jungil Kong, and Sungroh Yoon. 2020. Glow-TTS: A Generative Flow for Text-to-Speech via Monotonic Alignment Search. In Advances in Neural Information Processing Systems, Vol. 33. 8067--8077.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceeding of the International Conference on Machine Learning","volume":"139","author":"Kim Jaehyeon","year":"2021","unstructured":"Jaehyeon Kim, Jungil Kong, and Juhee Son. 2021. Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech. In Proceeding of the International Conference on Machine Learning, Vol. 139. 5530--5540."},{"key":"e_1_3_2_1_17_1","first-page":"10236","article-title":"Glow: Generative Flow with Invertible 1x1 Convolutions","volume":"31","author":"Kingma Diederik P.","year":"2018","unstructured":"Diederik P. Kingma and Prafulla Dhariwal. 2018. Glow: Generative Flow with Invertible 1x1 Convolutions. In Advances in Neural Information Processing Systems, Vol. 31. 10236--10245.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the Workshop on Deepfake Audio Detection and Analysis co-located with 32th International Joint Conference on Artificial Intelligence","volume":"3597","author":"Li Jun","year":"2023","unstructured":"Jun Li, Lin Li, Mengjie Luo, Xiaoqin Wang, Shushan Qiao, and Yumei Zhou. 2023. Multi-grained Backend Fusion for Manipulation Region Location of Partially Fake Audio. In Proceedings of the Workshop on Deepfake Audio Detection and Analysis co-located with 32th International Joint Conference on Artificial Intelligence, Vol. 3597. 43--48."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"e_1_3_2_1_20_1","first-page":"9204","article-title":"Pay Attention to MLPs","volume":"34","author":"Liu Hanxiao","year":"2021","unstructured":"Hanxiao Liu, Zihang Dai, David R. So, and Quoc V. Le Le. 2021. Pay Attention to MLPs. Advances in Neural Information Processing Systems, Vol. 34, 9204--9215.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","volume-title":"Audio-Visual Temporal Forgery Detection Using Embedding-Level Fusion and Multi-Dimensional Contrastive Loss","author":"Liu Miao","year":"2023","unstructured":"Miao Liu, Jing Wang, Xinyuan Qian, and Haizhou Li. 2023. Audio-Visual Temporal Forgery Detection Using Embedding-Level Fusion and Multi-Dimensional Contrastive Loss. IEEE Transactions on Circuits and Systems for Video Technology (2023), 1--1."},{"key":"e_1_3_2_1_22_1","volume-title":"Proceeding of the International Conference on Learning Representations.","author":"Ren Yi","year":"2021","unstructured":"Yi Ren, Chenxu Hu, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, and Tie-Yan Liu. 2021. FastSpeech 2: Fast and High-Quality End-to-End Text to Speech. In Proceeding of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_23_1","unstructured":"Yi Ren Yangjun Ruan Xu Tan Tao Qin Sheng Zhao Zhou Zhao and Tie-Yan Liu. 2019. FastSpeech: Fast Robust and Controllable Text to Speech. In Advances in Neural Information Processing Systems. 3165--3174."},{"key":"e_1_3_2_1_24_1","volume-title":"Proceeding of the International Conference on Learning Representations.","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising diffusion implicit models. In Proceeding of the International Conference on Learning Representations."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16363"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414234"},{"key":"e_1_3_2_1_27_1","first-page":"2579","article-title":"Visualizing Data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing Data using t-SNE. Journal of Machine Learning Research, Vol. 9, 86 (2008), 2579--2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552466.3556530"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Xin Wang and Junichi Yamagishi. 2021. A Comparative Study on Recent Neural Spoofing Countermeasures for Synthetic Speech Detection. In Proceeding of the Interspeech. 4259--4263.","DOI":"10.21437\/Interspeech.2021-702"},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the Workshop on Deepfake Audio Detection and Analysis co-located with 32th International Joint Conference on Artificial Intelligence","volume":"3597","author":"Wu Haibin","year":"2023","unstructured":"Haibin Wu, Jiawen Kang, Lingwei Meng, Helen Meng, and Hung-yi Lee. 2023. The defender's perspective on automatic speaker verification: An overview. Proceedings of the Workshop on Deepfake Audio Detection and Analysis co-located with 32th International Joint Conference on Artificial Intelligence, Vol. 3597 (2023), 6--11."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746162"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2023.3324724"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448196"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-930"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956134"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3233236"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Lin Zhang Xin Wang Erica Cooper Junichi Yamagishi Jose Patino and Nicholas Evans. 2021. An Initial Investigation for Detecting Partially Spoofed Audio. In Proceeding of the Interspeech. 4264--4268.","DOI":"10.21437\/Interspeech.2021-738"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613767"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01453"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3616540"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680585","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680585","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680585"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":41,"alternative-id":["10.1145\/3664647.3680585","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680585","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}