{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:25:31Z","timestamp":1775579131596,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Major Science and Technology Project of Xiamen (Industry and Information Technology Area)","award":["3502Z20231007"],"award-info":[{"award-number":["3502Z20231007"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680596","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"3294-3302","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["GROOT: Generating Robust Watermark for Diffusion-Model-Based Audio Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-6068-3448","authenticated-orcid":false,"given":"Weizhi","family":"Liu","sequence":"first","affiliation":[{"name":"Huaqiao University &amp; Xiamen Key Laboratory of Data Security and Blockchain Technology, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6425-9298","authenticated-orcid":false,"given":"Yue","family":"Li","sequence":"additional","affiliation":[{"name":"Huaqiao University &amp; Xiamen Key Laboratory of Data Security and Blockchain Technology, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-4376-3210","authenticated-orcid":false,"given":"Dongdong","family":"Lin","sequence":"additional","affiliation":[{"name":"Shenzhen University &amp; Guangdong Provincial Key Laboratory of Intelligent Information Processing, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1591-656X","authenticated-orcid":false,"given":"Hui","family":"Tian","sequence":"additional","affiliation":[{"name":"Huaqiao University &amp; Xiamen Key Laboratory of Data Security and Blockchain Technology, Xiamen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9158-9401","authenticated-orcid":false,"given":"Haizhou","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong (Shenzhen), Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Wavmark: Watermarking for audio generation. arXiv preprint arXiv:2308.12770","author":"Chen Guangyu","year":"2023","unstructured":"Guangyu Chen, Yu Wu, Shujie Liu, Tao Liu, Xiaoyong Du, and Furu Wei. 2023. Wavmark: Watermarking for audio generation. arXiv preprint arXiv:2308.12770 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"LightGrad: Lightweight Diffusion Probabilistic Model for Text-to-Speech. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing.","author":"Chen Jie","year":"2023","unstructured":"Jie Chen, Xingchen Song, Zhendong Peng, Binbin Zhang, Fuping Pan, and Zhiyong Wu. 2023. LightGrad: Lightweight Diffusion Probabilistic Model for Text-to-Speech. In ICASSP 2023--2023 IEEE International Conference on Acoustics, Speech and Signal Processing."},{"key":"e_1_3_2_1_3_1","volume-title":"Distribution-preserving steganography based on text-to-speech generative models","author":"Chen Kejiang","year":"2021","unstructured":"Kejiang Chen, Hang Zhou, Hanqing Zhao, Dongdong Chen, Weiming Zhang, and Nenghai Yu. 2021. Distribution-preserving steganography based on text-to-speech generative models. IEEE Transactions on Dependable and Secure Computing (2021)."},{"key":"e_1_3_2_1_4_1","volume-title":"WaveGrad: Estimating Gradients for Waveform Generation. In International Conference on Learning Representations.","author":"Chen Nanxin","year":"2020","unstructured":"Nanxin Chen, Yu Zhang, Heiga Zen, Ron J Weiss, Mohammad Norouzi, and William Chan. 2020. WaveGrad: Estimating Gradients for Waveform Generation. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_5_1","volume-title":"Attributable Watermarking of Speech Generative Models. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing.","author":"Cho Yongbaek","year":"2022","unstructured":"Yongbaek Cho, Changhoon Kim, Yezhou Yang, and Yi Ren. 2022. Attributable Watermarking of Speech Generative Models. In ICASSP 2022--2022 IEEE International Conference on Acoustics, Speech and Signal Processing."},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Machine Learning.","author":"Dauphin Yann N","year":"2017","unstructured":"Yann N Dauphin, Angela Fan, Michael Auli, and David Grangier. 2017. Language modeling with gated convolutional networks. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP46215.2023.10179287"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02053"},{"key":"e_1_3_2_1_9_1","volume-title":"Generative adversarial networks. Commun. ACM","author":"Goodfellow Ian","year":"2020","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2020. Generative adversarial networks. Commun. ACM (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems (2020)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10448502"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547855"},{"key":"e_1_3_2_1_13_1","unstructured":"Keith Ito. 2017. The lj speech dataset. https:\/\/keithito.com\/LJ-Speech-Dataset\/."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Learning Representations.","author":"Kingma Diederik P","year":"2015","unstructured":"Diederik P Kingma and Jimmy Ba. 2015. Adam: A method for stochastic optimization. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_17_1","volume-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in Neural Information Processing Systems (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"DiffWave: A Versatile Diffusion Model for Audio Synthesis. In International Conference on Learning Representations.","author":"Kong Zhifeng","year":"2020","unstructured":"Zhifeng Kong, Wei Ping, Jiaji Huang, Kexin Zhao, and Bryan Catanzaro. 2020. DiffWave: A Versatile Diffusion Model for Audio Synthesis. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_19_1","volume-title":"BDDM: Bilateral Denoising Diffusion Models for Fast and High-Quality Speech Synthesis. In International Conference on Learning Representations.","author":"Lam Max WY","year":"2021","unstructured":"Max WY Lam, Jun Wang, Dan Su, and Dong Yu. 2021. BDDM: Bilateral Denoising Diffusion Models for Fast and High-Quality Speech Synthesis. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Learning Representations.","author":"Kim Heeseung","year":"2022","unstructured":"Sang-gil Lee, Heeseung Kim, Chaehun Shin, Xu Tan, Chang Liu, Qi Meng, Tao Qin, Wei Chen, Sungroh Yoon, and Tie-Yan Liu. 2022. PriorGrad: Improving conditional denoising diffusion models with data-dependent adaptive prior. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","volume-title":"A Coverless Audio Steganography Based on Generative Adversarial Networks. Electronics","author":"Li Jing","year":"2023","unstructured":"Jing Li, Kaixi Wang, and Xiaozhu Jia. 2023. A Coverless Audio Steganography Based on Generative Adversarial Networks. Electronics (2023)."},{"key":"e_1_3_2_1_22_1","volume-title":"A survey of deep neural network watermarking techniques. Neurocomputing","author":"Li Yue","year":"2021","unstructured":"Yue Li, Hongxia Wang, and Mauro Barni. 2021. A survey of deep neural network watermarking techniques. Neurocomputing (2021)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TDSC.2024.3424900"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26550"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.14722\/ndss.2024.24200"},{"key":"e_1_3_2_1_26_1","volume-title":"GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing.","author":"Liu Haocheng","year":"2024","unstructured":"Haocheng Liu, Teysir Baoueb, Mathieu Fontaine, Jonathan Le Roux, and Gael Richard. 2024. GLA-Grad: A Griffin-Lim Extended Waveform Generation Diffusion Model. In ICASSP 2024--2024 IEEE International Conference on Acoustics, Speech and Signal Processing."},{"key":"e_1_3_2_1_27_1","volume-title":"Diffgan-tts: High-fidelity and efficient text-to-speech with denoising diffusion gans. arXiv preprint arXiv:2201.11972","author":"Liu Songxiang","year":"2022","unstructured":"Songxiang Liu, Dan Su, and Dong Yu. 2022. Diffgan-tts: High-fidelity and efficient text-to-speech with denoising diffusion gans. arXiv preprint arXiv:2201.11972 (2022)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00363"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_30_1","volume-title":"Robust speech watermarking by a jointly trained embedder and detector using a DNN. Digital Signal Processing","author":"Pavlovi\u0107 Kosta","year":"2022","unstructured":"Kosta Pavlovi\u0107, Slavko Kovavcevi\u0107, Igor Djurovi\u0107, and Adam Wojciechowski. 2022. Robust speech watermarking by a jointly trained embedder and detector using a DNN. Digital Signal Processing (2022)."},{"key":"e_1_3_2_1_31_1","volume-title":"International Conference on Machine Learning.","author":"Popov Vadim","year":"2021","unstructured":"Vadim Popov, Ivan Vovk, Vladimir Gogoryan, Tasnima Sadekova, and Mikhail Kudinov. 2021. Grad-tts: A diffusion probabilistic model for text-to-speech. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_32_1","volume-title":"Perceptual evaluation of speech quality (PESQ): An objective method for end-to-end speech quality assessment of narrow-band telephone networks and speech codecs. Rec. ITU-T P. 862","author":"Recommendation ITU-T","year":"2001","unstructured":"ITU-T Recommendation. 2001. Perceptual evaluation of speech quality (PESQ): An objective method for end-to-end speech quality assessment of narrow-band telephone networks and speech codecs. Rec. ITU-T P. 862 (2001)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proactive Detection of Voice Cloning with Localized Watermarking. In International Conference on Machine Learning.","author":"Roman Robin San","year":"2024","unstructured":"Robin San Roman, Pierre Fernandez, Alexandre D\u00e9fossez, Teddy Furon, Tuan Tran, and Hady Elsahar. 2024. Proactive Detection of Voice Cloning with Localized Watermarking. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_34_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations.","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising Diffusion Implicit Models. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_35_1","volume-title":"Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems","author":"Song Yang","year":"2019","unstructured":"Yang Song and Stefano Ermon. 2019. Generative modeling by estimating gradients of the data distribution. Advances in Neural Information Processing Systems (2019)."},{"key":"e_1_3_2_1_36_1","volume-title":"International Conference on Learning Representations.","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, and Ben Poole. 2020. Score-Based Generative Modeling through Stochastic Differential Equations. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2010.5495701"},{"key":"e_1_3_2_1_38_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE Transactions on Image Processing (2004)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612448"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01418"},{"key":"e_1_3_2_1_42_1","volume-title":"Responsible Disclosure of Generative Models Using Scalable Fingerprinting. In International Conference on Learning Representations.","author":"Yu Ning","year":"2022","unstructured":"Ning Yu, Vladislav Skripniuk, Dingfan Chen, Larry S Davis, and Mario Fritz. 2022. Responsible Disclosure of Generative Models Using Scalable Fingerprinting. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_43_1","volume-title":"LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech. Interspeech","author":"Zen Heiga","year":"2019","unstructured":"Heiga Zen, Viet Dang, Rob Clark, Yu Zhang, Ron J Weiss, Ye Jia, Zhifeng Chen, and Yonghui Wu. 2019. LibriTTS: A Corpus Derived from LibriSpeech for Text-to-Speech. Interspeech (2019)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3064850"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3543873.3587321"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680596","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680596","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:56Z","timestamp":1750295876000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680596"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":46,"alternative-id":["10.1145\/3664647.3680596","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680596","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}