{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:45:13Z","timestamp":1776887113583,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755416","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:38:54Z","timestamp":1761377934000},"page":"8293-8302","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["BAPEN: Towards Versatile Audio Phase Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7973-2080","authenticated-orcid":false,"given":"Lingling","family":"Dai","sequence":"first","affiliation":[{"name":"Institute of Acoustics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4094-8448","authenticated-orcid":false,"given":"Andong","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Acoustics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9048-2565","authenticated-orcid":false,"given":"Zhe","family":"Han","sequence":"additional","affiliation":[{"name":"ByteDance China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5656-994X","authenticated-orcid":false,"given":"Chengshi","family":"Zheng","sequence":"additional","affiliation":[{"name":"Institute of Acoustics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4170-0076","authenticated-orcid":false,"given":"Xiaodong","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Acoustics, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3417347"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3277276"},{"key":"e_1_3_2_2_3_1","first-page":"1","article-title":"Neural speech phase prediction based on parallel estimation architecture and anti-wrapping losses","author":"Ai Yang","year":"2023","unstructured":"Yang Ai and Zhen-Hua Ling. 2023b. Neural speech phase prediction based on parallel estimation architecture and anti-wrapping losses. In Proc. ICASSP. 1-5.","journal-title":"Proc. ICASSP."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3385285"},{"key":"e_1_3_2_2_5_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. 2016. Layer normalization. arXiv preprint arXiv:1607.06450 (2016)."},{"key":"e_1_3_2_2_6_1","first-page":"2776","article-title":"Hi-fi multi-speaker english tts dataset","author":"Bakhturina Evelina","year":"2021","unstructured":"Evelina Bakhturina, Vitaly Lavrukhin, Boris Ginsburg, and Yang Zhang. 2021. Hi-fi multi-speaker english tts dataset. In Proc. Interspeech. 2776-2780.","journal-title":"Proc. Interspeech."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2015.7251907"},{"key":"e_1_3_2_2_8_1","first-page":"527","article-title":"Adaptive neural networks for efficient inference","author":"Bolukbasi Tolga","year":"2017","unstructured":"Tolga Bolukbasi, Joseph Wang, Ofer Dekel, and Venkatesh Saligrama. 2017. Adaptive neural networks for efficient inference. In Proc. ICML. 527-536.","journal-title":"Proc. ICML."},{"key":"e_1_3_2_2_9_1","volume-title":"Proc. ICLR.","author":"Choi Hyeong-Seok","year":"2018","unstructured":"Hyeong-Seok Choi, Jang-Hyun Kim, Jaesung Huh, Adrian Kim, Jung-Woo Ha, and Kyogu Lee. 2018. Phase-aware speech enhancement with deep complex u-net. In Proc. ICLR."},{"key":"e_1_3_2_2_10_1","volume-title":"Proc. ISMIR.","author":"Defferrard Micha\u00ebl","year":"2017","unstructured":"Micha\u00ebl Defferrard, Kirell Benzi, Pierre Vandergheynst, and Xavier Bresson. 2017. FMA: A Dataset for Music Analysis. In Proc. ISMIR."},{"key":"e_1_3_2_2_11_1","first-page":"736","article-title":"Clotho: an Audio Captioning Dataset","author":"Drossos Konstantinos","year":"2020","unstructured":"Konstantinos Drossos, Samuel Lipping, and Tuomas Virtanen. 2020. Clotho: an Audio Captioning Dataset. In Proc. ICASSP. 736-740.","journal-title":"Proc. ICASSP."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Eduardo Fonseca Xavier Favory Jordi Pons Frederic Font and Xavier Serra. 2022. FSD50K: An Open Dataset of Human-Labeled Sound Events. In arxiv:2010.00475.","DOI":"10.1109\/TASLP.2021.3133208"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2007.907330"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-015-0054-9"},{"key":"e_1_3_2_2_16_1","volume-title":"Conv2former: A simple transformer-style convnet for visual recognition","author":"Hou Qibin","year":"2024","unstructured":"Qibin Hou, Cheng-Ze Lu, Ming-Ming Cheng, and Jiashi Feng. 2024. Conv2former: A simple transformer-style convnet for visual recognition. IEEE transactions on pattern analysis and machine intelligence (2024)."},{"key":"e_1_3_2_2_17_1","first-page":"6212","article-title":"Acoustic Application of Phase Reconstruction Algorithms in Optics","author":"Kobayashi Tomoki","year":"2022","unstructured":"Tomoki Kobayashi, Tomoro Tanaka, Kohei Yatabe, and Yasuhiro Oikawa. 2022. Acoustic Application of Phase Reconstruction Algorithms in Optics. In Proc. ICASSP. 6212-6216.","journal-title":"Proc. ICASSP."},{"key":"e_1_3_2_2_18_1","volume-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaekyoung Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. Advances in neural information processing systems, Vol. 33 (2020), 17022-17033."},{"key":"e_1_3_2_2_19_1","volume-title":"Sabato Marco Siniscalchi, and Chin-Hui Lee","author":"Ku Pin-Jui","year":"2024","unstructured":"Pin-Jui Ku, Chun-Wei Ho, Hao Yen, Sabato Marco Siniscalchi, and Chin-Hui Lee. 2024. An Explicit Consistency-Preserving Loss Function for Phase Reconstruction and Speech Enhancement. In arXiv:2409.16282."},{"key":"e_1_3_2_2_20_1","first-page":"27980","article-title":"High-fidelity audio compression with improved rvqgan","volume":"36","author":"Kumar Rithesh","year":"2023","unstructured":"Rithesh Kumar, Prem Seetharaman, Alejandro Luebs, Ishaan Kumar, and Kundan Kumar. 2023. High-fidelity audio compression with improved rvqgan. Advances in Neural Information Processing Systems, Vol. 36 (2023), 27980-27993.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3079813"},{"key":"e_1_3_2_2_22_1","unstructured":"Andong Li Zhihang Sun Fengyuan Hao Xiaodong Li and Chengshi Zheng. 2025. Neural Vocoders as Speech Enhancers. In arXiv:2501.13465."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3313442"},{"key":"e_1_3_2_2_24_1","first-page":"638","article-title":"Stage-Wise and Prior-Aware Neural Speech Phase Prediction","author":"Liu Fei","year":"2024","unstructured":"Fei Liu, Yang Ai, Hui-Peng Du, Ye-Xin Lu, Rui-Chen Zheng, and Zhen-Hua Ling. 2024. Stage-Wise and Prior-Aware Neural Speech Phase Prediction. In Proc. SLT. 638-644.","journal-title":"Proc. SLT."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3271145"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2020.3034486"},{"key":"e_1_3_2_2_27_1","volume-title":"Audio Engineering Society Convention 126","author":"Nagel Frederik","year":"2009","unstructured":"Frederik Nagel, Sascha Disch, and Nikolaus Rettelbach. 2009. A phase vocoder driven bandwidth extension method with novel transient handling for audio codecs. In Audio Engineering Society Convention 126. Audio Engineering Society."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095396"},{"key":"e_1_3_2_2_29_1","volume-title":"Moisesdb: A dataset for source separation beyond 4-stems. In arxiv:2307.15913.","author":"Pereira Igor","year":"2023","unstructured":"Igor Pereira, Felipe Ara\u00fajo, Filip Korzeniowski, and Richard Vogl. 2023. Moisesdb: A dataset for source separation beyond 4-stems. In arxiv:2307.15913."},{"key":"e_1_3_2_2_30_1","first-page":"1","article-title":"A fast Griffin-Lim algorithm","author":"Perraudin Nathana\u00ebl","year":"2013","unstructured":"Nathana\u00ebl Perraudin, Peter Balazs, and Peter L. S\u00f8ndergaard. 2013. A fast Griffin-Lim algorithm. In Proc. WASPAA. 1-4.","journal-title":"Proc. WASPAA."},{"key":"e_1_3_2_2_31_1","volume-title":"Dara Dabiri, Hiroshi Tokuda, Wataru Hariya, Koji Oishi, and Xavier Serra.","author":"Picas Oriol Romani","year":"2015","unstructured":"Oriol Romani Picas, Hector Parra Rodriguez, Dara Dabiri, Hiroshi Tokuda, Wataru Hariya, Koji Oishi, and Xavier Serra. 2015. A real-time system for measuring sound goodness in instrumental sounds. Journal of The Audio Engineering Society (2015)."},{"key":"e_1_3_2_2_32_1","first-page":"1015","article-title":"ESC","author":"Piczak Karol J.","year":"2015","unstructured":"Karol J. Piczak. 2015. ESC: Dataset for Environmental Sound Classification. In Proc. ACM MM. 1015-1018.","journal-title":"Dataset for Environmental Sound Classification. In Proc. ACM MM."},{"key":"e_1_3_2_2_33_1","first-page":"17","article-title":"Real-time spectrogram inversion using phase gradient heap integration","author":"Pru\u0161a Zdenek","year":"2016","unstructured":"Zdenek Pru\u0161a and Peter L S\u00f8ndergaard. 2016. Real-time spectrogram inversion using phase gradient heap integration. In Proc. DAFx. 17-21.","journal-title":"Proc. DAFx."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2017.2678166"},{"key":"e_1_3_2_2_35_1","volume-title":"Stylianos Ioannis Mimilakis, and Rachel Bittner","author":"Rafii Zafar","year":"2019","unstructured":"Zafar Rafii, Antoine Liutkus, Fabian-Robert St\u00f6ter, Stylianos Ioannis Mimilakis, and Rachel Bittner. 2019. MUSDB18-HQ - an uncompressed version of MUSDB18."},{"key":"e_1_3_2_2_36_1","first-page":"4873","article-title":"EARS","author":"Richter Julius","year":"2024","unstructured":"Julius Richter, Yi-Chiao Wu, Steven Krenn, Simon Welker, Bunlong Lay, Shinjii Watanabe, Alexander Richard, and Timo Gerkmann. 2024. EARS: An Anechoic Fullband Speech Dataset Benchmarked for Speech Enhancement and Dereverberation. In Proc. Interspeech. 4873-4877.","journal-title":"In Proc. Interspeech."},{"key":"e_1_3_2_2_37_1","first-page":"4521","article-title":"UTMOS: UTokyo-SaruLab System for VoiceMOS Challenge 2022","author":"Saeki Takaaki","year":"2022","unstructured":"Takaaki Saeki, Detai Xin, Wataru Nakata, Tomoki Koriyama, Shinnosuke Takamichi, and Hiroshi Saruwatari. 2022. UTMOS: UTokyo-SaruLab System for VoiceMOS Challenge 2022. In Proc. Interspeech. 4521-4525.","journal-title":"Proc. Interspeech."},{"key":"e_1_3_2_2_38_1","volume-title":"Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis. In arXiv:2306.00814.","author":"Siuzdak Hubert","year":"2023","unstructured":"Hubert Siuzdak. 2023. Vocos: Closing the gap between time-domain and Fourier-based neural vocoders for high-quality audio synthesis. In arXiv:2306.00814."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"e_1_3_2_2_40_1","first-page":"286","article-title":"Phase Reconstruction from Amplitude Spectrograms Based on Von-Mises-Distribution Deep Neural Network","author":"Takamichi Shinnosuke","year":"2018","unstructured":"Shinnosuke Takamichi, Yuki Saito, Norihiro Takamune, Daichi Kitamura, and Hiroshi Saruwatari. 2018. Phase Reconstruction from Amplitude Spectrograms Based on Von-Mises-Distribution Deep Neural Network. In Proc. IWAENC. 286-290.","journal-title":"Proc. IWAENC."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3268577"},{"key":"e_1_3_2_2_42_1","first-page":"409","article-title":"Skipnet: Learning dynamic routing in convolutional networks","author":"Wang Xin","year":"2018","unstructured":"Xin Wang, Fisher Yu, Zi-Yi Dou, Trevor Darrell, and Joseph E Gonzalez. 2018. Skipnet: Learning dynamic routing in convolutional networks. In Proc. ECCV. 409-424.","journal-title":"Proc. ECCV."},{"key":"e_1_3_2_2_43_1","first-page":"1","article-title":"TF-GridNet: Making time-frequency domain models great again for monaural speaker separation","author":"Wang Zhong-Qiu","year":"2023","unstructured":"Zhong-Qiu Wang, Samuele Cornell, Shukjae Choi, Younglo Lee, Byeong-Yeol Kim, and Shinji Watanabe. 2023. TF-GridNet: Making time-frequency domain models great again for monaural speaker separation. In Proc. ICASSP. 1-5.","journal-title":"Proc. ICASSP."},{"key":"e_1_3_2_2_44_1","first-page":"396","article-title":"Phase reconstruction with learned time-frequency representations for single-channel speech separation","author":"Wichern Gordon","year":"2018","unstructured":"Gordon Wichern and Jonathan Le Roux. 2018. Phase reconstruction with learned time-frequency representations for single-channel speech separation. In Proc. IWAENC. IEEE, 396-400.","journal-title":"Proc. IWAENC. IEEE"},{"key":"e_1_3_2_2_45_1","first-page":"16133","article-title":"Convnext v2: Co-designing and scaling convnets with masked autoencoders","author":"Woo Sanghyun","year":"2023","unstructured":"Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, and Saining Xie. 2023. Convnext v2: Co-designing and scaling convnets with masked autoencoders. In Proc. CVPR. 16133-16142.","journal-title":"Proc. CVPR."},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/SLT61566.2024.10832364"},{"key":"e_1_3_2_2_47_1","volume-title":"CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit (version 0.92)","author":"Yamagishi Junichi","unstructured":"Junichi Yamagishi, Christophe Veaux, and Kirsten MacDonald. 2019. CSTR VCTK Corpus: English Multi-speaker Corpus for CSTR Voice Cloning Toolkit (version 0.92). University of Edinburgh. The Centre for Speech Technology Research (CSTR) (2019)."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6489"},{"key":"e_1_3_2_2_49_1","first-page":"571","article-title":"BAE-Net: A low complexity and high fidelity bandwidth-adaptive neural network for speech super-resolution","author":"Yu Guanghui","year":"2024","unstructured":"Guanghui Yu, Xin Zheng, Ning Li, Ruoxu Han, Chen Zheng, Chen Zhang, Cheng Zhou, Qing Huang, and Bin Yu. 2024. BAE-Net: A low complexity and high fidelity bandwidth-adaptive neural network for speech super-resolution. In Proc. ICASSP. IEEE, 571-575.","journal-title":"Proc. ICASSP. IEEE"},{"key":"e_1_3_2_2_50_1","first-page":"2483","article-title":"High Fidelity Speech Enhancement with Band-split RNN","author":"Yu Jianwei","year":"2023","unstructured":"Jianwei Yu, Hangting Chen, Yi Luo, Rongzhi Gu, and Chao Weng. 2023. High Fidelity Speech Enhancement with Band-split RNN. In Proc. Interspeech. 2483-2487.","journal-title":"Proc. Interspeech."},{"key":"e_1_3_2_2_51_1","first-page":"1","article-title":"Efficient Monaural Speech Enhancement with Universal Sample Rate Band-Split RNN","author":"Yu Jianwei","year":"2023","unstructured":"Jianwei Yu and Yi Luo. 2023. Efficient Monaural Speech Enhancement with Universal Sample Rate Band-Split RNN. In Proc. ICASSP. 1-5.","journal-title":"Proc. ICASSP."},{"key":"e_1_3_2_2_52_1","first-page":"1026","article-title":"Unrestricted global phase bias-aware single-channel speech enhancement with conformer-based metric gan","author":"Zhang Shiqi","year":"2024","unstructured":"Shiqi Zhang, Zheng Qiu, Daiki Takeuchi, Noboru Harada, and Shoji Makino. 2024a. Unrestricted global phase bias-aware single-channel speech enhancement with conformer-based metric gan. In Proc. ICASSP. 1026-1030.","journal-title":"Proc. ICASSP."},{"key":"e_1_3_2_2_53_1","first-page":"1","article-title":"Toward universal speech enhancement for diverse input conditions","author":"Zhang Wangyou","year":"2023","unstructured":"Wangyou Zhang, Kohei Saijo, Zhong-Qiu Wang, Shinji Watanabe, and Yanmin Qian. 2023. Toward universal speech enhancement for diverse input conditions. In Proc. ASRU. 1-6.","journal-title":"Proc. ASRU."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-1239"},{"key":"e_1_3_2_2_55_1","first-page":"229","article-title":"Real-time iterative spectrum inversion with look-ahead","author":"Zhu Xinglei","year":"2006","unstructured":"Xinglei Zhu, Gerald T Beauregard, and Lonce Wyse. 2006. Real-time iterative spectrum inversion with look-ahead. In Proc. ICME. 229-232.","journal-title":"Proc. ICME."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755416","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:11:05Z","timestamp":1765339865000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755416"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":55,"alternative-id":["10.1145\/3746027.3755416","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755416","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}