{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T17:05:07Z","timestamp":1767114307724,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":17,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1145\/3714394.3756154","type":"proceedings-article","created":{"date-parts":[[2025,12,29]],"date-time":"2025-12-29T21:13:49Z","timestamp":1767042829000},"page":"656-661","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MagiaSVS: Singing Voice Synthesis with Lyrics and Pitch Guidance via a Unified-Modal Large Language Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1051-1862","authenticated-orcid":false,"given":"Hao","family":"Zhou","sequence":"first","affiliation":[{"name":"Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4765-2049","authenticated-orcid":false,"given":"Zhiyue","family":"Wu","sequence":"additional","affiliation":[{"name":"Independent, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7777-1178","authenticated-orcid":false,"given":"Xingjian","family":"Du","sequence":"additional","affiliation":[{"name":"Independent, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1320-3981","authenticated-orcid":false,"given":"Haining","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tianjin Key Laboratory of Software Experience and Human Computer Interaction, Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4554-2351","authenticated-orcid":false,"given":"Binhui","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Software, Nankai University, Tianjin, China and Innovation and Intelligent Design Center(I\u00b2DC), Nankai University, Tianjin, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,29]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Ziyi chen ju, Anirudh Kumar, Shaan Sapra, and Prateek Verma.","author":"Guan Tze","year":"2023","unstructured":"Tze cheng Guan, Zhi qiang Dang, Ziyi chen ju, Anirudh Kumar, Shaan Sapra, and Prateek Verma. 2023. Fish-Speech: A Scalable and High-Quality Text-to-Speech System with Multi-Speaker Support. arXiv:2311.01156 [cs.SD]"},{"unstructured":"Antoine D\u00e9fossez Jade Copet Gabriel Synnaeve and Yossi Adi. 2022. High Fidelity Neural Audio Compression. arXiv:2210.13438 [cs.SD]","key":"e_1_3_2_1_2_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_3_1","DOI":"10.1109\/TASLP.2021.3104165"},{"key":"e_1_3_2_1_4_1","first-page":"17022","article-title":"Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis","volume":"33","author":"Kong Jungil","year":"2020","unstructured":"Jungil Kong, Jaehyeon Kim, and Jaehyeon Bae. 2020. Hifi-gan: Generative adversarial networks for efficient and high fidelity speech synthesis. In Advances in Neural Information Processing Systems, Vol. 33. 17022-17033.","journal-title":"Advances in Neural Information Processing Systems"},{"unstructured":"Anirudh Kumar Zhi-Qiang Dang C. K. Anirvin Alejandro Luebs Minho Jin Shaan Sapra and Prateek Verma. 2023. High-Fidelity Audio Compression with Improved RVQGAN. arXiv:2306.06546 [cs.SD]","key":"e_1_3_2_1_5_1"},{"unstructured":"Jing-Zhi Liu Chen-Fei Zhang Zhen-Hui Lin Zhou-Zhao and Le-Ping You. 2022. DiffSinger: Singing Voice Synthesis via Shallow Diffusion Mechanism. arXiv:2205.09422 [cs.SD]","key":"e_1_3_2_1_6_1"},{"unstructured":"Yong-Zheng Lu Yi-Chen Liu Zong-Yao Li Xi-Wen Ding Song-Yang Shi and Lei Xie. 2023. VISinger2: A High-Fidelity Singing Voice Synthesis System with a Large-scale Multi-lingual Dataset. arXiv:2309.07327 [cs.SD]","key":"e_1_3_2_1_7_1"},{"key":"e_1_3_2_1_8_1","volume-title":"Singing voice data scaling-up: An introduction to ACE-Opencpop and ACE-KiSing. arXiv preprint arXiv:2401.17619","author":"Shi Jiatong","year":"2024","unstructured":"Jiatong Shi, Yueqian Lin, Xinyi Bai, Keyi Zhang, Yuning Wu, Yuxun Tang, Yifeng Yu, Qin Jin, and Shinji Watanabe. 2024. Singing voice data scaling-up: An introduction to ACE-Opencpop and ACE-KiSing. arXiv preprint arXiv:2401.17619 (2024)."},{"unstructured":"Qwen Team. 2024. Qwen2: The Glimpse of the Future. arXiv:2409.11718 [cs.CL]","key":"e_1_3_2_1_9_1"},{"key":"e_1_3_2_1_10_1","first-page":"5998","article-title":"Attention is All you Need","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems 30. 5998-6008.","journal-title":"Advances in Neural Information Processing Systems"},{"unstructured":"Chen-Zhi Wang Ziao-Fei Wang Yao-Fei Yu Hui-Bin Zhou Zhong-Qian Li Le-Ping You Chen-Fei Zhang Jiu-Ming Liu Long-Biao Wang He-Ping Zhang and Jian-Kang Wang. 2023. Neural Codec Language Models are Zero-Shot Text to Speech Synthesizers. arXiv:2301.02111 [cs.CL]","key":"e_1_3_2_1_11_1"},{"key":"e_1_3_2_1_12_1","volume-title":"ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 681-685","author":"Wang Yu","year":"2022","unstructured":"Yu Wang, Guang Chen, Yujun Wang, Zelin Wu, Enyan Liu, Hui Li, Pengcheng Wang, and Lu He. 2022. Opencpop: A high-quality open-source mandarin singing voice synthesis corpus. In ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 681-685."},{"key":"e_1_3_2_1_13_1","volume-title":"FireRedASR: Open-Source Industrial-Grade Mandarin Speech Recognition Models from Encoder-Decoder to LLM Integration. arXiv preprint arXiv:2501.14350","author":"Xu Kai-Tuo","year":"2025","unstructured":"Kai-Tuo Xu, Feng-Long Xie, Xu Tang, and Yao Hu. 2025. FireRedASR: Open-Source Industrial-Grade Mandarin Speech Recognition Models from Encoder-Decoder to LLM Integration. arXiv preprint arXiv:2501.14350 (2025)."},{"unstructured":"An Yang Anfeng Li Baosong Yang Beichen Zhang Binyuan Hui Bo Zheng Bowen Yu Chang Gao Chengen Huang Chenxu Lv et al. 2025. Qwen3 technical report. arXiv preprint arXiv:2505.09388 (2025).","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","volume-title":"Chao-Han Huck Yang, and Jian-Kang Wang","author":"Yu Yao-Fei","year":"2023","unstructured":"Yao-Fei Yu, Hong-Bin Zhou, Jia-Jun Deng, Si-Cheng Lv, Long-Biao Wang, He-Ping Zhang, Chao-Han Huck Yang, and Jian-Kang Wang. 2023. SpeechGPT: Empowering Large Language Models with Intrinsic Cross-Modal Conversational Abilities. arXiv:2305.11000 [cs.CL]"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1109\/TASLP.2021.3129994"},{"key":"e_1_3_2_1_17_1","volume-title":"Multi-Singer and Musical Score Provided Mandarin Singing Corpus. In Interspeech","author":"Zhang Ming-Quan","year":"2022","unstructured":"Ming-Quan Zhang, Yue Zhao, Kuan Feng, Ya-Zhe Qin, Tie-Yuan Lu, and Lu He. 2022. M4Singer: A Multi-Style, Multi-Singer and Musical Score Provided Mandarin Singing Corpus. In Interspeech 2022."}],"event":{"sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing","SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGSPATIAL ACM Special Interest Group on Spatial Information"],"name":"UbiComp '25:The 2025 ACM International Joint Conference on Pervasive and Ubiquitous Computing \/ ISWC ACM International Symposium on Wearable Computers","location":"Espoo Finland"},"container-title":["Companion of the 2025 ACM International Joint Conference on Pervasive and Ubiquitous Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3714394.3756154","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,30]],"date-time":"2025-12-30T17:00:57Z","timestamp":1767114057000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3714394.3756154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":17,"alternative-id":["10.1145\/3714394.3756154","10.1145\/3714394"],"URL":"https:\/\/doi.org\/10.1145\/3714394.3756154","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]},"assertion":[{"value":"2025-12-29","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}