{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:26:35Z","timestamp":1765506395367,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"name":"National Research Foundation of Korea (NRF)","award":["RS-2024-00453301"],"award-info":[{"award-number":["RS-2024-00453301"]}]},{"name":"Institute of Information & Communications Technology Planning & Evaluation (IITP)","award":["RS-2019-II191906"],"award-info":[{"award-number":["RS-2019-II191906"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,10]]},"DOI":"10.1145\/3746252.3761614","type":"proceedings-article","created":{"date-parts":[[2025,11,8]],"date-time":"2025-11-08T00:52:37Z","timestamp":1762563157000},"page":"6492-6497","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["S2Cap: A Benchmark and a Baseline for Singing Style Captioning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-7204-7350","authenticated-orcid":false,"given":"Hyunjong","family":"Ok","sequence":"first","affiliation":[{"name":"Pohang University of Science and Technology, Pohang, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1349-8595","authenticated-orcid":false,"given":"Jaeho","family":"Lee","sequence":"additional","affiliation":[{"name":"Pohang University of Science and Technology, Pohang, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2025,11,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325","author":"Agostinelli A.","year":"2023","unstructured":"A. Agostinelli, T. I Denk, Z. Borsos, J. Engel, M. Verzetti, A. Caillon, Q. Huang, A. Jansen, A. Roberts, M. Tagliasacchi, et al., 2023. Musiclm: Generating music from text. arXiv preprint arXiv:2301.11325 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Spice: Semantic propositional image caption evaluation. In ECCV.","author":"Anderson P.","year":"2016","unstructured":"P. Anderson, B. Fernando, M. Johnson, and S. Gould. 2016. Spice: Semantic propositional image caption evaluation. In ECCV."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"A. Ando T. Moriya S. Horiguchi and R. Masumura. 2024. Factor-Conditioned Speaking-Style Captioning. arXiv preprint arXiv:2406.18910 (2024).","DOI":"10.21437\/Interspeech.2024-633"},{"volume-title":"METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACL Workshop.","author":"Banerjee S.","key":"e_1_3_2_1_4_1","unstructured":"S. Banerjee and A. Lavie. 2005. METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments. In ACL Workshop."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"S. Bhosale R. Chakraborty and S. Kopparapu. 2023. A Novel Metric For Evaluating Audio Caption Similarity. In ICASSP. doi:10.1109\/ICASSP49357.2023.10096526","DOI":"10.1109\/ICASSP49357.2023.10096526"},{"key":"e_1_3_2_1_6_1","unstructured":"D. Chen and W. B Dolan. 2011. Collecting highly parallel data for paraphrase evaluation. In ACL."},{"key":"e_1_3_2_1_7_1","unstructured":"X. Chen H. Fang T.-Y. Lin R. Vedantam S. Gupta P. Doll\u00e1r and C L. Zitnick. 2015. Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)."},{"key":"e_1_3_2_1_8_1","unstructured":"Y. Chu J. Xu Q. Yang H. Wei X. Wei Z. Guo Y. Leng Y. Lv J. He J. Lin et al. 2024. Qwen2-audio technical report. arXiv preprint arXiv:2407.10759 (2024)."},{"key":"e_1_3_2_1_9_1","unstructured":"S. Doh K. Choi J. Lee and J. Nam. 2023. LP-MusicCaps: LLM-Based Pseudo Music Captioning. In ISMIR."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPA.2013.6694316"},{"key":"e_1_3_2_1_12_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"A. Ferraro Y. Kim S. Lee B. Kim N. Jo S. Lim S. Lim J. Jang S. Kim X. Serra et al. 2021. Melon playlist dataset: A public dataset for audio-based playlist generation and music tagging. In ICASSP.","DOI":"10.1109\/ICASSP39728.2021.9413552"},{"key":"e_1_3_2_1_14_1","volume-title":"Prompttts: Controllable text-to-speech with text descriptions. In ICASSP.","author":"Guo Z.","year":"2023","unstructured":"Z. Guo, Y. Leng, Y. Wu, S. Zhao, and X. Tan. 2023. Prompttts: Controllable text-to-speech with text descriptions. In ICASSP."},{"key":"e_1_3_2_1_15_1","volume-title":"Hubert: Self-supervised speech representation learning by masked prediction of hidden units","author":"Hsu W.-N.","year":"2021","unstructured":"W.-N. Hsu, B. Bolte, Y.-H. H. Tsai, K. Lakhotia, R. Salakhutdinov, and A. Mohamed. 2021. Hubert: Self-supervised speech representation learning by masked prediction of hidden units. IEEE\/ACM TASLP (2021)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475437"},{"key":"e_1_3_2_1_17_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_18_1","volume-title":"Audiocaps: Generating captions for audios in the wild. In NAACL.","author":"Kim C. D.","year":"2019","unstructured":"C. D. Kim, B. Kim, H. Lee, and G. Kim. 2019. Audiocaps: Generating captions for audios in the wild. In NAACL."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"M. Kim K. Sung-Bin and T.-H. Oh. 2023. Prefix tuning for automated audio captioning. In ICASSP.","DOI":"10.1109\/ICASSP49357.2023.10096877"},{"key":"e_1_3_2_1_20_1","volume-title":"Approximating human judgment of generated image quality. arXiv preprint arXiv:1912.12121","author":"Kolchinski Y Alex","year":"2019","unstructured":"Y Alex Kolchinski, Sharon Zhou, Shengjia Zhao, Mitchell Gordon, and Stefano Ermon. 2019. Approximating human judgment of generated image quality. arXiv preprint arXiv:1912.12121 (2019)."},{"key":"e_1_3_2_1_21_1","volume-title":"K. Song, L. He, X. Li, s. zhao, T. Qin, and J. Bian.","author":"Leng Y.","year":"2024","unstructured":"Y. Leng, Z. Guo, K. Shen, Z. Ju, X. Tan, E. Liu, Y. Liu, D. Yang, l. zhang, K. Song, L. He, X. Li, s. zhao, T. Qin, and J. Bian. 2024. PromptTTS 2: Describing and Generating Voices with Text Prompt. In ICLR. https:\/\/openreview.net\/forum?id=NsCXDyv2Bn"},{"key":"e_1_3_2_1_22_1","volume-title":"MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. In ICLR. https:\/\/openreview.net\/forum?id=w3YZ9MSlBu","author":"Yuan Y. LI, R.","year":"2024","unstructured":"Y. LI, R. Yuan, G. Zhang, Y. Ma, X. Chen, H. Yin, C. Xiao, C. Lin, A. Ragni, E. Benetos, N. Gyenge, R. Dannenberg, R. Liu, W. Chen, G. Xia, Y. Shi, W. Huang, Z. Wang, Y. Guo, and J. Fu. 2024. MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. In ICLR. https:\/\/openreview.net\/forum?id=w3YZ9MSlBu"},{"key":"e_1_3_2_1_23_1","volume-title":"ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. https:\/\/aclanthology.org\/W04-1013","author":"Lin C.-Y.","year":"2004","unstructured":"C.-Y. Lin. 2004. ROUGE: A Package for Automatic Evaluation of Summaries. In Text Summarization Branches Out. https:\/\/aclanthology.org\/W04-1013"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"S. Liu Z. Zhu N. Ye S. Guadarrama and K. Murphy. 2017. Improved image captioning via policy gradient optimization of spider. In ICCV.","DOI":"10.1109\/ICCV.2017.100"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3271145"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Y.-J. Luo C.-C. Hsu K. Agres and D. Herremans. 2020. Singing voice conversion with disentangled representations of singer and vocal technique using variational autoencoders. In ICASSP.","DOI":"10.1109\/ICASSP40776.2020.9054582"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_28_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1-67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","unstructured":"N. Reimers and I. Gurevych. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. In EMNLP-IJCNLP. doi:10.18653\/v1\/D19-1410","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0987-1"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"S. Rouard F. Massa and A. D\u00e9fossez. 2023. Hybrid transformers for music source separation. In ICASSP.","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_1_33_1","volume-title":"Cider: Consensus-based image description evaluation. In CVPR.","author":"Vedantam R.","year":"2015","unstructured":"R. Vedantam, C. Lawrence Zitnick, and D. Parikh. 2015. Cider: Consensus-based image description evaluation. In CVPR."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Y. Wang R. Hu R. Huang Z. Hong R. Li W. Liu F. You T. Jin and Z. Zhao. 2024. Prompt-Singer: Controllable Singing-Voice-Synthesis with Natural Language Prompt. In NAACL.","DOI":"10.18653\/v1\/2024.naacl-long.268"},{"key":"e_1_3_2_1_35_1","unstructured":"Y. Wang D. Stanton Y. Zhang R.-S. Ryan E. Battenberg J. Shor Y. Xiao Y. Jia F. Ren and R. A Saurous. 2018. Style tokens: Unsupervised style modeling control and transfer in end-to-end speech synthesis. In ICML."},{"key":"e_1_3_2_1_36_1","volume-title":"Opencpop: A high-quality open source chinese popular song corpus for singing voice synthesis. arXiv preprint arXiv:2201.07429","author":"Wang Yu","year":"2022","unstructured":"Yu Wang, Xinsheng Wang, Pengcheng Zhu, Jie Wu, Hanzhao Li, Heyang Xue, Yongmao Zhang, Lei Xie, and Mengxiao Bi. 2022. Opencpop: A high-quality open source chinese popular song corpus for singing voice synthesis. arXiv preprint arXiv:2201.07429 (2022)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"K. Yamauchi Y. Ijima and Y. Saito. 2024. StyleCap: Automatic Speaking-Style Captioning from Speech Based on Speech and Language Self-supervised Learning Models. In ICASSP.","DOI":"10.1109\/ICASSP48485.2024.10445977"},{"key":"e_1_3_2_1_39_1","unstructured":"A. Yang B. Yang B. Hui B. Zheng B. Yu C. Zhou C. Li C. Li D. Liu F. Huang et al. 2024c. Qwen2 technical report. arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_2_1_40_1","volume-title":"Instructtts: Modelling expressive tts in discrete latent space with natural language style prompt","author":"Yang D.","year":"2024","unstructured":"D. Yang, S. Liu, R. Huang, C. Weng, and H. Meng. 2024a. Instructtts: Modelling expressive tts in discrete latent space with natural language style prompt. IEEE\/ACM TASLP (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Air-bench: Benchmarking large audio-language models via generative comprehension. arXiv preprint arXiv:2402.07729","author":"Yang Qian","year":"2024","unstructured":"Qian Yang, Jin Xu, Wenrui Liu, Yunfei Chu, Ziyue Jiang, Xiaohuan Zhou, Yichong Leng, Yuanjun Lv, Zhou Zhao, Chang Zhou, et al., 2024b. Air-bench: Benchmarking large audio-language models via generative comprehension. arXiv preprint arXiv:2402.07729 (2024)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10445804"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_1_44_1","volume-title":"Multi-Singer and Musical Score Provided Mandarin Singing Corpus. In Advances in Neural Information Processing Systems","author":"Zhang Lichao","year":"2022","unstructured":"Lichao Zhang, Ruiqi Li, Shoutong Wang, Liqun Deng, Jinglin Liu, Yi Ren, Jinzheng He, Rongjie Huang, Jieming Zhu, Xiao Chen, and Zhou Zhao. 2022. M4Singer: A Multi-Style, Multi-Singer and Musical Score Provided Mandarin Singing Corpus. In Advances in Neural Information Processing Systems, S. Koyejo, S. Mohamed, A. Agarwal, D. Belgrave, K. Cho, and A. Oh (Eds.), Vol. 35. Curran Associates, Inc., 6914-6926. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/2de60892dd329683ec21877a4e7c3091-Paper-Datasets_and_Benchmarks.pdf"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Y. Zhang R. Huang R. Li J. He Y. Xia F. Chen X. Duan B. Huai and Z. Zhao. 2024. StyleSinger: Style Transfer for Out-of-Domain Singing Voice Synthesis. In AAAI.","DOI":"10.1609\/aaai.v38i17.29932"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389772"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Z. Zhou Z. Zhang X. Xu Z. Xie M. Wu and K. Q Zhu. 2022. Can audio captions be evaluated with image caption metrics?. In ICASSP.","DOI":"10.1109\/ICASSP43922.2022.9746427"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"X. Zhu W. Tian X. Wang L. He Y. Xiao X. Wang X. Tan L. Xie et al. 2024. UniStyle: Unified Style Modeling for Speaking Style Captioning and Stylistic Speech Synthesis. In ACM MM.","DOI":"10.1145\/3664647.3681465"}],"event":{"name":"CIKM '25: The 34th ACM International Conference on Information and Knowledge Management","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval","SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Seoul Republic of Korea","acronym":"CIKM '25"},"container-title":["Proceedings of the 34th ACM International Conference on Information and Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746252.3761614","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,12]],"date-time":"2025-12-12T02:24:45Z","timestamp":1765506285000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746252.3761614"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,10]]},"references-count":48,"alternative-id":["10.1145\/3746252.3761614","10.1145\/3746252"],"URL":"https:\/\/doi.org\/10.1145\/3746252.3761614","relation":{},"subject":[],"published":{"date-parts":[[2025,11,10]]},"assertion":[{"value":"2025-11-10","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}