{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T06:51:37Z","timestamp":1764399097814,"version":"3.46.0"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/apsipaasc65261.2025.11249411","type":"proceedings-article","created":{"date-parts":[[2025,11,28]],"date-time":"2025-11-28T18:40:26Z","timestamp":1764355226000},"page":"311-316","source":"Crossref","is-referenced-by-count":0,"title":["Data-Efficient Music Captioning Via Contrastive and Semantic Alignment"],"prefix":"10.1109","author":[{"given":"Leekyung","family":"Kim","sequence":"first","affiliation":[{"name":"Seoul National University,Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jonghun","family":"Park","sequence":"additional","affiliation":[{"name":"Seoul National University,Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9533461"},{"article-title":"Contrastive audio-language learning for music","volume-title":"Ismir 2022 Hybrid Conference","author":"Manco","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.ijhcs.2018.04.004"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447027"},{"key":"ref5","article-title":"Musiclm: Generating music from text","author":"Agostinelli","year":"2023","journal-title":"arXiv preprint"},{"article-title":"Lp-musiccaps: Llm-based pseudo music captioning","volume-title":"Ismir 2023 Hybrid Conference","author":"Doh","key":"ref6"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref8","article-title":"Sequence to sequence learning with neural networks","volume":"27","author":"Sutskever","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref10","first-page":"5178","article-title":"Beats: Audio pretraining with acoustic tokenizers","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Chen"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1525\/9780520940420-020"},{"journal-title":"Language models are unsupervised multitask learners","author":"Radford","key":"ref12"},{"key":"ref13","first-page":"67","article-title":"Music auto-tagging as captioning","volume-title":"Proceedings of the 1st Workshop on NLP for Music and Audio (NLP4MusA)","author":"Cai"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094670"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746131"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN64981.2025.11228034"},{"key":"ref17","article-title":"Qwen2-audio technical report","author":"Chu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.29007\/1mjd"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447215"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref26","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee"},{"key":"ref27","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"in Text summarization branches out"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7287.001.0001"},{"article-title":"Bertscore: Evaluating text generation with bert","volume-title":"International Conference on Learning Representations","author":"Zhang","key":"ref29"}],"event":{"name":"2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","start":{"date-parts":[[2025,10,22]]},"location":"Singapore, Singapore","end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11248853\/11248968\/11249411.pdf?arnumber=11249411","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T06:51:06Z","timestamp":1764399066000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11249411\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/apsipaasc65261.2025.11249411","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}