{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,29]],"date-time":"2025-11-29T08:04:34Z","timestamp":1764403474493,"version":"3.45.0"},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11228034","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":1,"title":["JamendoMaxCaps: A Large Scale Music-caption Dataset with Imputed Metadata"],"prefix":"10.1109","author":[{"given":"Abhinaba","family":"Roy","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design"}]},{"given":"Renhang","family":"Liu","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design"}]},{"given":"Tongyu","family":"Lu","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design"}]},{"given":"Dorien","family":"Herremans","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design"}]}],"member":"263","reference":[{"journal-title":"Musiclm: Generating music from text","year":"2023","author":"Agostinelli","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34516"},{"article-title":"Beats: Audio pre-training with acoustic tokenizers","volume-title":"Proc. of the 40th Int. Conf. on Machine Learning (ICML), Honolulu, USA, volume 202 of Proc. of Machine Learning Research (PMLR)","author":"Chen","key":"ref3"},{"article-title":"Qwen2-audio technical report","year":"2024","author":"Chu","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.dib.2024.110743"},{"article-title":"Simple and controllable music generation","volume-title":"Thirty-seventh Conf. on Neural Information Processing Systems","author":"Copet","key":"ref6"},{"journal-title":"Lp-musiccaps: Llm-based pseudo music captioning","year":"2023","author":"Doh","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.64"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"article-title":"Noise2music: Text-conditioned music generation with diffusion models","year":"2023","author":"Huang","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2024.123640"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_11"},{"article-title":"Deep content-user embedding model for music recommendation","year":"2018","author":"Lee","key":"ref13"},{"key":"ref14","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. of the 40 th Int. Conf. on Machine Learning (ICML), Honolulu, Hawaii, USA","author":"Li"},{"article-title":"MERT: acoustic music understanding model with large-scale self-supervised training","volume-title":"The 12th Int. Conf. on Learning Representations","author":"Li","key":"ref15"},{"key":"ref16","article-title":"A technique for the measurement of attitudes","author":"Likert","year":"1932","journal-title":"Archives of Psychology"},{"article-title":"The song describer dataset: a corpus of audio captions for music-and-language evaluation","year":"2023","author":"Manco","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.459"},{"article-title":"Midicaps\u2013a large-scale midi dataset with text captions","volume-title":"Proc. of ISMIR","author":"Melechovsky","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"Proc. of the 40 th Int. Conf. on Machine Learning (ICML)","author":"Radford","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.3758\/BRM.42.4.1096"},{"article-title":"SALMONN: Towards generic hearing abilities for large language models","volume-title":"The 12th Int. Conf. on Learning Representations","author":"Tang","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"article-title":"Bertscore: Evaluating text generation with bert","year":"2019","author":"Zhang","key":"ref27"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11228034.pdf?arnumber=11228034","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:26:23Z","timestamp":1763191583000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11228034\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11228034","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}