{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T17:58:13Z","timestamp":1776880693144,"version":"3.51.2"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,7,18]],"date-time":"2021-07-18T00:00:00Z","timestamp":1626566400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,7,18]],"date-time":"2021-07-18T00:00:00Z","timestamp":1626566400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,7,18]],"date-time":"2021-07-18T00:00:00Z","timestamp":1626566400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100014013","name":"UK Research and Innovation","doi-asserted-by":"publisher","award":["EP\/S022694\/1"],"award-info":[{"award-number":["EP\/S022694\/1"]}],"id":[{"id":"10.13039\/100014013","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100009148","name":"Queen Mary University of London","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100009148","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,7,18]]},"DOI":"10.1109\/ijcnn52387.2021.9533461","type":"proceedings-article","created":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T21:27:41Z","timestamp":1632173261000},"page":"1-8","source":"Crossref","is-referenced-by-count":18,"title":["MusCaps: Generating Captions for Music Audio"],"prefix":"10.1109","author":[{"given":"Ilaria","family":"Manco","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Emmanouil","family":"Benetos","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elio","family":"Quinton","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gyorgy","family":"Fazekas","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref38","first-page":"382","article-title":"Spice: Semantic propositional image caption evaluation","author":"anderson","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.2.270"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref31","first-page":"331","article-title":"Evaluation of CNN-based Automatic Music Tagging Models","author":"won","year":"0","journal-title":"Proceedings of the 17th Sound and Music Computing Conference"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053669"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref36","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"lin","year":"0","journal-title":"Proceedings of the Workshop on Text Summarization Branches Out (WAS 2004)"},{"key":"ref35","first-page":"65","article-title":"METEOR: An Automatic Metric for MT Evaluation with Improved Correlation with Human Judgments","author":"lavie","year":"0","journal-title":"Proceedings of the Second Workshop on Statistical Machine Translation"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref40","article-title":"Evaluation of algorithms using games: The case of music tagging","author":"law","year":"0","journal-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)"},{"key":"ref11","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","volume":"1","author":"devlin","year":"0","journal-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies"},{"key":"ref12","article-title":"Video Understanding as Machine Translation","author":"korbar","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682377"},{"key":"ref16","article-title":"Multi-task Regularization Based on Infrequent Classes for Audio Captioning","author":"\u00e7akir","year":"0","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events"},{"key":"ref17","article-title":"Audio Captioning using Gated Recurrent Units","author":"eren","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref18","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume":"1","author":"kim","year":"0","journal-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologies"},{"key":"ref19","article-title":"WaveTransformer: A Novel Architecture for Audio Captioning Based on Learning Temporal and Time-Frequency Information","author":"tran","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref28","article-title":"Transfer Learning for Music Classification and Regression Tasks","author":"choi","year":"0","journal-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref27","article-title":"Transfer learning by supervised pre-training for audio-based music classification","author":"van","year":"0","journal-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)"},{"key":"ref3","first-page":"637","article-title":"End-to-end learning for music audio tagging at scale","author":"pons","year":"0","journal-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2598339"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref5","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"Proceedings of The 32nd International Conference on Machine Learning"},{"key":"ref8","article-title":"VI-bert: Pre-training of generic visual-linguistic representations","author":"su","year":"0","journal-title":"International Conference on Learning Representations"},{"key":"ref7","first-page":"13","article-title":"ViLBERT: Pretraining Task- Agnostic Visiolinguistic Representations for Vision-and-Language Tasks","volume":"32","author":"lu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462046"},{"key":"ref9","author":"li","year":"2019","journal-title":"Visu-alBERT A Simple and Performant Baseline for Vision and Language"},{"key":"ref1","article-title":"Automatic tagging using deep convolutional neural networks","author":"choi","year":"0","journal-title":"Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.33682\/7bay-bj41"},{"key":"ref22","article-title":"Audio Captioning using Pre-Trained Large-Scale Language Model Guided by Audio-based Similar Caption Retrieval","author":"koizumi","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref21","first-page":"150","article-title":"Listen carefully and tell: an audio captioning system based on residual learning and gammatone audio representation","author":"perez-castanos","year":"0","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events"},{"key":"ref24","article-title":"Towards Music Captioning: Generating Music Playlist Descriptions","author":"choi","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-2087"},{"key":"ref26","article-title":"COALA: Co-Aligned Autoencoders for Learning Semantically Enriched Audio Representations","author":"favory","year":"0","journal-title":"Workshop on Self-supervised learning in Audio and Speech at ICML"},{"key":"ref25","article-title":"Music autotagging as captioning","author":"tian","year":"0","journal-title":"Proceedings of the 1st Workshop on NLP for Music and Audio (NLP4MusA)"}],"event":{"name":"2021 International Joint Conference on Neural Networks (IJCNN)","location":"Shenzhen, China","start":{"date-parts":[[2021,7,18]]},"end":{"date-parts":[[2021,7,22]]}},"container-title":["2021 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9533266\/9533267\/09533461.pdf?arnumber=9533461","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T15:46:19Z","timestamp":1652197579000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9533461\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,18]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/ijcnn52387.2021.9533461","relation":{},"subject":[],"published":{"date-parts":[[2021,7,18]]}}}