{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T19:40:55Z","timestamp":1730230855489,"version":"3.28.0"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10094915","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:28:30Z","timestamp":1683307710000},"page":"1-5","source":"Crossref","is-referenced-by-count":2,"title":["On the Role of Visual Context in Enriching Music Representations"],"prefix":"10.1109","author":[{"given":"Kleanthis","family":"Avramidis","sequence":"first","affiliation":[{"name":"University of Southern California,Signal Analysis and Interpretation Lab,Los Angeles,CA 90089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shanti","family":"Stewart","sequence":"additional","affiliation":[{"name":"University of Southern California,Signal Analysis and Interpretation Lab,Los Angeles,CA 90089"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shrikanth","family":"Narayanan","sequence":"additional","affiliation":[{"name":"University of Southern California,Signal Analysis and Interpretation Lab,Los Angeles,CA 90089"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1177\/0305735619888803"},{"volume-title":"Musicophilia : Tales of Music and the Brain","year":"2007","author":"Sacks","key":"ref2"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1037\/rev0000364"},{"year":"2020","author":"Won","journal-title":"Evaluation of cnn-based automatic music tagging models","key":"ref4"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1177\/0963721411422522"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1109\/ICASSP.2019.8683735"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1371\/journal.pone.0249957"},{"key":"ref8","first-page":"22243","article-title":"Big self-supervised models are strong semi-supervised learners","volume":"33","author":"Chen","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref9","first-page":"4182","article-title":"Data-efficient image recognition with contrastive predictive coding","volume-title":"International conference on machine learning","author":"Henaff"},{"key":"ref10","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"International conference on machine learning","author":"Chen"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"ref12","first-page":"9758","article-title":"Self-supervised learning by cross-modal audiovideo clustering","volume":"33","author":"Alwassel","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref13","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac","year":"2020","journal-title":"Advances in NeurIPS"},{"key":"ref14","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"ICML"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.21437\/Interspeech.2019-2605"},{"doi-asserted-by":"publisher","key":"ref16","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/ICASSP43922.2022.9747669"},{"key":"ref18","first-page":"23634","article-title":"Merlot: Multimodal neural script knowledge models","volume":"34","author":"Zellers","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"year":"2021","author":"Spijkervet","journal-title":"Contrastive learning of musical representations","key":"ref19"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1109\/cvpr52688.2022.01031"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.2139\/ssrn.4189323"},{"year":"2022","author":"Huang","journal-title":"Mulan: A joint embedding of music audio and natural language","key":"ref22"},{"year":"2022","author":"Manco","journal-title":"Contrastive audio-language learning for music","key":"ref23"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1109\/lsp.2017.2713830"},{"year":"2017","author":"Choi","journal-title":"A tutorial on deep learning for music information retrieval","key":"ref25"},{"key":"ref26","first-page":"565","article-title":"The harmonix set: Beats, downbeats, and functional segment annotations of western popular music","author":"Nieto","year":"2019","journal-title":"ISMIR"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.1145\/1143844.1143874"},{"key":"ref28","first-page":"387","article-title":"Evaluation of algorithms using games: The case of music tagging","author":"Law","year":"2009","journal-title":"ISMIR"},{"year":"2019","author":"Bogdanov","article-title":"The mtg-jamendo dataset for automatic music tagging","key":"ref29"},{"key":"ref30","article-title":"Mediaeval 2020: Emotion and theme recognition in music using jamendo","author":"Bogdanov","year":"2020","journal-title":"MediaEval"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1109\/lsp.2017.2713830"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/ICASSP40776.2020.9053669"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1109\/cbmi50038.2021.9461913"},{"key":"ref34","first-page":"591","article-title":"The million song dataset","author":"Bertin-Mahieux","year":"2011"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2023,6,4]]},"location":"Rhodes Island, Greece","end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10094915.pdf?arnumber=10094915","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,29]],"date-time":"2024-02-29T22:34:52Z","timestamp":1709246092000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10094915\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10094915","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}