{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:15:44Z","timestamp":1773191744551,"version":"3.50.1"},"reference-count":33,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,7,18]],"date-time":"2021-07-18T00:00:00Z","timestamp":1626566400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,7,18]],"date-time":"2021-07-18T00:00:00Z","timestamp":1626566400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,7,18]],"date-time":"2021-07-18T00:00:00Z","timestamp":1626566400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,7,18]]},"DOI":"10.1109\/ijcnn52387.2021.9533662","type":"proceedings-article","created":{"date-parts":[[2021,9,20]],"date-time":"2021-09-20T17:27:41Z","timestamp":1632158861000},"page":"1-9","source":"Crossref","is-referenced-by-count":21,"title":["Cross-Modal Music-Video Recommendation: A Study of Design Choices"],"prefix":"10.1109","author":[{"given":"Laure","family":"Pretet","sequence":"first","affiliation":[]},{"given":"Gael","family":"Richard","sequence":"additional","affiliation":[]},{"given":"Geoffroy","family":"Peeters","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.2018.00-21"},{"key":"ref32","article-title":"Beyond Short Snippets: Deep Networks for Video Classification","author":"yue","year":"2015","journal-title":"Proceedings of IEEE CVPR (Conference on Computer Vision and Pattern Recognition)"},{"key":"ref31","first-page":"207","article-title":"Distance metric learning for large margin nearest neighbor classification","volume":"10","author":"weinberger","year":"2009","journal-title":"Journal of Machine Learning Research"},{"key":"ref30","article-title":"A Comprehensive Survey on Cross-modal Retrieval","author":"wang","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref10","article-title":"Music, Movies and Meaning: Communication in Film-makers' Search for Preexisting Music, and the Implications for Music Information Retrieval","author":"inskip","year":"2008","journal-title":"Proceedings of ISMIR (International Conference on Music Information Retrieval)"},{"key":"ref11","article-title":"Background Music Recommendation for Video Based on Multimodal Latent Semantic Analysis","author":"kuo","year":"2013","journal-title":"Proceedings of ICME (International Conference on Multimedia and Expo)"},{"key":"ref12","article-title":"Query by Video: Cross-Modal Music Retrieval","author":"li","year":"2019","journal-title":"Proceedings of ISMIR (International Conference on Music Information Retrieval)"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-92892-8_41"},{"key":"ref14","doi-asserted-by":"crossref","DOI":"10.25080\/Majora-7b98e3ed-003","article-title":"librosa: Audio and music signal analysis in python","author":"mcfee","year":"2015","journal-title":"Proceedings of Python in Science"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2018.2868887"},{"key":"ref16","article-title":"Multimodal Deep Learning","author":"ngiam","year":"2011","journal-title":"Proceedings of ICML (International Conference on Machine Learning)"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.222"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"ref4","first-page":"892","article-title":"SoundNet: Learning Sound Representations from Unlabeled Video","author":"aytar","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref27","article-title":"Music synchronization with video using emotion similarity","author":"shin","year":"2017","journal-title":"Proceedings of IEEE BigComp (Inter-national Conference on Big Data and Smart Computing)"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_46"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.180"},{"key":"ref5","article-title":"See, Hear, and Read: Deep Aligned Representations","author":"aytar","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682475"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206046"},{"key":"ref1","article-title":"Youtube-8m: A large-scale video classification benchmark","author":"abu-el-haija","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref20","article-title":"Weakly Supervised Representation Learning for Audio-Visual Scene Analysis","author":"parekh","year":"2019","journal-title":"IEEE\/ACM Transactions on Audio Speech and Language Processing"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053135"},{"key":"ref21","article-title":"musicnn: Pre-trained convolutional neural networks for music audio tagging","author":"pons","year":"2019","journal-title":"Late Breaking Demo ISMIR (International Conference on Music Information Retrieval)"},{"key":"ref24","author":"schindler","year":"2019","journal-title":"Multi-Modal Music Information Retrieval Augmenting Audio-Analysis with Visual Computing for Improved Music Video Analysis"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-14442-9_33"},{"key":"ref26","article-title":"ADVISOR - Personalized video soundtrack recommendation by late fusion with heuristic rankings","author":"shah","year":"2014","journal-title":"Proceedings of ACM Multimedia"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/2926719"}],"event":{"name":"2021 International Joint Conference on Neural Networks (IJCNN)","location":"Shenzhen, China","start":{"date-parts":[[2021,7,18]]},"end":{"date-parts":[[2021,7,22]]}},"container-title":["2021 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9533266\/9533267\/09533662.pdf?arnumber=9533662","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T11:46:04Z","timestamp":1652183164000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9533662\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,18]]},"references-count":33,"URL":"https:\/\/doi.org\/10.1109\/ijcnn52387.2021.9533662","relation":{},"subject":[],"published":{"date-parts":[[2021,7,18]]}}}