{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:19:39Z","timestamp":1750220379005,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":27,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3462993","type":"proceedings-article","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T02:41:54Z","timestamp":1626057714000},"page":"1880-1884","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Deep Music Retrieval for Fine-Grained Videos by Exploiting Cross-Modal-Encoded Voice-Overs"],"prefix":"10.1145","author":[{"given":"Tingtian","family":"Li","sequence":"first","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"given":"Zixun","family":"Sun","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"given":"Haoruo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"given":"Jin","family":"Li","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"given":"Ziming","family":"Wu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"given":"Hui","family":"Zhan","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"given":"Yipeng","family":"Yu","sequence":"additional","affiliation":[{"name":"Tencent, Shanghai, China"}]},{"given":"Hengcan","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, China"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"The World's 2.7 Billion Gamers Will Spend 159.3 Billion on Games in 2020","author":"Wijman T.","year":"2023","unstructured":"T. Wijman. 2020. The World's 2.7 Billion Gamers Will Spend 159.3 Billion on Games in 2020; The Market Will Surpass 200 Billion by 2023. Newzoon."},{"key":"e_1_3_2_2_2_1","unstructured":"R. Wyatt. 2020. 2020 is YouTube Gaming's biggest year ever: 100B watch time hours. YouTube Officail Blog."},{"key":"e_1_3_2_2_3_1","volume-title":"Proceedings of The IEEE International Conference on Big Data and Smart Computing, 47--50","author":"Shin K.-H.","year":"2017","unstructured":"K.-H. Shin and I.-K. Lee. 2017. Music synchronization with video using emotion similarity. In Proceedings of The IEEE International Conference on Big Data and Smart Computing, 47--50."},{"volume-title":"Proceedings of the International Semantic Web Conference.","author":"Chao J.","key":"e_1_3_2_2_4_1","unstructured":"J. Chao, H. Wang, W. Zhou, W. Zhang, and Y. Yu. 2011. Tunesensor: A semantic-driven music recommendation service for digital photo albums. In Proceedings of the International Semantic Web Conference."},{"volume-title":"Workshop on Artificial Intelligence and Statistics.","author":"Brochu E.","key":"e_1_3_2_2_5_1","unstructured":"E. Brochu, N. De Freitas, and K. Bao. 2003. The sound of an album cover: Probabilistic multimedia and information retrieval. In Workshop on Artificial Intelligence and Statistics."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2557722"},{"volume-title":"Proceedings of the International Society for Music Information Retrieval, 604--611","author":"Li B.","key":"e_1_3_2_2_7_1","unstructured":"B. Li and A. Kumar. 2019. Query by Video: Cross-modal Music Retrieval. In Proceedings of the International Society for Music Information Retrieval, 604--611."},{"volume-title":"Proceedings of the ACM on International Conference on Multimedia Retrieval, 353--361","author":"Hong S.","key":"e_1_3_2_2_8_1","unstructured":"S. Hong, W. Im, and H. S. Yang. 2018. Cbvmr: content-based video-music retrieval using soft intra-modal structure constraint. In Proceedings of the ACM on International Conference on Multimedia Retrieval, 353--361."},{"volume-title":"Proceedings of the IEEE International Conference on Computer Vision, 6202--6211","author":"Feichtenhofer C.","key":"e_1_3_2_2_9_1","unstructured":"C. Feichtenhofer, H. Fan, J. Malik, and K. He. 2019. Slowfast networks for video recognition. In Proceedings of the IEEE International Conference on Computer Vision, 6202--6211."},{"volume-title":"Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, 131--135","author":"Hershey S.","key":"e_1_3_2_2_10_1","unstructured":"S. Hershey, S. Chaudhuri, D.P.W. Elis et al.. 2017. CNN architectures for large-scale audio classification. In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, 131--135."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2003.814255"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.2517-6161.1977.tb01600.x"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2015.7251877"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"T. Evgeniou and M. Pontil 1999 Support vector machines: Theory and applications. Advanced Course on Artificial Intelligence 249--257.","DOI":"10.1007\/3-540-44673-7_12"},{"volume-title":"What is a support vector machine? Nature biotechnology","author":"Noble W. S.","key":"e_1_3_2_2_15_1","unstructured":"W. S. Noble. 2006. What is a support vector machine? Nature biotechnology, vol. 24, no. 12, 1565--1567."},{"volume-title":"Proceedings of International Conference on Multimedia Modeling, 303--314","author":"Acar E.","key":"e_1_3_2_2_16_1","unstructured":"E. Acar, F. Hopfgartner, and S. Albayrak. 2014. Understanding affective content of music videos through learned representations. In Proceedings of International Conference on Multimedia Modeling, 303--314."},{"volume-title":"Deep Pairwise Ranking with Multi-label Information for Cross-Modal Retrieval. In Proceedings of IEEE International Conference on Multimedia and Expo","author":"Jian Y.","key":"e_1_3_2_2_17_1","unstructured":"Y. Jian, J. Xiao, Y. Cao, A. Khan, and J. Zhu. 2019. Deep Pairwise Ranking with Multi-label Information for Cross-Modal Retrieval. In Proceedings of IEEE International Conference on Multimedia and Expo, 1810--1815."},{"volume-title":"Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval, 635--644","author":"Hu P.","key":"e_1_3_2_2_18_1","unstructured":"P. Hu, L. Zhen, D. Peng, and P. Liu. 2019. Scalable deep multimodal learning for cross-modal retrieval. In Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval, 635--644."},{"volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 10394--10403","author":"Zhen L.","key":"e_1_3_2_2_19_1","unstructured":"L. Zhen, P. Hu, X. Wang, and D. Peng, 2019, Deep supervised cross-modal retrieval. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 10394--10403."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_2_21_1","volume-title":"Parmar et al","author":"Vaswani A.","year":"2017","unstructured":"A. Vaswani, N. Shazeer, N. Parmar et al.. 2017. Attention is all you need. Advances in neural information processing systems, 5998--6008."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_43"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3031184"},{"volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 4681--4690","author":"Ledig C.","key":"e_1_3_2_2_24_1","unstructured":"C. Ledig, L. Theis, F. Huszar et al. 2017. Photo-realistic single image super-resolution using a generative adversarial network. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 4681--4690."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2019.2926828"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.21105\/joss.02154"},{"key":"e_1_3_2_2_27_1","volume-title":"Freedman et al","author":"Gemmeke J. F.","year":"2017","unstructured":"J. F. Gemmeke, D. P. W. Ellis, D. Freedman et al. 2017. Audio set: An ontology and human-labeled dataset for audio events In Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing, 776--780."}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Virtual Event Canada","acronym":"SIGIR '21"},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462993","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3462993","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:20Z","timestamp":1750191500000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462993"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":27,"alternative-id":["10.1145\/3404835.3462993","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3462993","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}