{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T15:37:10Z","timestamp":1775230630859,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072459, 62172421"],"award-info":[{"award-number":["62072459, 62172421"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658045","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"11-19","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["BeatDance: A Beat-Based Model-Agnostic Contrastive Learning Framework for Music-Dance Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3879-7225","authenticated-orcid":false,"given":"Kaixing","family":"Yang","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-5478-1791","authenticated-orcid":false,"given":"Xukun","family":"Zhou","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0857-1056","authenticated-orcid":false,"given":"Xulong","family":"Tang","sequence":"additional","affiliation":[{"name":"The University of Texas at Dallas, Dallas, USA"}]},
{"ORCID":"https:\/\/orcid.org\/0009-0003-1046-3623","authenticated-orcid":false,"given":"Ran","family":"Diao","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4902-1078","authenticated-orcid":false,"given":"Hongyan","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1511-7554","authenticated-orcid":false,"given":"Jun","family":"He","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6324-1712","authenticated-orcid":false,"given":"Zhaoxin","family":"Fan","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"GrooveNet: Realtime music-driven dance movement generation using artificial neural networks. networks 8, 17","author":"Alemi Omid","year":"2017","unstructured":"Omid Alemi, Jules Fran\u00e7oise, and Philippe Pasquier. 2017. GrooveNet: Realtime music-driven dance movement generation using artificial neural networks. networks 8, 17 (2017), 26."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_1_4_1","volume-title":"SSVMR: Saliency-Based Self-Training for Video-Music Retrieval. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5.","author":"Cheng Xuxin","year":"2023","unstructured":"Xuxin Cheng, Zhihong Zhu, Hongxiang Li, Yaowei Li, and Yuexian Zou. 2023. SSVMR: Saliency-Based Self-Training for Video-Music Retrieval. In ICASSP 2023-2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1--5."},
{"key":"e_1_3_2_1_5_1","volume-title":"Alec Radford, and Ilya Sutskever.","author":"Dhariwal Prafulla","year":"2020","unstructured":"Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, and Ilya Sutskever. 2020. Jukebox: A generative model for music. arXiv preprint arXiv:2005.00341 (2020)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11312"},{"key":"e_1_3_2_1_7_1","volume-title":"GANSynth: Adversarial Neural Audio Synthesis. In International Conference on Learning Representations.","author":"Engel Jesse","year":"2018","unstructured":"Jesse Engel, Kumar Krishna Agrawal, Shuo Chen, Ishaan Gulrajani, Chris Donahue, and Adam Roberts. 2018. GANSynth: Adversarial Neural Audio Synthesis. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2020.09.009"},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Machine Learning. PMLR, 7616--7633","author":"Goel Karan","year":"2022","unstructured":"Karan Goel, Albert Gu, Chris Donahue, and Christopher R\u00e9. 2022. It's raw! audio generation with state-space models. In International Conference on Machine Learning. PMLR, 7616--7633."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206046"},{"key":"e_1_3_2_1_13_1","volume-title":"Music Transformer: Generating Music with Long-Term Structure. In International Conference on Learning Representations.","author":"Anna Huang Cheng-Zhi","year":"2018","unstructured":"Cheng-Zhi Anna Huang, Ashish Vaswani, Jakob Uszkoreit, Ian Simon, Curtis Hawthorne, Noam Shazeer, Andrew M Dai, Matthew D Hoffman, Monica Dinculescu, and Douglas Eck. 2018. Music Transformer: Generating Music with Long-Term Structure. In International Conference on Learning Representations."},
{"key":"e_1_3_2_1_14_1","volume-title":"Dance revolution: Long-term dance generation with music via curriculum learning. arXiv preprint arXiv:2006.06119","author":"Huang Ruozi","year":"2020","unstructured":"Ruozi Huang, Huang Hu, Wei Wu, Kei Sawada, Mi Zhang, and Daxin Jiang. 2020. Dance revolution: Long-term dance generation with music via curriculum learning. arXiv preprint arXiv:2006.06119 (2020)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00348"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"e_1_3_2_1_17_1","volume-title":"Lucas Gestin, Wei Zhen Teoh, Jose Sotelo, Alexandre De Brebisson, Yoshua Bengio, and Aaron C Courville.","author":"Kumar Kundan","year":"2019","unstructured":"Kundan Kumar, Rithesh Kumar, Thibault De Boissiere, Lucas Gestin, Wei Zhen Teoh, Jose Sotelo, Alexandre De Brebisson, Yoshua Bengio, and Aaron C Courville. 2019. Melgan: Generative adversarial networks for conditional waveform synthesis. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_18_1","volume-title":"Dancing to music. Advances in neural information processing systems 32","author":"Lee Hsin-Ying","year":"2019","unstructured":"Hsin-Ying Lee, Xiaodong Yang, Ming-Yu Liu, Ting-Chun Wang, Yu-Ding Lu, Ming-Hsuan Yang, and Jan Kautz. 2019. Dancing to music. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_19_1","unstructured":"Bochen Li and Aparna Kumar. 2019. Query by Video: Cross-modal Music Retrieval.. In ISMIR. 604--611."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01315"},
{"key":"e_1_3_2_1_21_1","volume-title":"MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. arXiv preprint arXiv:2306.00107","author":"Li Yizhi","year":"2023","unstructured":"Yizhi Li, Ruibin Yuan, Ge Zhang, Yinghao Ma, Xingran Chen, Hanzhi Yin, Chenghua Lin, Anton Ragni, Emmanouil Benetos, Norbert Gyenge, et al. 2023. MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. arXiv preprint arXiv:2306.00107 (2023)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"e_1_3_2_1_23_1","volume-title":"Juhyun Lee, et al.","author":"Lugaresi Camillo","year":"2019","unstructured":"Camillo Lugaresi, Jiuqiang Tang, Hadon Nash, Chris McClanahan, Esha Uboweja, Michael Hays, Fan Zhang, Chuo-Ling Chang, Ming Guang Yong, Juhyun Lee, et al. 2019. Mediapipe: A framework for building perception pipelines. arXiv preprint arXiv:1906.08172 (2019)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01420"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i1.16117"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00221"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-020-08836-3"},{"key":"e_1_3_2_1_29_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},
{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413932"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413721"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01077"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2981989"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01031"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240526"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05710-7_21"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480570"},{"key":"e_1_3_2_1_38_1","volume-title":"WaveNet: A Generative Model for Raw Audio. In 9th ISCA Speech Synthesis Workshop. 125--125","author":"van den Oord A\u00e4ron","unstructured":"A\u00e4ron van den Oord, Sander Dieleman, Heiga Zen, Karen Simonyan, Oriol Vinyals, Alex Graves, Nal Kalchbrenner, Andrew Senior, and Koray Kavukcuoglu [n. d.]. WaveNet: A Generative Model for Raw Audio. In 9th ISCA Speech Synthesis Workshop. 125--125."},{"key":"e_1_3_2_1_39_1","volume-title":"Melnet: A generative model for audio in the frequency domain. arXiv preprint arXiv:1906.01083","author":"Vasquez Sean","year":"2019","unstructured":"Sean Vasquez and Mike Lewis. 2019. Melnet: A generative model for audio in the frequency domain. arXiv preprint arXiv:1906.01083 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"European Conference on Computer Vision. Springer, 233--249","author":"Wang Zeyu","year":"2022","unstructured":"Zeyu Wang, Yu Wu, Karthik Narasimhan, and Olga Russakovsky. 2022. Multiquery video retrieval. In European Conference on Computer Vision. Springer, 233--249."},
{"key":"e_1_3_2_1_41_1","volume-title":"Self-supervised learning of music-dance representation through explicit-implicit rhythm synchronization. arXiv preprint arXiv:2207.03190","author":"Yu Jiashuo","year":"2022","unstructured":"Jiashuo Yu, Junfu Pu, Ying Cheng, Rui Feng, and Ying Shan. 2022. Self-supervised learning of music-dance representation through explicit-implicit rhythm synchronization. arXiv preprint arXiv:2207.03190 (2022)."},{"key":"e_1_3_2_1_42_1","volume-title":"Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649","author":"Zhai Andrew","year":"2018","unstructured":"Andrew Zhai and Hao-Yu Wu. 2018. Classification is a strong baseline for deep metric learning. arXiv preprint arXiv:1811.12649 (2018)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095203"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485664"}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],
"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658045","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:54:06Z","timestamp":1755766446000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658045"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":44,"alternative-id":["10.1145\/3652583.3658045","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658045","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}