{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:10:05Z","timestamp":1755889805029,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62271034"],"award-info":[{"award-number":["62271034"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,13]]},"DOI":"10.1145\/3726302.3730034","type":"proceedings-article","created":{"date-parts":[[2025,7,14]],"date-time":"2025-07-14T01:18:36Z","timestamp":1752455916000},"page":"348-357","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MIDI-Zero: A MIDI-driven Self-Supervised Learning Approach for Music Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-3957-5727","authenticated-orcid":false,"given":"Yuhang","family":"Su","sequence":"first","affiliation":[{"name":"Beijing University of Chemical Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5320-6086","authenticated-orcid":false,"given":"Wei","family":"Hu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Beijing University of Chemical Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2953-7223","authenticated-orcid":false,"given":"Hongfeng","family":"Gao","sequence":"additional","affiliation":[{"name":"Beijing University of Chemical Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2058-2373","authenticated-orcid":false,"given":"Fan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Beijing University of Chemical Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,7,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Clarinet: A Music Retrieval System. arXiv preprint arXiv:2210.12648","author":"Alwadhi Kshitij","year":"2022","unstructured":"Kshitij Alwadhi, Rohan Sharma, and Siddhant Sharma. 2022. Clarinet: A Music Retrieval System. arXiv preprint arXiv:2210.12648 (2022)."},{"unstructured":"Amantur Amatov Dmitry Lamanov Maksim Titov Ivan Vovk Ilya Makarov and Mikhail A Kudinov. 2023. A Semi-Supervised Deep Learning Approach to Dataset Collection for Query-by-Humming Task.. In ISMIR. 649--656.","key":"e_1_3_2_1_2_1"},{"key":"e_1_3_2_1_3_1","volume-title":"Waveprint: Efficient wavelet-based audio fingerprinting. Pattern recognition","author":"Baluja Shumeet","year":"2008","unstructured":"Shumeet Baluja and Michele Covell. 2008. Waveprint: Efficient wavelet-based audio fingerprinting. Pattern recognition, Vol. 41, 11 (2008), 3467--3480."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_4_1","DOI":"10.1109\/MSP.2018.2869928"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1109\/ICASSP43922.2022.9746549"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_6_1","DOI":"10.1007\/s11265-005-4151-3"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_7_1","DOI":"10.1109\/ICASSP39728.2021.9414337"},{"key":"e_1_3_2_1_8_1","volume-title":"International conference on machine learning. PMLR, 1597--1607","author":"Chen Ting","year":"2020","unstructured":"Ting Chen, Simon Kornblith, Mohammad Norouzi, and Geoffrey Hinton. 2020. A simple framework for contrastive learning of visual representations. In International conference on machine learning. PMLR, 1597--1607."},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the AISB 2003 Symposium on Artificial Intelligence and Creativity in the Arts and Sciences. Citeseer, 30--35","author":"Conklin Darrell","year":"2003","unstructured":"Darrell Conklin. 2003. Music generation from statistical models. In Proceedings of the AISB 2003 Symposium on Artificial Intelligence and Creativity in the Arts and Sciences. Citeseer, 30--35."},{"key":"e_1_3_2_1_10_1","volume-title":"Dejavu: open-source audio fingerprinting project. Dejavu: open-source audio fingerprinting project","author":"Drevo W","year":"2014","unstructured":"W Drevo. 2014. Dejavu: open-source audio fingerprinting project. Dejavu: open-source audio fingerprinting project (2014)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_11_1","DOI":"10.1109\/ICASSP43922.2022.9747630"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1109\/ICASSP49357.2023.10095389"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_13_1","DOI":"10.1109\/ICASSP39728.2021.9414128"},{"key":"e_1_3_2_1_14_1","volume-title":"ByteHum: Fast and Accurate Query-by-Humming in the Wild. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1111--1115","author":"Du Xingjian","year":"2024","unstructured":"Xingjian Du, Pei Zou, Mingyu Liu, Xia Liang, Minghang Chu, and Bilei Zhu. 2024. ByteHum: Fast and Accurate Query-by-Humming in the Wild. In ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 1111--1115."},{"unstructured":"Beat Gfeller Ruiqi Guo Kevin Kilgour Sanjiv Kumar James Lyon Julian Odell Marvin Ritter Dominik Roblek Matthew Sharifi Mihajlo Velimirovi\u0107 et al. 2017. Now playing: Continuous low-power music recognition. arXiv preprint arXiv:1711.10958 (2017).","key":"e_1_3_2_1_15_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1145\/217279.215273"},{"key":"e_1_3_2_1_17_1","volume-title":"Dagstuhl Follow-Ups","volume":"3","author":"Grosche Peter","year":"2012","unstructured":"Peter Grosche, Meinard M\u00fcller, and Joan Serra. 2012. Audio content-based music retrieval. In Dagstuhl Follow-Ups, Vol. 3. Schloss Dagstuhl-Leibniz-Zentrum f\u00fcr Informatik."},{"key":"e_1_3_2_1_18_1","first-page":"107","article-title":"A highly robust audio fingerprinting system","volume":"2002","author":"Haitsma Jaap","year":"2002","unstructured":"Jaap Haitsma and Ton Kalker. 2002. A highly robust audio fingerprinting system.. In Ismir, Vol. 2002. 107--115.","journal-title":"Ismir"},{"key":"e_1_3_2_1_19_1","volume-title":"Onsets and frames: Dual-objective piano transcription. arXiv preprint arXiv:1710.11153","author":"Hawthorne Curtis","year":"2017","unstructured":"Curtis Hawthorne, Erich Elsen, Jialin Song, Adam Roberts, Ian Simon, Colin Raffel, Jesse Engel, Sageev Oore, and Douglas Eck. 2017. Onsets and frames: Dual-objective piano transcription. arXiv preprint arXiv:1710.11153 (2017)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_20_1","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","volume-title":"Proc. of the 10th International Symposium on Computer Music Multidisciplinary Research","volume":"16","author":"Huang Tongbo","year":"2013","unstructured":"Tongbo Huang, Guangyu Xia, Yifei Ma, Roger Dannenberg, and Christos Faloutsos. 2013. MidiFind: fast and effective similarity searching in large MIDI databases. In Proc. of the 10th International Symposium on Computer Music Multidisciplinary Research, Marseille, France, Vol. 16."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_22_1","DOI":"10.1109\/TBDATA.2019.2921572"},{"key":"e_1_3_2_1_23_1","volume-title":"Giantmidi-piano: A large-scale midi dataset for classical piano music. arXiv preprint arXiv:2010.07061","author":"Kong Qiuqiang","year":"2020","unstructured":"Qiuqiang Kong, Bochen Li, Jitong Chen, and Yuxuan Wang. 2020. Giantmidi-piano: A large-scale midi dataset for classical piano music. arXiv preprint arXiv:2010.07061 (2020)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_24_1","DOI":"10.1145\/354384.354520"},{"volume-title":"A Generative Theory of Tonal Music, reissue, with a new preface","author":"Lerdahl Fred","unstructured":"Fred Lerdahl and Ray S Jackendoff. 1996. A Generative Theory of Tonal Music, reissue, with a new preface. MIT press.","key":"e_1_3_2_1_25_1"},{"key":"e_1_3_2_1_26_1","volume-title":"Robust Singing Voice Transcription Serves Synthesis. arXiv preprint arXiv:2405.09940","author":"Li Ruiqi","year":"2024","unstructured":"Ruiqi Li, Yu Zhang, Yongqi Wang, Zhiqing Hong, Rongjie Huang, and Zhou Zhao. 2024. Robust Singing Voice Transcription Serves Synthesis. arXiv preprint arXiv:2405.09940 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"CoverHunter: Cover Song Identification with Refined Attention and Alignments. In 2023 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1080--1085","author":"Liu Feng","year":"2023","unstructured":"Feng Liu, Deyi Tuo, Yinan Xu, and Xintong Han. 2023. CoverHunter: Cover Song Identification with Refined Attention and Alignments. In 2023 IEEE International Conference on Multimedia and Expo (ICME). IEEE, 1080--1085."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_28_1","DOI":"10.1609\/aaai.v36i10.21350"},{"key":"e_1_3_2_1_29_1","volume-title":"MIREX 2021: Query by Singing\/Humming. https:\/\/www.music-ir.org\/mirex\/wiki\/2021:Query_by_Singing\/Humming","author":"MIREX.","year":"2021","unstructured":"MIREX. 2021. MIREX 2021: Query by Singing\/Humming. https:\/\/www.music-ir.org\/mirex\/wiki\/2021:Query_by_Singing\/Humming"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_30_1","DOI":"10.5555\/1324818"},{"key":"e_1_3_2_1_31_1","volume-title":"https:\/\/openai.com\/blog\/musenet\/","author":"MuseNet AI.","year":"2019","unstructured":"OpenAI. 2019. MuseNet. OpenAI Blog (2019). https:\/\/openai.com\/blog\/musenet\/"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_32_1","DOI":"10.1145\/3394171.3413721"},{"key":"e_1_3_2_1_33_1","volume-title":"Octubre","author":"Romero-Velo Hilda","year":"2023","unstructured":"Hilda Romero-Velo, Susana Ladra, Jos\u00e9 R Param\u00e1, and Fernando Silva-Coira. 2023. Indexing and Retrieval of Scores by Humming based on Extracted Features. In VI Congreso Xove TIC: impulsando el talento cient\u00edfico. Octubre, 2023, A Coru na. Universidade da Coru na, Servizo de Publicaci\u00f3ns, 35--42."},{"key":"e_1_3_2_1_34_1","volume-title":"Audio cover song identification and similarity: background, approaches, evaluation, and beyond. Advances in music information retrieval","author":"Serra Joan","year":"2010","unstructured":"Joan Serra, Emilia G\u00f3mez, and Perfecto Herrera. 2010. Audio cover song identification and similarity: background, approaches, evaluation, and beyond. Advances in music information retrieval (2010), 307--332."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_35_1","DOI":"10.1109\/ICME.2013.6607520"},{"key":"e_1_3_2_1_36_1","volume-title":"Contrastive learning of musical representations. arXiv preprint arXiv:2103.09410","author":"Spijkervet Janne","year":"2021","unstructured":"Janne Spijkervet and John Ashley Burgoyne. 2021. Contrastive learning of musical representations. arXiv preprint arXiv:2103.09410 (2021)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_37_1","DOI":"10.1145\/3664647.3681647"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_38_1","DOI":"10.1109\/ICAICTA56449.2022.9933001"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_39_1","DOI":"10.1145\/1145287.1145312"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_40_1","DOI":"10.1109\/TNNLS.2020.2984665"},{"unstructured":"Qingyang Xi Rachel M Bittner Johan Pauwels Xuzhou Ye and Juan Pablo Bello. 2018. GuitarSet: A Dataset for Guitar Transcription.. In ISMIR. 453--460.","key":"e_1_3_2_1_41_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_42_1","DOI":"10.1145\/3539618.3591664"},{"key":"e_1_3_2_1_43_1","volume-title":"Contrastive unsupervised learning for audio fingerprinting. arXiv preprint arXiv:2010.13540","author":"Yu Zhesong","year":"2020","unstructured":"Zhesong Yu, Xingjian Du, Bilei Zhu, and Zejun Ma. 2020a. Contrastive unsupervised learning for audio fingerprinting. arXiv preprint arXiv:2010.13540 (2020)."},{"doi-asserted-by":"crossref","unstructured":"Zhesong Yu Xiaoshuo Xu Xiaoou Chen and Deshun Yang. 2019. Temporal Pyramid Pooling Convolutional Neural Network for Cover Song Identification.. In IJCAI. 4846--4852.","key":"e_1_3_2_1_44_1","DOI":"10.24963\/ijcai.2019\/673"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_45_1","DOI":"10.1109\/ICASSP40776.2020.9053839"},{"key":"e_1_3_2_1_46_1","first-page":"6914","article-title":"M4singer: A multi-style, multi-singer and musical score provided mandarin singing corpus","volume":"35","author":"Zhang Lichao","year":"2022","unstructured":"Lichao Zhang, Ruiqi Li, Shoutong Wang, Liqun Deng, Jinglin Liu, Yi Ren, Jinzheng He, Rongjie Huang, Jieming Zhu, Xiao Chen, et al., 2022. M4singer: A multi-style, multi-singer and musical score provided mandarin singing corpus. Advances in Neural Information Processing Systems, Vol. 35 (2022), 6914--6926.","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_47_1","DOI":"10.1109\/ICASSP43922.2022.9746056"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_48_1","DOI":"10.1145\/3474085.3475576"}],"event":{"sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"],"acronym":"SIGIR '25","name":"SIGIR '25: The 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Padua Italy"},"container-title":["Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3726302.3730034","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T18:29:42Z","timestamp":1755887382000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3726302.3730034"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,13]]},"references-count":48,"alternative-id":["10.1145\/3726302.3730034","10.1145\/3726302"],"URL":"https:\/\/doi.org\/10.1145\/3726302.3730034","relation":{},"subject":[],"published":{"date-parts":[[2025,7,13]]},"assertion":[{"value":"2025-07-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}