{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:40:29Z","timestamp":1755823229959,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":25,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T00:00:00Z","timestamp":1715558400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,13]]},"DOI":"10.1145\/3589335.3651535","type":"proceedings-article","created":{"date-parts":[[2024,5,12]],"date-time":"2024-05-12T18:41:21Z","timestamp":1715539281000},"page":"967-970","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["MART: Learning Hierarchical Music Audio Representations with Part-Whole Transformer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-2444-8201","authenticated-orcid":false,"given":"Dong","family":"Yao","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5666-8320","authenticated-orcid":false,"given":"Jieming","family":"Zhu","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6141-4125","authenticated-orcid":false,"given":"Jiahao","family":"Xun","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0030-8289","authenticated-orcid":false,"given":"Shengyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7242-0794","authenticated-orcid":false,"given":"Liqun","family":"Deng","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5988-7609","authenticated-orcid":false,"given":"Wenqiao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2231-4663","authenticated-orcid":false,"given":"Zhenhua","family":"Dong","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9117-8247","authenticated-orcid":false,"given":"Xin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Hong Kong, China"}]}],"member":"320","published-online":{"date-parts":[[2024,5,13]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Andrea Agostinelli and et al. 2023. MusicLM: Generating Music From Text. CoRR Vol. abs\/2301.11325 (2023)."},{"key":"e_1_3_2_2_2_1","volume-title":"Gy\u00f6rgy Fazekas, and Mark B. Sandler","author":"Choi Keunwoo","year":"2016","unstructured":"Keunwoo Choi, Gy\u00f6rgy Fazekas, and Mark B. Sandler. 2016. Automatic Tagging Using Deep Convolutional Neural Networks. In ISMIR. 805--811."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.5334\/tismir.41"},{"key":"e_1_3_2_2_4_1","unstructured":"Michal Genussov and Israel Cohen. 2010. Musical genre classification of audio signals using geometric methods. In EUSIPCO. 
497--501."},{"key":"e_1_3_2_2_5_1","first-page":"21271","article-title":"Bootstrap your own latent-a new approach to self-supervised learning","volume":"33","author":"Grill Jean-Bastien","year":"2020","unstructured":"Jean-Bastien Grill, Florian Strub, Florent Altch\u00e9, Corentin Tallec, Pierre Richemond, Elena Buchatskaya, Carl Doersch, Bernardo Avila Pires, Zhaohan Guo, Mohammad Gheshlaghi Azar, et al. 2020. Bootstrap your own latent-a new approach to self-supervised learning. NeurIPS , Vol. 33 (2020), 21271--21284.","journal-title":"NeurIPS"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco_a_01557"},{"key":"e_1_3_2_2_7_1","volume-title":"Music Transformer: Generating Music with Long-Term Structure. In ICLR.","author":"Anna Huang Cheng-Zhi","year":"2019","unstructured":"Cheng-Zhi Anna Huang, Ashish Vaswani, Jakob Uszkoreit, Ian Simon, Curtis Hawthorne, Noam Shazeer, Andrew M. Dai, Matthew D. Hoffman, Monica Dinculescu, and Douglas Eck. 2019. Music Transformer: Generating Music with Long-Term Structure. In ICLR."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Taejun Kim Jongpil Lee and Juhan Nam. 2018. Sample-level CNN architectures for music auto-tagging using raw waveforms. In ICASSP. 366--370.","DOI":"10.1109\/ICASSP.2018.8462046"},{"key":"e_1_3_2_2_9_1","unstructured":"Edith Law Kris West Michael I Mandel Mert Bay and J Stephen Downie. 2009. Evaluation of algorithms using games: The case of music tagging.. In ISMIR."},{"key":"e_1_3_2_2_10_1","volume-title":"Keunhyoung Luke Kim, and Juhan Nam","author":"Lee Jongpil","year":"2019","unstructured":"Jongpil Lee, Jiyoung Park, Keunhyoung Luke Kim, and Juhan Nam. 2019. Sample-level deep convolutional neural networks for music auto-tagging using raw waveforms. SMC (2019), 220--226."},{"key":"e_1_3_2_2_11_1","volume-title":"MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. 
CoRR","author":"Li Yizhi","year":"2023","unstructured":"Yizhi Li, Ruibin Yuan, Ge Zhang, Yinghao Ma, Xingran Chen, Hanzhi Yin, Chenghua Lin, Anton Ragni, Emmanouil Benetos, Norbert Gyenge, Roger B. Dannenberg, Ruibo Liu, Wenhu Chen, Gus Xia, Yemin Shi, Wenhao Huang, Yike Guo, and Jie Fu. 2023. MERT: Acoustic Music Understanding Model with Large-Scale Self-supervised Training. CoRR , Vol. abs\/2306.00107 (2023)."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Daisuke Niizumi Daiki Takeuchi Yasunori Ohishi Noboru Harada and Kunio Kashino. 2021. Byol for audio: Self-supervised learning for general-purpose audio representation. In IJCNN. 1--8.","DOI":"10.1109\/IJCNN52387.2021.9534474"},{"key":"e_1_3_2_2_13_1","volume-title":"Matthew Prockup, Erik M Schmidt, Andreas F Ehmann, and Xavier Serra.","author":"Puig Jordi Pons","year":"2018","unstructured":"Jordi Pons Puig, Oriol Nieto Caballero, Matthew Prockup, Erik M Schmidt, Andreas F Ehmann, and Xavier Serra. 2018. End-to-end learning for music audio tagging at scale. In ISMIR."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Aaqib Saeed David Grangier and Neil Zeghidour. 2021. Contrastive Learning of General-Purpose Audio Representations. In ICASSP. 3875--3879.","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"key":"e_1_3_2_2_15_1","unstructured":"Janne Spijkervet and John Ashley Burgoyne. 2021. Contrastive Learning of Musical Representations. In ISMIR."},{"key":"e_1_3_2_2_16_1","volume-title":"Multi-Format Contrastive Learning of Audio Representations. arXiv:2103.06508 (March","author":"Wang Luyu","year":"2021","unstructured":"Luyu Wang and Aaron van den Oord. 2021. Multi-Format Contrastive Learning of Audio Representations. arXiv:2103.06508 (March 2021)."},{"key":"e_1_3_2_2_17_1","volume-title":"Data-Driven Harmonic Filters for Audio Representation Learning. ICASSP","author":"Won Minz","year":"2020","unstructured":"Minz Won, Sanghyuk Chun, Oriol Nieto, and Xavier Serra. 2020a. 
Data-Driven Harmonic Filters for Audio Representation Learning. ICASSP (2020), 536--540."},{"key":"e_1_3_2_2_18_1","unstructured":"Minz Won Sanghyuk Chun and Xavier Serra. 2019. Toward Interpretable Music Tagging with Self-Attention. (2019)."},{"key":"e_1_3_2_2_19_1","volume-title":"Evaluation of CNN-based automatic music tagging models. SMC","author":"Won Minz","year":"2020","unstructured":"Minz Won, Andres Ferraro, Dmitry Bogdanov, and Xavier Serra. 2020b. Evaluation of CNN-based automatic music tagging models. SMC (2020), 331--337."},{"key":"e_1_3_2_2_20_1","unstructured":"Shangda Wu Dingyao Yu Xu Tan and Maosong Sun. 2023. CLaMP: Contrastive Language-Music Pre-Training for Cross-Modal Symbolic Music Information Retrieval. In ISMIR. 157--165."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Xiaoshuo Xu Xiaoou Chen and Deshun Yang. 2018. Key-invariant convolutional neural network toward efficient cover song identification. In ICME. 1--6.","DOI":"10.1109\/ICME.2018.8486531"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Dong Yao Zhou Zhao Shengyu Zhang Jieming Zhu Yudong Zhu Rui Zhang and Xiuqiang He. 2022. Contrastive Learning with Positive-Negative Frame Mask for Music Representation. In WWW. 2906--2915.","DOI":"10.1145\/3485447.3512011"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"crossref","unstructured":"Zhesong Yu Xiaoshuo Xu Xiaoou Chen and Deshun Yang. 2019. Temporal Pyramid Pooling Convolutional Neural Network for Cover Song Identification.. In IJCAI. 4846--4852.","DOI":"10.24963\/ijcai.2019\/673"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"crossref","unstructured":"Zhesong Yu Xiaoshuo Xu Xiaoou Chen and Deshun Yang. 2020. Learning a representation for cover song identification using convolutional neural network. In ICASSP. 541--545.","DOI":"10.1109\/ICASSP40776.2020.9053839"},{"key":"e_1_3_2_2_25_1","unstructured":"Mingliang Zeng Xu Tan Rui Wang Zeqian Ju Tao Qin and Tie-Yan Liu. 2021. 
MusicBERT: Symbolic Music Understanding with Large-Scale Pre-Training. (2021). io"}],"event":{"name":"WWW '24: The ACM Web Conference 2024","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"],"location":"Singapore Singapore","acronym":"WWW '24"},"container-title":["Companion Proceedings of the ACM Web Conference 2024"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589335.3651535","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3589335.3651535","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:33:46Z","timestamp":1755822826000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3589335.3651535"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,13]]},"references-count":25,"alternative-id":["10.1145\/3589335.3651535","10.1145\/3589335"],"URL":"https:\/\/doi.org\/10.1145\/3589335.3651535","relation":{},"subject":[],"published":{"date-parts":[[2024,5,13]]},"assertion":[{"value":"2024-05-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}