{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T19:40:43Z","timestamp":1776886843850,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T00:00:00Z","timestamp":1733184000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100014013","name":"UK Research and Innovation","doi-asserted-by":"publisher","award":["EP\/S022694\/1"],"award-info":[{"award-number":["EP\/S022694\/1"]}],"id":[{"id":"10.13039\/100014013","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,12,3]]},"DOI":"10.1145\/3696409.3700202","type":"proceedings-article","created":{"date-parts":[[2024,12,28]],"date-time":"2024-12-28T09:55:23Z","timestamp":1735379723000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Pitch-aware generative pretraining improves multi-pitch estimation with scarce data"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-2572-9728","authenticated-orcid":false,"given":"Mary","family":"Pilataki","sequence":"first","affiliation":[{"name":"Queen Mary University of London, London, UK"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4352-6809","authenticated-orcid":false,"given":"Matthias","family":"Mauch","sequence":"additional","affiliation":[{"name":"Apple, London, UK"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6098-481X","authenticated-orcid":false,"given":"Simon","family":"Dixon","sequence":"additional","affiliation":[{"name":"Queen Mary University of London, London, UK"}]}],"member":"320","published-online":{"date-parts":[[2024,12,28]]},"reference":[{"key":"e_1_3_3_2_2_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Alain Guillaume","year":"2016","unstructured":"Guillaume Alain and Yoshua Bengio. 2016. Understanding intermediate layers using linear classifier probes. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1109\/SIU.2017.7960729"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","unstructured":"Emmanouil Benetos Simon Dixon Zhiyao Duan and Sebastian Ewert. 2019. Automatic Music Transcription: An Overview. IEEE Signal Processing Magazine 36 1 (2019) 20\u201330. 10.1109\/MSP.2018.2869928","DOI":"10.1109\/MSP.2018.2869928"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746549"},{"key":"e_1_3_3_2_6_2","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Bittner Rachel\u00a0M.","year":"2017","unstructured":"Rachel\u00a0M. Bittner, Brian McFee, Justin Salamon, Peter\u00a0Qi Li, and Juan\u00a0Pablo Bello. 2017. Deep Salience Representations for F0 Estimation in Polyphonic Music. In International Society for Music Information Retrieval Conference (ISMIR). https:\/\/api.semanticscholar.org\/CorpusID:4531539"},{"key":"e_1_3_3_2_7_2","unstructured":"Lee\u00a0Friese Callender Curtis Glenn-Macway Hawthorne and Jesse Engel. 2020. Improving Perceptual Quality of Drum Transcription with the Expanded Groove MIDI Dataset. ArXiv (2020). https:\/\/arxiv.org\/abs\/2004.00188"},{"key":"e_1_3_3_2_8_2","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Castellon Rodrigo","year":"2021","unstructured":"Rodrigo Castellon, Chris Donahue, and Percy Liang. 2021. Codified audio language modeling learns useful representations for music information retrieval. In International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475405"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412155"},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446141"},{"key":"e_1_3_3_2_12_2","unstructured":"Frank Cwitkowitz and Zhiyao Duan. 2024. Toward Fully Self-Supervised Multi-Pitch Estimation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.15569 (2024)."},{"key":"e_1_3_3_2_13_2","unstructured":"Prafulla Dhariwal Heewoo Jun Christine Payne Jong\u00a0Wook Kim Alec Radford and Ilya Sutskever. 2020. Jukebox: A Generative Model for Music. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2005.00341 (2020)."},{"key":"e_1_3_3_2_14_2","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Donahue Chris","year":"2022","unstructured":"Chris Donahue, John Thickstun, and Percy Liang. 2022. Melody transcription via generative pre-training. In International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_3_2_15_2","unstructured":"Alexandre D\u00e9fossez Jade Copet Gabriel Synnaeve and Yossi Adi. 2022. High Fidelity Neural Audio Compression. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2210.13438 (2022)."},{"key":"e_1_3_3_2_16_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Gardner Josh","year":"2021","unstructured":"Josh Gardner, Ian Simon, Ethan Manilow, Curtis Hawthorne, and Jesse Engel. 2021. MT3: Multi-Task Multitrack Music Transcription. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_17_2","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Hawthorne Curtis","year":"2018","unstructured":"Curtis Hawthorne, Erich Elsen, Jialin Song, Adam Roberts, Ian Simon, Colin Raffel, Jesse Engel, Sageev Oore, and Douglas Eck. 2018. Onsets and Frames: Dual-Objective Piano Transcription. In International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_3_2_18_2","volume-title":"International Conference on Learning Representations (ICML)","author":"Hawthorne Curtis","year":"2019","unstructured":"Curtis Hawthorne, Andriy Stasyuk, Adam Roberts, Ian Simon, Cheng-Zhi\u00a0Anna Huang, Sander Dieleman, Erich Elsen, Jesse Engel, and Douglas Eck. 2019. Enabling Factorized Piano Music Modeling and Generation with the MAESTRO Dataset. In International Conference on Learning Representations (ICML). https:\/\/openreview.net\/forum?id=r1lYRjC9F7"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8683426"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1109\/APSIPAASC58517.2023.10317515"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909659"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Qiuqiang Kong Bochen Li Xuchen Song Yuan Wan and Yuxuan Wang. 2020. High-Resolution Piano Transcription With Pedals by Regressing Onset and Offset Times. IEEE\/ACM Transactions on Audio Speech and Language Processing 29 (2020) 3707\u20133717. https:\/\/api.semanticscholar.org\/CorpusID:222133261","DOI":"10.1109\/TASLP.2021.3121991"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747147"},{"key":"e_1_3_3_2_24_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Kumar Kundan","year":"2019","unstructured":"Kundan Kumar, Rithesh Kumar, Thibault de Boissiere, Lucas Gestin, Wei\u00a0Zhen Teoh, Jose Sotelo, Alexandre de Brebisson, Yoshua Bengio, and Aaron Courville. 2019. MelGAN: generative adversarial networks for conditional waveform synthesis. In Advances in Neural Information Processing Systems (NeurIPS). Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/6804c9bca0a615bdb9374d00a9fcba59-Paper.pdf"},{"key":"e_1_3_3_2_25_2","first-page":"27980","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","volume":"36","author":"Kumar Rithesh","year":"2023","unstructured":"Rithesh Kumar, Prem Seetharaman, Alejandro Luebs, Ishaan Kumar, and Kundan Kumar. 2023. High-Fidelity Audio Compression with Improved RVQGAN. In Advances in Neural Information Processing Systems (NeurIPS) , A.\u00a0Oh, T.\u00a0Neumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.), Vol.\u00a036. Curran Associates, Inc., 27980\u201327993. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2023\/file\/58d0e78cf042af5876e12661087bea12-Paper-Conference.pdf"},{"key":"e_1_3_3_2_26_2","unstructured":"Jae\u00a0Hyun Lim and J.\u00a0C. Ye. 2017. Geometric GAN. ArXiv abs\/1705.02894 (2017). https:\/\/api.semanticscholar.org\/CorpusID:9010805"},{"key":"e_1_3_3_2_27_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Loshchilov Ilya","year":"2019","unstructured":"Ilya Loshchilov and Frank Hutter. 2019. Decoupled Weight Decay Regularization. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_28_2","series-title":"Proceedings of Machine Learning Research","first-page":"14918","volume-title":"Proceedings of the 39th International Conference on Machine Learning (ICML)","volume":"162","author":"Maman Ben","year":"2022","unstructured":"Ben Maman and Amit\u00a0H Bermano. 2022. Unaligned Supervision for Automatic Music Transcription in The Wild. In Proceedings of the 39th International Conference on Machine Learning (ICML)(Proceedings of Machine Learning Research, Vol.\u00a0162), Kamalika Chaudhuri, Stefanie Jegelka, Le\u00a0Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.). PMLR, 14918\u201314934. https:\/\/proceedings.mlr.press\/v162\/maman22a.html"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054340"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","unstructured":"Ethan Manilow Gordon Wichern Prem Seetharaman and Jonathan Le\u00a0Roux. 2019. Cutting Music Source Separation Some Slakh: A Dataset to Study the Impact of Training Data Quality and Quantity. IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA) (October 2019). 10.1109\/waspaa.2019.8937170","DOI":"10.1109\/waspaa.2019.8937170"},{"key":"e_1_3_3_2_31_2","first-page":"367","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Raffel Colin","year":"2014","unstructured":"Colin Raffel, Brian McFee, Eric\u00a0J. Humphrey, Justin Salamon, Oriol Nieto, Dawen Liang, and Daniel P.\u00a0W. Ellis. 2014. mir eval: A Transparent Implementation of Common MIR Metrics. In International Society for Music Information Retrieval Conference (ISMIR), Hsin-Min Wang, Yi-Hsuan Yang, and Jin\u00a0Ha Lee (Eds.). 367\u2013372. http:\/\/dblp.uni-trier.de\/db\/conf\/ismir\/ismir2014.html#RaffelMHSNLE14"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446182"},{"key":"e_1_3_3_2_33_2","volume-title":"Proceedings of the 24th Conf. of the International Society for Music Information Retrieval (ISMIR)","author":"Riou Alain","year":"2023","unstructured":"Alain Riou, Stefan Lattner, Ga\u00ebtan Hadjeres, and Geoffroy Peeters. 2023. PESTO: Pitch Estimation with Self-supervised Transposition-equivariant Objective. In Proceedings of the 24th Conf. of the International Society for Music Information Retrieval (ISMIR). Milan, Italy."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.7316590"},{"key":"e_1_3_3_2_35_2","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Tamer Nazif\u00a0Can","year":"2023","unstructured":"Nazif\u00a0Can Tamer, Yigitcan \u00d6zer, Meinard M\u00fcller, and Xavier Serra. 2023. High Resolution Violin Transcription using weak labels. In International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_3_2_36_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Thickstun John","year":"2017","unstructured":"John Thickstun, Zaid Harchaoui, and Sham Kakade. 2017. Learning Features of Music from Scratch. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_37_2","volume-title":"Proceedings of the 24th International Society for Music Information Retrieval Conference","author":"Toyama Keisuke","year":"2023","unstructured":"Keisuke Toyama, Taketo Akama, Yukara Ikemiya, Yuhta Takida, Wei-Hsiang Liao, and Yuki Mitsufuji. 2023. Automatic Piano Transcription with Hierarchical Frequency-Time Transformer. In Proceedings of the 24th International Society for Music Information Retrieval Conference."},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","unstructured":"G. Tzanetakis and P. Cook. 2002. Musical genre classification of audio signals. IEEE Transactions on Speech and Audio Processing 10 5 (2002) 293\u2013302. 10.1109\/TSA.2002.800560","DOI":"10.1109\/TSA.2002.800560"},{"key":"e_1_3_3_2_39_2","volume-title":"Advances in Neural Information Processing Systems (NeurIPS)","author":"Oord Aaron van\u00a0den","year":"2017","unstructured":"Aaron van\u00a0den Oord, Oriol Vinyals, and koray kavukcuoglu. 2017. Neural Discrete Representation Learning. In Advances in Neural Information Processing Systems (NeurIPS) , I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.), Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2017\/file\/7a98af17e63a0ac09ce2e96d03992fbc-Paper.pdf"},{"key":"e_1_3_3_2_40_2","unstructured":"Laurens van\u00a0der Maaten and Geoffrey Hinton. 2008. Visualizing Data using t-SNE. Journal of Machine Learning Research 9 86 (2008) 2579\u20132605. http:\/\/jmlr.org\/papers\/v9\/vandermaaten08a.html"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414601"},{"key":"e_1_3_3_2_42_2","doi-asserted-by":"publisher","unstructured":"Jun-You Wang and Jyh-Shing\u00a0Roger Jang. 2023. Training a Singing Transcription Model Using Connectionist Temporal Classification Loss and Cross-Entropy Loss. IEEE\/ACM Transactions on Audio Speech and Language Processing 31 (2023) 383\u2013396. 10.1109\/TASLP.2022.3224297 https:\/\/dl.acm.org\/doi\/10.1109\/TASLP.2022.3224297","DOI":"10.1109\/TASLP.2022.3224297"},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"publisher","DOI":"10.5281\/zenodo.5624387"},{"key":"e_1_3_3_2_44_2","unstructured":"Ho-Hsiang Wu Chieh-Chi Kao Qingming Tang Ming Sun Brian McFee Juan\u00a0Pablo Bello and Chao Wang. 2021. Multi-Task Self-Supervised Pre-Training for Music Classification. IEEE International Conference on Acoustics Speech and Signal Processing (ICASSP) (2021) 556\u2013560. https:\/\/api.semanticscholar.org\/CorpusID:231839503"},{"key":"e_1_3_3_2_45_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682605"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","unstructured":"Y.\u00a0T. Wu B. Chen and L. Su. 2020. Multi-Instrument Automatic Music Transcription With Self-Attention-Based Instance Segmentation. IEEE\/ACM Transactions on Audio Speech and Language Processing 28 (2020) 2796\u20132809. 10.1109\/TASLP.2020.3030482 https:\/\/dl.acm.org\/doi\/10.1109\/TASLP.2020.3030482","DOI":"10.1109\/TASLP.2020.3030482"},{"key":"e_1_3_3_2_47_2","volume-title":"International Society for Music Information Retrieval Conference (ISMIR)","author":"Xi Qingyang","year":"2018","unstructured":"Qingyang Xi, Rachel\u00a0M. Bittner, Johan Pauwels, Xuzhou Ye, and Juan\u00a0Pablo Bello. 2018. GuitarSet: A Dataset for Guitar Transcription. In International Society for Music Information Retrieval Conference (ISMIR)."},{"key":"e_1_3_3_2_48_2","doi-asserted-by":"publisher","unstructured":"Neil Zeghidour Alejandro Luebs Ahmed Omran Jan Skoglund and Marco Tagliasacchi. 2022. SoundStream: An End-to-End Neural Audio Codec. IEEE\/ACM Transactions on Audio Speech and Language Processing 30 (2022) 495\u2013507. 10.1109\/TASLP.2021.3129994 https:\/\/dl.acm.org\/doi\/10.1109\/TASLP.2021.3129994","DOI":"10.1109\/TASLP.2021.3129994"}],"event":{"name":"MMAsia '24: ACM Multimedia Asia","location":"Auckland New Zealand","acronym":"MMAsia '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 6th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700202","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696409.3700202","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:10:15Z","timestamp":1750295415000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696409.3700202"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,3]]},"references-count":47,"alternative-id":["10.1145\/3696409.3700202","10.1145\/3696409"],"URL":"https:\/\/doi.org\/10.1145\/3696409.3700202","relation":{},"subject":[],"published":{"date-parts":[[2024,12,3]]},"assertion":[{"value":"2024-12-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}