{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:40:32Z","timestamp":1779385232067,"version":"3.53.1"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100006477","name":"National Taiwan University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006477","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,31]]},"DOI":"10.1109\/apsipaasc58517.2023.10317286","type":"proceedings-article","created":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T14:07:46Z","timestamp":1700489266000},"page":"1745-1752","source":"Crossref","is-referenced-by-count":2,"title":["Toward Leveraging Pre-Trained Self-Supervised Frontends for Automatic Singing Voice Understanding Tasks: Three Case Studies"],"prefix":"10.1109","author":[{"given":"Yuya","family":"Yamamoto","sequence":"first","affiliation":[{"name":"University of Tsukuba,Tsukuba,Japan"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/217279.215273"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ISM.Workshops.2007.19"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952233"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2018.2875133"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3190732"},{"key":"ref6","article-title":"Hierarchical classification networks for singing voice segmentation and transcription","volume-title":"The 20th International Society for Music Information Retrieval Conference (ISMIR)","author":"Fu"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.2019.8851988"},{"key":"ref8","first-page":"890","article-title":"Investigating time-frequency representations for audio feature extraction in singing technique classification","volume-title":"2021 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","author":"Yamamoto"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"ref11","first-page":"12 449","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","volume":"33","author":"Baevski","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"key":"ref14","article-title":"Mert: Acoustic music understanding model with large-scale self-supervised training","author":"Li","year":"2023"},{"key":"ref15","article-title":"Map-music2vec: A simple and effective baseline for self-supervised music audio representation learning","author":"Li","year":"2022"},{"key":"ref16","article-title":"Transfer learning of wav2vec 2.0 for automatic lyric transcription","volume-title":"The 23rd International Society for Music Information Retrieval Conference (ISMIR)","author":"Ou"},{"key":"ref17","article-title":"Singing beat tracking with self-supervised front-end and linear transformers","volume-title":"The 23rd International Society for Music Information Retrieval Conference (ISMIR)","author":"Heydari"},{"key":"ref18","article-title":"Deep audio-visual singing voice transcription based on self-supervised learning models","author":"Gu","year":"2023"},{"key":"ref19","article-title":"Melody transcription via generative pre-training","volume-title":"The 23rd International Society for Music Information Retrieval Conference (ISMIR)","author":"Donahue"},{"key":"ref20","article-title":"Semi-supervised learning using teacher-student models for vocal melody extraction","volume-title":"The 21st International Society for Music Information Retrieval Conference (ISMIR)","author":"Kum"},{"key":"ref21","first-page":"293","article-title":"Vocano: A note transcription framework for singing voice in polyphonic music","volume-title":"The 22nd International Society for Music Information Retrieval Conference (ISMIR)","author":"Hsu"},{"key":"ref22","first-page":"121","article-title":"Exploring data augmentation for improved singing voice detection with neural networks","volume-title":"The 16th International Society for Music Information Retrieval Conference (ISMIR)","author":"Schl\u00fcter"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054069"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3169627"},{"key":"ref25","article-title":"Self-supervised learning of context-aware pitch prosody representations","author":"Noufi","year":"2020"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9415096"},{"key":"ref27","article-title":"Pdaugment: Data augmentation by pitch and duration adjustments for automatic lyrics transcription","volume-title":"The 23rd International Society for Music Information Retrieval Conference (ISMIR)","author":"Zhang"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207052"},{"key":"ref30","article-title":"Mstre-net: Multistreaming acoustic modeling for automatic lyrics transcription","volume-title":"The 22nd International Society for Music Information Retrieval Conference (ISMIR)","author":"Demirel"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747490"},{"key":"ref32","article-title":"Codified audio language modeling learns useful representations for music information retrieval","volume-title":"The 22nd International Society for Music Information Retrieval Conference (ISMIR)","author":"Castellon"},{"key":"ref33","article-title":"Jukebox: A generative model for music","author":"Dhariwal","year":"2020"},{"key":"ref34","article-title":"A fine-tuned wav2vec 2.0\/hubert benchmark for speech emotion recognition, speaker verification and spoken language understanding","author":"Wang","year":"2021"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746952"},{"key":"ref36","article-title":"Applying wav2vec2. 0 to speech recognition in various low-resource languages","author":"Yi","year":"2020"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2021-1965"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.80"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref40","first-page":"16 251","article-title":"Neural analysis and synthesis: Reconstructing speech from self-supervised representations","volume":"34","author":"Choi","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/SLT54892.2023.10023234"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-1280"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU51503.2021.9688093"},{"key":"ref44","article-title":"Classifying music audio with timbral and chroma features","volume-title":"The 8th International Conference for Music Information Retrieval Conference (ISMIR)","author":"Ellis"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096956"},{"key":"ref46","article-title":"Adam: A method for stochastic optimization","author":"Kingma","year":"2014"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414601"},{"key":"ref49","article-title":"Evaluation framework for automatic singing transcription","volume-title":"The 15th International Society for Music Information Retrieval Conference (ISMIR)","author":"Molina"},{"key":"ref50","first-page":"6105","article-title":"Efficientnet: Rethinking model scaling for convolutional neural networks","volume-title":"International conference on machine learning","author":"Tan"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747147"},{"key":"ref52","first-page":"468","article-title":"Vocalset: A singing voice dataset","volume-title":"The Proceedings of the 19th International Society for Music Information Retrieval Conference (ISMIR)","author":"Wilkins"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11137"},{"key":"ref54","article-title":"Decoupling representation and classifier for long-tailed recognition","volume-title":"International Conference on Learning Representations (ICLR)","author":"Kang"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747814"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096707"},{"key":"ref57","article-title":"End-to-end lyrics transcription informed by pitch and onset estimation","volume-title":"The 23rd International Society for Music Information Retrieval Conference (ISMIR)","author":"Deng"},{"key":"ref58","article-title":"Melody extraction from polyphonic music by deep learning approaches: A review","author":"Rao","year":"2022"},{"key":"ref59","article-title":"Automatic lyrics transcription of polyphonic music","volume-title":"Ph.D. dissertation","author":"Gao","year":"2022"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2022.3166262"}],"event":{"name":"2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","location":"Taipei, Taiwan","start":{"date-parts":[[2023,10,31]]},"end":{"date-parts":[[2023,11,3]]}},"container-title":["2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10317071\/10317095\/10317286.pdf?arnumber=10317286","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T17:04:21Z","timestamp":1710349461000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10317286\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,31]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/apsipaasc58517.2023.10317286","relation":{},"subject":[],"published":{"date-parts":[[2023,10,31]]}}}