{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T18:14:03Z","timestamp":1770142443268,"version":"3.49.0"},"reference-count":37,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,4]],"date-time":"2023-06-04T00:00:00Z","timestamp":1685836800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6,4]]},"DOI":"10.1109\/icassp49357.2023.10095935","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T13:28:30Z","timestamp":1683293310000},"page":"1-5","source":"Crossref","is-referenced-by-count":10,"title":["Diffroll: Diffusion-Based Generative Music Transcription with Unsupervised Pretraining Capability"],"prefix":"10.1109","author":[{"given":"Kin Wai","family":"Cheuk","sequence":"first","affiliation":[{"name":"Singapore University of Technology and Design,Singapore"}]},{"given":"Ryosuke","family":"Sawata","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Tokyo,Japan"}]},{"given":"Toshimitsu","family":"Uesaka","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Tokyo,Japan"}]},{"given":"Naoki","family":"Murata","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Tokyo,Japan"}]},{"given":"Naoya","family":"Takahashi","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Tokyo,Japan"}]},{"given":"Shusuke","family":"Takahashi","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Tokyo,Japan"}]},{"given":"Dorien","family":"Herremans","sequence":"additional","affiliation":[{"name":"Singapore University of Technology and Design,Singapore"}]},{"given":"Yuki","family":"Mitsufuji","sequence":"additional","affiliation":[{"name":"Sony Group Corporation,Tokyo,Japan"}]}],"member":"263","reference":[{"key":"ref13","article-title":"Semi-supervised convolutive nmf for automatic music transcription","author":"wu","year":"2022"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR48806.2021.9412155"},{"key":"ref12","first-page":"14918","article-title":"Unaligned supervision for automatic music transcription in the wild","author":"maman","year":"2022","journal-title":"International Conference on Machine Learning"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN52387.2021.9533407"},{"key":"ref15","first-page":"183","article-title":"Deep unsupervised drum transcription","author":"choi","year":"2019","journal-title":"Proceedings of the International Society for Music Information Retrieval (ISMIR)"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2016.2533858"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475405"},{"key":"ref36","article-title":"Maps-a piano database for multipitch estimation and automatic transcription of music","author":"emiya","year":"2010"},{"key":"ref31","article-title":"Enabling factorized piano music modeling and generation with the MAESTRO dataset","author":"hawthorne","year":"2019","journal-title":"International Conference on Learning Representations"},{"key":"ref30","article-title":"Progressive distillation for fast sampling of diffusion models","author":"salimans","year":"2022","journal-title":"International Conference on Learning Representations"},{"key":"ref11","article-title":"Hppnet: Modeling the harmonic structure and pitch invariance in piano transcription","author":"wei","year":"2022","journal-title":"Proceedings of the International Society for Music Information Retrieval (ISMIR)"},{"key":"ref33","article-title":"nnaudio: An on-the-fly gpu audio to spectrogram conversion toolbox using 1d convolution neural networks","author":"cheuk","year":"2019"},{"key":"ref10","article-title":"Jointist: Joint learning for multi-instrument transcription and its applications","author":"cheuk","year":"2022"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207605"},{"key":"ref2","article-title":"Bayesian music transcription","author":"cemgil","year":"2004","journal-title":"Ph D thesis"},{"key":"ref1","first-page":"175","article-title":"A classification-based polyphonic piano transcription approach using learned feature representations","author":"nam","year":"2011","journal-title":"Proceedings of the International Society for Music Information Retrieval (ISMIR)"},{"key":"ref17","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume":"37","author":"sohl-dickstein","year":"2015","journal-title":"Proceedings of The 32nd International Conference on Machine Learning"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ASPAA.2003.1285860"},{"key":"ref19","article-title":"Wavegrad: Estimating gradients for waveform generation","author":"chen","year":"2020","journal-title":"International Conference on Learning Representations"},{"key":"ref18","article-title":"Diffwave: A versatile diffusion model for audio synthesis","author":"kong","year":"2020","journal-title":"International Conference on Learning Representations"},{"key":"ref24","first-page":"8821","article-title":"Zero-shot text-to-image generation","author":"ramesh","year":"2021","journal-title":"International Conference on Machine Learning"},{"key":"ref23","article-title":"Analog bits: Generating discrete data using diffusion models with self-conditioning","author":"chen","year":"2022"},{"key":"ref26","article-title":"Multi-instrument music synthesis with spectrogram diffusion","author":"hawthorne","year":"2022","journal-title":"Proceedings of the International Society for Music Information Retrieval (ISMIR)"},{"key":"ref25","first-page":"468","article-title":"Symbolic music generation with diffusion models","author":"mittal","year":"2021","journal-title":"Proceedings of the International Society for Music Information Retrieval (ISMIR)"},{"key":"ref20","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref22","article-title":"Denoising diffusion implicit models","author":"song","year":"2021","journal-title":"International Conference on Learning Representations"},{"key":"ref21","article-title":"Regularizing score-based models with score fokker-planck equations","author":"lai","year":"2022"},{"key":"ref28","article-title":"Classifier-free diffusion guidance","author":"ho","year":"2021","journal-title":"NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications"},{"key":"ref27","article-title":"Skipping the frame-level: Event-based piano transcription with neural semi-CRFs","author":"yan","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref29","article-title":"Denoising diffusion restoration models","author":"kawar","year":"2022"},{"key":"ref8","article-title":"Mt3: Multi-task multitrack music transcription","author":"gardner","year":"2021"},{"key":"ref7","first-page":"50","article-title":"Onsets and frames: Dual-objective piano transcription","author":"hawthorne","year":"2018","journal-title":"Proceedings of the International Society for Music Information Retrieval (ISMIR)"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3121991"},{"key":"ref4","article-title":"Musicvae: Creating a palette for musical scores with machine learning","author":"roberts","year":"2018"},{"key":"ref3","first-page":"1538","article-title":"Unsupervised transcription of piano music","author":"berg-kirkpatrick","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11312"},{"key":"ref5","article-title":"GANSynth: Adversarial neural audio synthesis","author":"engel","year":"2019","journal-title":"International Conference on Learning Representations"}],"event":{"name":"ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Rhodes Island, Greece","start":{"date-parts":[[2023,6,4]]},"end":{"date-parts":[[2023,6,10]]}},"container-title":["ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10094559\/10094560\/10095935.pdf?arnumber=10095935","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T13:57:56Z","timestamp":1700488676000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10095935\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,4]]},"references-count":37,"URL":"https:\/\/doi.org\/10.1109\/icassp49357.2023.10095935","relation":{},"subject":[],"published":{"date-parts":[[2023,6,4]]}}}