{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T22:45:11Z","timestamp":1775083511763,"version":"3.50.1"},"reference-count":66,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100007040","name":"Singapore University of Technology and Design - Massachusetts Institute of Technology (SUTD-MIT) International Design Center","doi-asserted-by":"publisher","award":["IDG31800103"],"award-info":[{"award-number":["IDG31800103"]}],"id":[{"id":"10.13039\/501100007040","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Ministry of Education (MOE) Startup Research","award":["MOE2018-T2-2-161"],"award-info":[{"award-number":["MOE2018-T2-2-161"]}]},{"name":"Ministry of Education (MOE) Startup Research","award":["SRG ISTD 2017 129"],"award-info":[{"award-number":["SRG ISTD 2017 129"]}]},{"name":"Singapore International Graduate Award (SINGA) from the Agency for Science, Technology and Research","award":["SING-2018-02-0204"],"award-info":[{"award-number":["SING-2018-02-0204"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2020]]},"DOI":"10.1109\/access.2020.3019084","type":"journal-article","created":{"date-parts":[[2020,8,24]],"date-time":"2020-08-24T20:47:24Z","timestamp":1598302044000},"page":"161981-162003","source":"Crossref","is-referenced-by-count":65,"title":["nnAudio: An on-the-Fly GPU Audio to Spectrogram Conversion Toolbox Using 1D Convolutional Neural Networks"],"prefix":"10.1109","volume":"8","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3213-8242","authenticated-orcid":false,"given":"Kin Wai","family":"Cheuk","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8230-7724","authenticated-orcid":false,"given":"Hans","family":"Anderson","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7260-2447","authenticated-orcid":false,"given":"Kat","family":"Agres","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8607-1640","authenticated-orcid":false,"given":"Dorien","family":"Herremans","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3184558.3191822"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CSCI.2017.82"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1811"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.3390\/app7090901"},{"key":"ref31","first-page":"475","article-title":"On the potential of simple framewise approaches to piano transcription","author":"kelz","year":"2016","journal-title":"Proc of the Int Soc for Music Inf Retr Conf (ISMIR)"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2923806"},{"key":"ref37","first-page":"1","article-title":"Acoustic scene classification using deep convolutional neural network and multiple spectrograms fusion","author":"weiping","year":"2017","journal-title":"Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)"},{"key":"ref36","first-page":"1","article-title":"Acoustic scene classification by ensemble of spectrograms based on adaptive temporal divisions","author":"sakashita","year":"2018","journal-title":"Proc DCASE2016 Challenge"},{"key":"ref35","article-title":"Combining high-level features of raw audio waves and mel-spectrograms for audio tagging","author":"lederle","year":"2018","journal-title":"arXiv 1811 10708"},{"key":"ref34","article-title":"Audio spectrogram representations for processing with convolutional neural networks","author":"wyse","year":"2017","journal-title":"arXiv 1706 09559"},{"key":"ref60","first-page":"11","author":"emiya","year":"2010","journal-title":"MAPS&#x2014;A piano database for multipitch estimation and automatic transcription of music"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1117\/12.2522969"},{"key":"ref61","article-title":"Importance of data loading pipeline in training deep neural networks","author":"zolnouri","year":"2020","journal-title":"arXiv 2005 02130"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461686"},{"key":"ref28","author":"ericson","year":"2019","journal-title":"How to Install Torch Audio on Windows 10 Conda?"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s10844-013-0258-3"},{"key":"ref27","author":"qu","year":"2019","journal-title":"Some Problems When Installing Torchaudio on MAC"},{"key":"ref65","first-page":"678","article-title":"Automatic music transcription and ethnomusicology: A user study","author":"holzapfel","year":"2019","journal-title":"Proc of the Int Soc for Music Inf Retr Conf (ISMIR)"},{"key":"ref66","first-page":"1","article-title":"The impact of audio input representations on neural network based music transcription","author":"cheuk","year":"2020","journal-title":"Proc Int Joint Conf Neural Netw (IJCNN)"},{"key":"ref29","article-title":"NnAudio: A pytorch audio processing tool using 1D convolution neural networks","author":"cheuk","year":"2019","journal-title":"Proc ISMIR&#x2013;Late Breaking Demo"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1989.266448"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TAI.1989.65324"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/MMRP.2019.00020"},{"key":"ref22","first-page":"1","article-title":"Learning features of music from scratch","author":"thickstun","year":"2017","journal-title":"Proc Int Conf Learn Represent (ICLR)"},{"key":"ref21","first-page":"376","article-title":"Towards interpretable polyphonic transcription with invertible neural networks","author":"kelz","year":"2019","journal-title":"Proc of the Int Soc for Music Inf Retr Conf (ISMIR)"},{"key":"ref24","author":"abadi","year":"2015","journal-title":"TensorFlow Large-Scale Machine Learning on Heterogeneous Systems"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.25080\/Majora-7b98e3ed-003"},{"key":"ref26","first-page":"1","article-title":"Automatic differentiation in Pytorch","author":"paszke","year":"2017","journal-title":"Proc NIPS Autodiff Workshop"},{"key":"ref25","article-title":"Kapre: On-GPU audio preprocessing layers for a quick implementation of deep neural network models with keras","author":"choi","year":"2017","journal-title":"arXiv 1706 05781"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1999.758101"},{"key":"ref51","author":"o\u2019shaughnessy","year":"1987","journal-title":"Speech Communications Human and Machine"},{"key":"ref59","first-page":"3","article-title":"Constant-Q transform toolbox for music processing","author":"sch\u00f6rkhuber","year":"2010","journal-title":"Proc 7th Sound Music Comput Conf"},{"key":"ref58","first-page":"1","article-title":"Implementation of fir filter using efficient window function and its application in filtering a speech signal","volume":"1","author":"rajput","year":"2012","journal-title":"Int J Electr Electron Mech Controls"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1978.1170547"},{"key":"ref56","author":"smith","year":"2007","journal-title":"Introduction to Digital Filters with Audio Applications"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1111\/j.1365-246X.1988.tb01131.x"},{"key":"ref54","volume":"64","author":"rabiner","year":"2011","journal-title":"Theory and Applications of Digital Speech Processing"},{"key":"ref53","author":"slaney","year":"1998","journal-title":"A Matlab Toolbox for Auditory Modeling Work"},{"key":"ref52","first-page":"12","article-title":"The HTK book","volume":"3","author":"young","year":"2002","journal-title":"Cambridge Univ Eng Dept"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2908700"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.3390\/app8010150"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.32470\/CCN.2018.1153-0"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003976"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1016\/j.bspc.2018.08.035"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU46091.2019.9003853"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ECACE.2019.8679106"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-018-3933-z"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9054582"},{"key":"ref18","first-page":"67","article-title":"Zero-shot learning for audio-based music classification and tagging","author":"choi","year":"2019","journal-title":"Proc of the Int Soc for Music Inf Retr Conf (ISMIR)"},{"key":"ref19","first-page":"107","article-title":"Cover detection using dominant melody embeddings","author":"doras","year":"2019","journal-title":"Proc of the Int Soc for Music Inf Retr Conf (ISMIR)"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1986.1168654"},{"key":"ref3","article-title":"Feedforward neural network system for the detection and characterization of sonar signals with characteristic spectrogram textures","author":"recchione","year":"1996"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1121\/1.400476"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1980.1163420"},{"key":"ref8","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"arXiv 1609 03499"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1121\/1.404385"},{"key":"ref49","first-page":"299","article-title":"A new frequency scala for acoustic measurements","author":"koening","year":"1949","journal-title":"Bell Lab Rec"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2019.2909479"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1121\/1.1901999"},{"key":"ref45","first-page":"60","article-title":"Fourier transform theorems","author":"oppenheim","year":"1989","journal-title":"Discrete-Time Signal Processing"},{"key":"ref48","first-page":"139","author":"fant","year":"1949","journal-title":"Analys av de svenska konsonantljuden"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.2307\/1417526"},{"key":"ref42","first-page":"1995","article-title":"Convolutional networks for images, speech, and time series","volume":"3361","author":"lecun","year":"1995","journal-title":"Handbook Brain Theory Neural Netw"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164399"},{"key":"ref44","first-page":"289","article-title":"Short-time Fourier transform","author":"nawab","year":"1987","journal-title":"Advanced Topics in Signal Processing"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/8948470\/09174990.pdf?arnumber=9174990","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2021,12,17]],"date-time":"2021-12-17T19:55:25Z","timestamp":1639770925000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9174990\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"references-count":66,"URL":"https:\/\/doi.org\/10.1109\/access.2020.3019084","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]}}}