{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T21:47:58Z","timestamp":1777326478029,"version":"3.51.4"},"reference-count":53,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,9,8]],"date-time":"2021-09-08T00:00:00Z","timestamp":1631059200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,9,8]],"date-time":"2021-09-08T00:00:00Z","timestamp":1631059200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,9,8]]},"DOI":"10.23919\/dafx51585.2021.9768298","type":"proceedings-article","created":{"date-parts":[[2022,5,11]],"date-time":"2022-05-11T20:33:41Z","timestamp":1652301221000},"page":"230-237","source":"Crossref","is-referenced-by-count":25,"title":["A Generative Model for Raw Audio Using Transformer Architectures"],"prefix":"10.23919","author":[{"given":"Prateek","family":"Verma","sequence":"first","affiliation":[{"name":"Center for Computer Research in Music and Acoustics Stanford University,Stanford,CA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chris","family":"Chafe","sequence":"additional","affiliation":[{"name":"Center for Computer Research in Music and Acoustics Stanford University,Stanford,CA,USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","article-title":"vq-wav2vec: Self-supervised learning of discrete speech repre-sentations","author":"baevski","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref38","article-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations","author":"baevski","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref33","article-title":"An image is worth 16&#x00D7;16 words: Trans-formers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00033"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref30","article-title":"Progen: Language modeling for pro-tein generation","author":"madani","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref37","article-title":"Jukebox: A gener-ative model for music","author":"dhariwal","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref36","article-title":"Neural discrete representation learning","author":"van den oord","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref35","article-title":"Generating long sequences with sparse transformers","author":"child","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref34","first-page":"4055","article-title":"Im-age transformer","author":"parmar","year":"0","journal-title":"Int Conference on Machine Learning"},{"key":"ref28","article-title":"Audio transformers: Transformer architectures for large scale audio understanding. adieu convolutions","author":"verma","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref27","article-title":"Music transformer","author":"huang","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref29","article-title":"Language models are few-shot learners","author":"brown","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref2","first-page":"526","article-title":"The synthesis of complex audio spectra by means of frequency modulation","volume":"21","author":"john","year":"1973","journal-title":"Journal of the Audio Engineering Society"},{"key":"ref1","article-title":"Wavenet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref20","article-title":"Ctrl: A conditional transformer language model for controllable generation","author":"keskar","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-1"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2017-1452"},{"key":"ref24","first-page":"2410","article-title":"Efficient neural audio synthesis","author":"kalchbrenner","year":"0","journal-title":"International Confer-ence on Machine Learning"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2016-679"},{"key":"ref26","year":"2021","journal-title":"Transformers back in 1991"},{"key":"ref25","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"ArXiv Preprint"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref53","author":"sotelo","year":"2017","journal-title":"Char2Wav End-to-end speech synthesis"},{"key":"ref52","doi-asserted-by":"crossref","first-page":"8823","DOI":"10.1109\/ACCESS.2016.2628440","article-title":"An overview on networked music performance technologies","volume":"4","author":"chafe","year":"2016","journal-title":"IEEE Access"},{"key":"ref10","first-page":"3918","article-title":"Paral-lel wavenet: Fast high-fidelity speech synthesis","author":"oord","year":"0","journal-title":"Inter-national Conference on Machine Learning"},{"key":"ref11","article-title":"Neuralo-gram: A deep neural network based representation for audio signals","author":"chafe","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.1999.759875"},{"key":"ref12","article-title":"A framework for contrastive and generative learning of audio representations","author":"verma","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref13","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref15","article-title":"Con-ditional end-to-end audio transforms","author":"haque","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref16","article-title":"Neural style transfer for audio spectograms","author":"verma","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref17","article-title":"End-to-end spoken language translation","author":"guo","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682553"},{"key":"ref19","first-page":"268","article-title":"A deep learning approach for low-latency packet loss concealment of audio signals in networked mu-sic performance applications","author":"verma","year":"0","journal-title":"2020 27th Conference of Open Innovations Association (FRUCT) IEEE"},{"key":"ref4","article-title":"The challenge of realistic music generation: modelling raw audio at scale","author":"dieleman","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.2307\/3680062"},{"key":"ref6","article-title":"A universal music translation network","author":"mor","year":"0","journal-title":"ArXiv Preprint"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8462417"},{"key":"ref8","article-title":"Wave-u-net: A multi-scale neural network for end-to-end audio source separation","author":"stoller","year":"2018","journal-title":"ArXiv Preprint"},{"key":"ref7","first-page":"4779","article-title":"Natural tts syn-thesis by conditioning wavenet on mel spectrogram predictions","author":"jonathan","year":"0","journal-title":"2018 IEEE International Conference on Acous-tics Speech and Signal Processing (ICASSP) IEEE"},{"key":"ref49","first-page":"265","article-title":"Tensor-flow: A system for large-scale machine learning","author":"abadi","year":"0","journal-title":"12th USENIX Symposium on Operating Systems Design and Implementation ( OSDI 16)"},{"key":"ref9","doi-asserted-by":"crossref","first-page":"1118","DOI":"10.21437\/Interspeech.2017-314","article-title":"Speaker-dependent wavenet vocoder","volume":"2017","author":"tamamori","year":"2017","journal-title":"InterSpeech"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-4012"},{"key":"ref45","first-page":"18","article-title":"librosa: Audio and music signal analysis in python","volume":"8","author":"brian","year":"2015","journal-title":"Proceedings of the 14th Python in Science Conference"},{"key":"ref48","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"ArXiv Preprint"},{"key":"ref47","article-title":"Neu-ral machine translation in linear time","author":"kalchbrenner","year":"2016","journal-title":"ArXiv Preprint"},{"key":"ref42","first-page":"155","article-title":"Medleydb: A multitrack dataset for annotation-intensive mir research","volume":"14","author":"bittner","year":"2014","journal-title":"ISMIR"},{"key":"ref41","author":"fedus","year":"2021","journal-title":"Switch transformers Scaling to trillion parameter models with sim-ple and efficient sparsity"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1038\/s41592-019-0686-2"},{"key":"ref43","article-title":"Enabling factor-ized piano music modeling and generation with the maestro dataset","author":"hawthorne","year":"2018","journal-title":"ArXiv Preprint"}],"event":{"name":"2021 24th International Conference on Digital Audio Effects (DAFx)","location":"Vienna, Austria","start":{"date-parts":[[2021,9,8]]},"end":{"date-parts":[[2021,9,10]]}},"container-title":["2021 24th International Conference on Digital Audio Effects (DAFx)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9768159\/9768211\/09768298.pdf?arnumber=9768298","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,4]],"date-time":"2022-07-04T20:11:46Z","timestamp":1656965506000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9768298\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,8]]},"references-count":53,"URL":"https:\/\/doi.org\/10.23919\/dafx51585.2021.9768298","relation":{},"subject":[],"published":{"date-parts":[[2021,9,8]]}}}