{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,24]],"date-time":"2026-06-24T16:08:13Z","timestamp":1782317293862,"version":"3.54.5"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,6,6]],"date-time":"2021-06-06T00:00:00Z","timestamp":1622937600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,6,6]]},"DOI":"10.1109\/icassp39728.2021.9414235","type":"proceedings-article","created":{"date-parts":[[2021,5,13]],"date-time":"2021-05-13T19:53:45Z","timestamp":1620935625000},"page":"96-100","source":"Crossref","is-referenced-by-count":20,"title":["Self-Supervised VQ-VAE for One-Shot Music Style Transfer"],"prefix":"10.1109","author":[{"given":"Ondrej","family":"Cifka","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Alexey","family":"Ozerov","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Umut","family":"Simsekli","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gael","family":"Richard","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref39","article-title":"WaveNet: A generative model for raw audio","author":"van den oord","year":"2016","journal-title":"SSW"},{"key":"ref38","article-title":"The challenge of realistic music generation: modelling raw audio at scale","author":"dieleman","year":"2018","journal-title":"NeurIPS"},{"key":"ref33","article-title":"Essentia: An audio analysis library for music information retrieval","author":"bogdanov","year":"2013","journal-title":"ISMIR"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TASL.2012.2188515"},{"key":"ref31","article-title":"Supervised symbolic music style translation using synthetic data","author":"c\u00edfka","year":"2019","journal-title":"ISMIR"},{"key":"ref30","article-title":"Rectifier nonlinearities improve neural network acoustic models","author":"maas","year":"2013","journal-title":"ICML"},{"key":"ref37","article-title":"Upsampling artifacts in neural audio synthesis","author":"pons","year":"2021","journal-title":"ICASSP"},{"key":"ref36","article-title":"The &#x2018;Mixing Secrets&#x2019; free multitrack download library","year":"0"},{"key":"ref35","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1007\/978-3-319-24261-3_7","article-title":"Deep metric learning using triplet network","author":"hoffer","year":"2015","journal-title":"International Workshop on Similarity-Based Pattern Recognition"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2013.2251591"},{"key":"ref10","article-title":"Neural discrete representation learning","author":"van den oord","year":"2017","journal-title":"NIPS"},{"key":"ref11","article-title":"Let it Bee &#x2013;towards NMF-inspired audio mosaicing","author":"driedger","year":"2015","journal-title":"ISMIR"},{"key":"ref12","article-title":"Cover song synthesis by analogy","author":"tralie","year":"2018","journal-title":"ISMIR"},{"key":"ref13","article-title":"Music retiler: Using NMF2D source separation for audio mosaicing","author":"foroughmand","year":"2018","journal-title":"Audio Mostly 2018 Sound in Immersion and Emotion (AM18)"},{"key":"ref14","article-title":"Musical mosaicing","author":"zils","year":"2001","journal-title":"COST G-6 Conference on Digital Audio Effects (DAFX-01)"},{"key":"ref15","article-title":"Audio texture synthesis and style transfer","author":"ulyanov","year":"0"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2018.8461711"},{"key":"ref17","article-title":"Bridging audio analysis, perception and synthesis with perceptually-regularized variational timbre spaces","author":"esling","year":"2018","journal-title":"ISMIR"},{"key":"ref18","article-title":"Learning disentangled representations of timbre and pitch for musical instrument sounds using Gaussian mixture variational autoencoders","author":"luo","year":"2019","journal-title":"ISMIR"},{"key":"ref19","article-title":"Assisted sound sample generation with musical conditioning in adversarial auto-encoders","author":"bitton","year":"2019","journal-title":"Proceedings of the 22nd International Conference on Digital Audio Effects (DAFx-19)"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref4","article-title":"TimbreTron: A WaveNet(CycleGAN(CQT(Audio))) pipeline for musical timbre transfer","author":"huang","year":"2019","journal-title":"ICLRE"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASSP.1984.1164317"},{"key":"ref3","article-title":"A universal music translation network","author":"noam mor","year":"2019","journal-title":"ICLRE"},{"key":"ref6","article-title":"Vector-quantized timbre representation","author":"bitton","year":"2020"},{"key":"ref29","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"ICLRE"},{"key":"ref5","article-title":"DDSP: Differentiable digital signal processing","author":"engel","year":"2020","journal-title":"ICLRE"},{"key":"ref8","article-title":"Learning interpretable representation for controllable polyphonic music generation","author":"wang","year":"2020","journal-title":"ISMIR"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3019642"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.167"},{"key":"ref9","article-title":"Zero-shot singing voice conversion","author":"nercessian","year":"2020","journal-title":"ISMIR"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.265"},{"key":"ref20","article-title":"Unsupervised disentanglement of pitch and timbre for isolated musical instrument sounds","author":"luo","year":"2020","journal-title":"ISMIR"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"ref21","article-title":"Efficient estimation of word representations in vector space","author":"mikolov","year":"2013","journal-title":"ICLRE"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00840"},{"key":"ref23","article-title":"Unsupervised representation learning by predicting image rotations","author":"gidaris","year":"2018","journal-title":"ICLRE"},{"key":"ref26","article-title":"Sound Fonts and SFZ files","year":"0","journal-title":"Handbook for Muse Score 3"},{"key":"ref25","author":"raffel","year":"2016","journal-title":"Ph D thesis"}],"event":{"name":"ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Toronto, ON, Canada","start":{"date-parts":[[2021,6,6]]},"end":{"date-parts":[[2021,6,11]]}},"container-title":["ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9413349\/9413350\/09414235.pdf?arnumber=9414235","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T15:40:58Z","timestamp":1652197258000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9414235\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,6]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/icassp39728.2021.9414235","relation":{},"subject":[],"published":{"date-parts":[[2021,6,6]]}}}