{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,13]],"date-time":"2025-05-13T16:31:10Z","timestamp":1747153870261,"version":"3.40.5"},"reference-count":28,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2024,1,15]],"date-time":"2024-01-15T00:00:00Z","timestamp":1705276800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,15]],"date-time":"2024-01-15T00:00:00Z","timestamp":1705276800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach. Intell. Res."],"published-print":{"date-parts":[[2024,2]]},"DOI":"10.1007\/s11633-023-1441-9","type":"journal-article","created":{"date-parts":[[2024,1,15]],"date-time":"2024-01-15T07:01:55Z","timestamp":1705302115000},"page":"55-62","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Audio Mixing Inversion via Embodied Self-supervised 
Learning"],"prefix":"10.1007","volume":"21","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-0876-3709","authenticated-orcid":false,"given":"Haotian","family":"Zhou","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feng","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-5236-7469","authenticated-orcid":false,"given":"Xihong","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,1,15]]},"reference":[{"key":"1441_CR1","doi-asserted-by":"publisher","DOI":"10.4324\/9781315716947","volume-title":"Mixing Audio: Concepts, Practices, and Tools","author":"R Izhaki","year":"2017","unstructured":"R. Izhaki. Mixing Audio: Concepts, Practices, and Tools, New York, USA: Routledge, 2017."},{"key":"1441_CR2","doi-asserted-by":"publisher","first-page":"608","DOI":"10.1121\/10.0005622","volume":"150","author":"J T Colonel","year":"2021","unstructured":"J. T. Colonel, J. Reiss. Reverse engineering of a recording mix with differentiable digital signal processing. The Journal of the Acoustical Society of America, vol. 150, pp. 608\u2013619, 2021. DOI: https:\/\/doi.org\/10.1121\/10.0005622.","journal-title":"The Journal of the Acoustical Society of America"},{"issue":"5","key":"1441_CR3","doi-asserted-by":"publisher","first-page":"1830","DOI":"10.1109\/TSP.2007.912893","volume":"56","author":"G S Yu","year":"2008","unstructured":"G. S. Yu, S. \u00c9 Mallat, E. Bacry. Audio denoising by time-frequency block thresholding. IEEE Transactions on Signal Processing, vol. 56, no. 5, pp. 1830\u20131839, 2008. DOI: https:\/\/doi.org\/10.1109\/TSP.2007.912893.","journal-title":"IEEE Transactions on Signal Processing"},{"issue":"3","key":"1441_CR4","first-page":"359","volume":"87","author":"K Lebart","year":"2001","unstructured":"K. Lebart, J. 
M. Boucher, P. N. Denbigh. A new method based on spectral subtraction for speech dereverberation. Acta Acustica United with Acustica, vol. 87, no. 3, pp. 359\u2013366, 2001.","journal-title":"Acta Acustica United with Acustica"},{"issue":"2","key":"1441_CR5","doi-asserted-by":"publisher","first-page":"434","DOI":"10.1109\/78.554307","volume":"45","author":"A Belouchrani","year":"1997","unstructured":"A. Belouchrani, K. Abed-Meraim, J. F. Cardoso, E. Moulines. A blind source separation technique using second-order statistics. IEEE Transactions on Signal Processing, vol. 45, no. 2, pp. 434\u2013444, 1997. DOI: https:\/\/doi.org\/10.1109\/78.554307.","journal-title":"IEEE Transactions on Signal Processing"},{"key":"1441_CR6","doi-asserted-by":"publisher","first-page":"2985","DOI":"10.1109\/ICASSP.2000.861162","volume":"5","author":"A Jourjine","year":"2000","unstructured":"A. Jourjine, S. Rickard, O. Yilmaz. Blind separation of disjoint orthogonal signals: Demixing n sources from 2 mixtures. In Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing, Istanbul, Turkey, vol. 5, pp. 2985\u20132988, 2000. DOI: https:\/\/doi.org\/10.1109\/ICASSP.2000.861162.","journal-title":"Proceedings of IEEE International Conference on Acoustics, Speech, and Signal Processing"},{"key":"1441_CR7","unstructured":"S. Gorlow, S. Marchand. Reverse engineering stereo music recordings pursuing an informed two-stage approach, [Online], Available: https:\/\/hal.science\/hal-00857676\/document, 2013."},{"issue":"7","key":"1441_CR8","doi-asserted-by":"publisher","first-page":"1434","DOI":"10.1109\/TASL.2013.2253099","volume":"21","author":"S Gorlow","year":"2013","unstructured":"S. Gorlow, J. D. Reiss. Model-based inversion of dynamic range compression. IEEE Transactions on Audio, Speech, and Language Processing, vol. 21, no. 7, pp. 1434\u20131444, 2013. 
DOI: https:\/\/doi.org\/10.1109\/TASL.2013.2253099.","journal-title":"IEEE Transactions on Audio, Speech, and Language Processing"},{"key":"1441_CR9","unstructured":"M. Ram\u00edrez, J. D. Reiss. End-to-end equalization with convolutional neural networks, [Online], Available: https:\/\/www.dafx.de\/paper-archive\/2018\/papers\/DAFx2018_paper_27.pdf, 2018."},{"key":"1441_CR10","unstructured":"S. Hawley, B. Colburn, S. I. Mimilakis. Profiling audio compressors with deep neural networks, [Online], Available: https:\/\/arxiv.org\/abs\/1905.11928, 2019."},{"key":"1441_CR11","unstructured":"C. J. Steinmetz, J. D. Reiss. Efficient neural networks for real-time modeling of analog dynamic range compression, [Online], Available: https:\/\/arxiv.org\/abs\/2102.06200, 2021."},{"key":"1441_CR12","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1109\/ICASSP.2019.8683529","volume-title":"Proceedings of ICASSP\/IEEE International Conference on Acoustics, Speech and Signal Processing","author":"M A M Ram\u00edrez","year":"2019","unstructured":"M. A. M. Ram\u00edrez, J. D. Reiss. Modeling nonlinear audio effects with end-to-end deep neural networks. In Proceedings of ICASSP\/IEEE International Conference on Acoustics, Speech and Signal Processing, IEEE, Brighton, UK, pp. 171\u2013175, 2019. DOI: https:\/\/doi.org\/10.1109\/ICASSP.2019.8683529."},{"key":"1441_CR13","series-title":"Ph.D. dissertation","volume-title":"Deep Learning for Audio Effects Modeling","author":"M A Mart\u00ednez-Ram\u00edrez","year":"2021","unstructured":"M. A. Mart\u00ednez-Ram\u00edrez. Deep Learning for Audio Effects Modeling, Ph.D. dissertation, Queen Mary University of London, UK, 2021."},{"key":"1441_CR14","first-page":"563","volume":"58","author":"D Barchiesi","year":"2010","unstructured":"D. Barchiesi, J. Reiss. Reverse engineering of a mix. Journal of The Audio Engineering Society, vol. 58, no. 7, pp. 
563\u2013576, 2010.","journal-title":"Journal of The Audio Engineering Society"},{"key":"1441_CR15","unstructured":"J. H. Engel, L. Hantrakul, C. J. Gu, A. Roberts. DDSP: Differentiable digital signal processing. In Proceedings of the 8th International Conference on Learning Representations, Addis Ababa, Ethiopia, 2019."},{"key":"1441_CR16","unstructured":"J. T. Colonel, M. Comunit\u00e0, J. Reiss. Reverse engineering memoryless distortion effects with differentiable waveshapers, [Online], Available: https:\/\/www.eecs.qmul.ac.uk\/~josh\/documents\/2022\/21955.pdf, 2022."},{"key":"1441_CR17","unstructured":"J. T. Colonel, J. D. Reiss. Approximating ballistics in a differentiable dynamic range compressor, [Online], Available: https:\/\/www.eecs.qmul.ac.uk\/~josh\/documents\/2022\/21915.pdf, 2022."},{"key":"1441_CR18","unstructured":"Y. F. Sun, X. H. Wu. Embodied self-supervised learning by coordinated sampling and training, [Online], Available: https:\/\/arxiv.org\/abs\/2006.13350, 2020."},{"key":"1441_CR19","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Proceedings of the 18th International Conference on Medical Image Computing and Computer-assisted Intervention","author":"O Ronneberger","year":"2015","unstructured":"O. Ronneberger, P. Fischer, T. Brox. U-Net: Convolutional networks for biomedical image segmentation. In Proceedings of the 18th International Conference on Medical Image Computing and Computer-assisted Intervention, Springer, Munich, Germany, pp. 234\u2013241, 2015. DOI: https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28."},{"key":"1441_CR20","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1109\/ICASSP39728.2021.9414364","volume-title":"Proceedings of ICASSP\/IEEE International Conference on Acoustics, Speech and Signal Processing","author":"C J Steinmetz","year":"2021","unstructured":"C. J. Steinmetz, J. Pons, S. Pascual, J. Serr\u00e0. 
Automatic multitrack mixing with a differentiable mixing console of neural audio effects. In Proceedings of ICASSP\/IEEE International Conference on Acoustics, Speech and Signal Processing, IEEE, Toronto, Canada, pp. 71\u201375, 2021. DOI: https:\/\/doi.org\/10.1109\/ICASSP39728.2021.9414364."},{"key":"1441_CR21","doi-asserted-by":"crossref","unstructured":"A. Gulati, J. Qin, C. C. Chiu, N. Parmar, Y. Zhang, J. H. Yu, W. Han, S. B. Wang, Z. D. Zhang, Y. H. Wu, R. M. Pang. Conformer: Convolution-augmented transformer for speech recognition. In Proceedings of the 21st Annual Conference of the International Speech Communication Association, Shanghai, China, pp. 5036\u20135040, 2020.","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"1441_CR22","unstructured":"D. Braun. DawDreamer: Bridging the gap between digital audio workstations and python interfaces, [Online], Available: https:\/\/arxiv.org\/abs\/2111.09931, 2021."},{"key":"1441_CR23","unstructured":"O. Gillet, G. Richard. ENST-Drums: An extensive audiovisual database for drum signals processing. In Proceedings of the 7th International Conference on Music Information Retrieval, Victoria, Canada, pp. 156\u2013159, 2006."},{"key":"1441_CR24","unstructured":"R. M. Bittner, J. Salamon, M. Tierney, M. Mauch, C. Cannam, J. P. Bello. MedleyDB: A multitrack dataset for annotation-intensive MIR research. In Proceedings of the 15th International Society for Music Information Retrieval Conference, Taipei, China, pp. 155\u2013160, 2014."},{"key":"1441_CR25","unstructured":"R. Bittner, J. Wilkins, H. Yip, J. Bello. MedleyDB 2.0: New data and a system for sustainable data collection. In Proceedings of International Conference on Music Information Retrieval, New York, USA, 2016, [Online], Available: https:\/\/wp.nyu.edu\/ismir2016\/wp-content\/uploads\/sites\/2294\/2016\/08\/bittner-medleydb.pdf."},{"key":"1441_CR26","unstructured":"D. P. Kingma, J. Ba. 
Adam: A method for stochastic optimization, [Online], Available: https:\/\/arxiv.org\/abs\/1412.6980, 2014."},{"key":"1441_CR27","doi-asserted-by":"publisher","unstructured":"R. Yamamoto, E. Song, J. M. Kim. Parallel wavegan: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram. In Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing, Barcelona, Spain, pp. 6199\u20136203, 2020. DOI: https:\/\/doi.org\/10.1109\/ICASSP40776.2020.9053795.","DOI":"10.1109\/ICASSP40776.2020.9053795"},{"key":"1441_CR28","unstructured":"C. J. Steinmetz, J. Reiss. Pyloudnorm: A simple yet flexible loudness meter in python, [Online], Available: https:\/\/csteinmetz1.github.io\/pyloudnorm-eval\/paper\/pyloudnorm_preprint.pdf, 2021."}],"container-title":["Machine Intelligence Research"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-023-1441-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11633-023-1441-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11633-023-1441-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T09:41:51Z","timestamp":1705570911000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11633-023-1441-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,15]]},"references-count":28,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2024,2]]}},"alternative-id":["1441"],"URL":"https:\/\/doi.org\/10.1007\/s11633-023-1441-9","relation":{},"ISSN":["2731-538X","2731-5398"],"issn-type":[{"type":"print","value":"2731-538X"},{"type
":"electronic","value":"2731-5398"}],"subject":[],"published":{"date-parts":[[2024,1,15]]},"assertion":[{"value":"29 November 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 March 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 January 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declared that they have no conflicts of interest to this work.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations of conflict of interest"}}]}}