{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:37:35Z","timestamp":1763192255116,"version":"3.45.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230951","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["Self-Supervised Representation Learning with a JEPA Framework for Multi-instrument Music Transcription"],"prefix":"10.1109","author":[{"given":"Mary","family":"Pilataki","sequence":"first","affiliation":[{"name":"Queen Mary University of London,London,UK"}]},{"given":"Matthias","family":"Mauch","sequence":"additional","affiliation":[{"name":"Apple,London,UK"}]},{"given":"Simon","family":"Dixon","sequence":"additional","affiliation":[{"name":"Queen Mary University of London,London,UK"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2018.2869928"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030482"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746549"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475405"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446141"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3696409.3700202"},{"article-title":"Jointist: Joint learning for multi-instrument transcription and its applications","year":"2022","author":"Cheuk","key":"ref7"},{"key":"ref8","article-title":"Perceiver: General perception with iterative attention","volume-title":"ArXiv","volume":"abs\/2103.03206","author":"Jaegle","year":"2021"},{"article-title":"MT3: Multi-task multitrack music transcription","volume-title":"Proc. ICLR","author":"Gardner","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP58920.2024.10734819"},{"article-title":"MR-MT3: Memory retaining multi-track music transcription to mitigate instrument leakage","year":"2024","author":"Tan","key":"ref11"},{"article-title":"Melody transcription via generative pre-training","volume-title":"Proc. ISMIR","author":"Donahue","key":"ref12"},{"article-title":"Codified audio language modeling learns useful representations for music information retrieval","volume-title":"Proc. ISMIR","author":"Castellon","key":"ref13"},{"article-title":"Mert: Acoustic music understanding model with large-scale self-supervised training","volume-title":"Proc. ICLR","author":"Li","key":"ref14"},{"article-title":"MuseBERT: Pre-training music representation for music understanding and controllable generation","volume-title":"Proc. ISMIR","author":"Wang","key":"ref15"},{"article-title":"Contrastive learning of musical representations","volume-title":"Proc. ISMIR","author":"Spijkervet","key":"ref16"},{"article-title":"Semi-supervised contrastive learning of musical representations","volume-title":"Proc. ISMIR","author":"Guinot","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3389636"},{"article-title":"Stem-JEPA: A joint-embedding predictive architecture for musical stem compatibility estimation","volume-title":"Proc. ISMIR","author":"Riou","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01499"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2021","author":"Kolesnikov","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389648"},{"article-title":"The kaldi speech recognition toolkit","volume-title":"IEEE 2011 workshop on automatic speech recognition and understanding","author":"Povey","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TSA.2002.800560"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747217"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446182"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/waspaa.2019.8937170"},{"article-title":"Learning features of music from scratch","volume-title":"Proc. ICLR","author":"Thickstun","key":"ref28"},{"article-title":"Unaligned supervision for automatic music transcription in the wild","volume-title":"Proc. ICML","author":"Maman","key":"ref29"},{"article-title":"Guitarset: A dataset for guitar transcription","volume-title":"Proc. ISMIR","author":"Xi","key":"ref30"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2856090"},{"article-title":"Enabling factorized piano music modeling and generation with the MAESTRO dataset","volume-title":"Proc. ICLR","author":"Hawthorne","key":"ref32"},{"article-title":"Decoupled weight decay regularization","volume-title":"Proc. ICLR","author":"Loshchilov","key":"ref33"},{"article-title":"Exponential moving average of weights in deep learning: Dynamics and benefits","year":"2024","author":"Morales-Brotons","key":"ref34"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"article-title":"Jukebox: A generative model for music","year":"2020","author":"Dhariwal","key":"ref36"},{"article-title":"Understanding intermediate layers using linear classifier probes","volume-title":"Proc. ICLR","author":"Alain","key":"ref37"},{"article-title":"mir eval: A transparent implementation of common MIR metrics","volume-title":"Proc. ISMIR","author":"Raffel","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1080\/03610927408827101"},{"article-title":"Neural audio synthesis of musical notes with wavenet autoencoders","volume-title":"Proceedings of the 34th International Conference on Machine Learning","author":"Engel","key":"ref40"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230951.pdf?arnumber=11230951","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:32:37Z","timestamp":1763191957000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230951\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230951","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}