{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:37:35Z","timestamp":1763192255115,"version":"3.45.0"},"reference-count":54,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230965","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["OpenBEATs: A Fully Open-Source General-Purpose Audio Encoder"],"prefix":"10.1109","author":[{"given":"Shikhar","family":"Bharadwaj","sequence":"first","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Samuele","family":"Cornell","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Kwanghee","family":"Choi","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Satoru","family":"Fukayama","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology (AIST),Japan"}]},{"given":"Hye-Jin","family":"Shim","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Soham","family":"Deshmukh","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]},{"given":"Shinji","family":"Watanabe","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University,USA"}]}],"member":"263","reference":[{"key":"ref1","article-title":"BEATs: audio pre-training with acoustic tokenizers","author":"Chen","year":"2023","journal-title":"ICML"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i10.21315"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10961"},{"key":"ref4","article-title":"Masked autoencoders that listen","author":"Huang","year":"2022","journal-title":"NeurIPS"},{"article-title":"DCASE 2024 task 4: Sound event detection with heterogeneous data and missing labels","volume-title":"DCASE Workshop","author":"Cornell","key":"ref5"},{"journal-title":"DCASE2023 Challenge, Tech. Rep","article-title":"BEATs-based audio captioning model with instructor embedding supervision and chatgpt mix-up","author":"Wu","key":"ref6"},{"key":"ref7","article-title":"Few-shot bioacoustic event detection using beats","author":"Gelderblom","year":"2023","journal-title":"DCASE2023 Challenge, Tech. Rep"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSPW62465.2024.10626364"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ASRU57964.2023.10389676"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-1873"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3122291"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2018-1456"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.naacl-demo.19"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890750"},{"key":"ref15","article-title":"Mert: Acoustic music understanding model with large-scale self-supervised training","author":"Li","year":"2024","journal-title":"ICLR"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3188113"},{"article-title":"Google USM: Scaling automatic speech recognition beyond 100 languages","year":"2023","author":"Zhang","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.570"},{"key":"ref19","article-title":"Pengi: An audio language model for audio tasks","author":"Deshmukh","year":"2023","journal-title":"NeurIPS"},{"key":"ref20","article-title":"SALMONN: Towards generic hearing abilities for large language models","author":"Tang","year":"2023","journal-title":"ICLR"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.361"},{"article-title":"Mellow: a small audio language model for reasoning","year":"2025","author":"Deshmukh","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-246"},{"key":"ref26","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Alexey","year":"2020","journal-title":"ICLR"},{"key":"ref27","article-title":"Self-supervised learning with random-projection quantizer for speech recognition","author":"Chiu","year":"2022","journal-title":"ICML"},{"journal-title":"NeurIPS","year":"2017","author":"van den Oord","key":"ref28"},{"article-title":"FMA: A dataset for music analysis","volume-title":"International Society for Music Information Retrieval Conference","author":"Defferrard","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00914"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10447250"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.23919\/APSIPAASC55919.2022.9979822"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2991965"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.33682\/006b-jx26"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2021.3133208"},{"article-title":"General-purpose tagging of freesound audio with audioset labels: Task description, dataset, and baseline","volume-title":"DCASE Workshop","author":"Fonseca","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096686"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34548"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909680"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCMC51019.2021.9418035"},{"article-title":"The GTZAN dataset: Its contents, its faults, their effects on evaluation, and its future use","year":"2013","author":"Sturm","key":"ref45"},{"key":"ref46","article-title":"Neural audio synthesis of musical notes with wavenet autoencoders","author":"Engel","year":"2017","journal-title":"ICML"},{"article-title":"The ICME 2025 audio encoder capability challenge","year":"2025","author":"Zhang","key":"ref47"},{"key":"ref48","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","author":"Baevski","year":"2022","journal-title":"ICML"},{"key":"ref49","article-title":"Robust speech recognition via large-scale weak supervision","author":"Radford","year":"2023","journal-title":"ICML"},{"article-title":"EAT: self-supervised pre-training with efficient audio transformer","volume-title":"Proceedings of the International Joint Conference on Artificial Intelligence","author":"Chen","key":"ref50"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095642"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-1242"},{"journal-title":"DCASE2024 Challenge, Tech. Rep","article-title":"Automatic audio captioning with encoder fusion, multi-layer aggregation, and large language model enriched summarization","author":"Jung","key":"ref54"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230965.pdf?arnumber=11230965","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:32:37Z","timestamp":1763191957000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230965\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":54,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230965","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}