{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:27:35Z","timestamp":1763191655770,"version":"3.45.0"},"reference-count":31,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230975","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["FlexSED: Towards Open-Vocabulary Sound Event Detection"],"prefix":"10.1109","author":[{"given":"Jiarui","family":"Hai","sequence":"first","affiliation":[{"name":"Johns Hopkins University,Department of Electrical and Computer Engineering,Maryland,USA"}]},{"given":"Helin","family":"Wang","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Department of Electrical and Computer Engineering,Maryland,USA"}]},{"given":"Weizhe","family":"Guo","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Department of Electrical and Computer Engineering,Maryland,USA"}]},{"given":"Mounya","family":"Elhilali","sequence":"additional","affiliation":[{"name":"Johns Hopkins University,Department of Electrical and Computer Engineering,Maryland,USA"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2021.3090678"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.33682\/006b-jx26"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10127"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446159"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3352248"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888942"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"ref9","first-page":"72983","article-title":"Scaling open-vocabulary object detection","volume":"36","author":"Minderer","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref10","first-page":"52132","article-title":"Geneval: An object-focused framework for evaluating text-to-image alignment","volume":"36","author":"Ghosh","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Convincing rationales for visual question answering reasoning","year":"2024","author":"Li","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2025-1137"},{"article-title":"Tangoflux: Super fast and faithful text to audio generation with flow matching and clap-ranked preference optimization","year":"2024","author":"Hung","key":"ref13"},{"article-title":"Qwen2-audio technical report","year":"2024","author":"Chu","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.361"},{"key":"ref16","first-page":"53728","article-title":"Direct preference optimization: Your language model is secretly a reward model","volume":"36","author":"Rafailov","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890066"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3520017"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889789"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-246"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"article-title":"Gpt-4 technical report","year":"2023","author":"Achiam","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747680"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2019-2680"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052995"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747556"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"article-title":"Beats: Audio pre-training with acoustic tokenizers","year":"2022","author":"Chen","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02171"},{"key":"ref31","first-page":"14200","article-title":"Attention bottlenecks for multimodal fusion","volume":"34","author":"Nagrani","year":"2021","journal-title":"Advances in neural information processing systems"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","start":{"date-parts":[[2025,10,12]]},"location":"Tahoe City, CA, USA","end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230975.pdf?arnumber=11230975","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:22:39Z","timestamp":1763191359000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230975\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":31,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230975","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}