{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T12:54:48Z","timestamp":1775220888058,"version":"3.50.1"},"reference-count":52,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,12]],"date-time":"2025-10-12T00:00:00Z","timestamp":1760227200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,12]]},"DOI":"10.1109\/waspaa66052.2025.11230997","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:47Z","timestamp":1763146007000},"page":"1-5","source":"Crossref","is-referenced-by-count":1,"title":["TACOS: Temporally-aligned Audio CaptiOnS for Language-Audio Pretraining"],"prefix":"10.1109","author":[{"given":"Paul","family":"Primus","sequence":"first","affiliation":[{"name":"Johannes Kepler University,Institute of Computational Perception (CP-JKU),Austria"}]},{"given":"Florian","family":"Schmid","sequence":"additional","affiliation":[{"name":"Johannes Kepler University,Institute of Computational Perception (CP-JKU),Austria"}]},{"given":"Gerhard","family":"Widmer","sequence":"additional","affiliation":[{"name":"Johannes Kepler University,Institute of Computational Perception (CP-JKU),Austria"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2021-698"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2022-227"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096110"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096787"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3376984"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10127"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446159"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3352248"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-714"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-61"},{"key":"ref12","first-page":"161","article-title":"Distilling the knowledge of transformers and cnns with CP-mobile","volume-title":"Proc. DCASE","author":"Schmid"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10890296"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414579"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3149712"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-11115"},{"key":"ref18","first-page":"151","article-title":"Advancing natural-language based audio retrieval with PaSST and large audio-caption data sets","volume-title":"Proc. DCASE","author":"Primus"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746786"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1186\/s13636-022-00259-2"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/WASPAA.2017.8170058"},{"key":"ref22","first-page":"170","article-title":"Automated audio captioning by fine-tuning BART with audioset tags","volume-title":"Proc. DCASE","author":"Gontier"},{"key":"ref23","first-page":"211","article-title":"Audio captioning transformer","volume-title":"Proc. DCASE","author":"Mei"},{"key":"ref24","article-title":"Audiogen: Textually guided audio generation","volume-title":"Proc. ICLR","author":"Kreuk"},{"key":"ref25","first-page":"13916","article-title":"Make-an-audio: Text-to-audio generation with prompt-enhanced diffusion models","volume-title":"Proc. ICML","author":"Huang"},{"key":"ref26","first-page":"21450","article-title":"Audioldm: Text-to-audio generation with latent diffusion models","volume-title":"Proc. ICML","author":"Liu"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2023.3288409"},{"key":"ref28","article-title":"Listen, think, and understand","volume-title":"Proc. ICLR","author":"Gong"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.23919\/EUSIPCO55093.2022.9909680"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3010650"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052990"},{"key":"ref32","first-page":"119","article-title":"Audiocaps: Generating captions for audios in the wild","volume-title":"Proc. NAACL-HLT","author":"Kim"},{"key":"ref33","first-page":"90","article-title":"Diversity and bias in audio captioning datasets","volume-title":"Proc. DCASE","author":"Mart\u00edn-Morat\u00f3"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10097117"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10889879"},{"key":"ref38","article-title":"FLAM: frame-wise language-audio modeling","volume":"abs\/2505.05335","author":"Wu","year":"2025","journal-title":"CoRR"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP58920.2024.10734763"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/2502081.2502245"},{"key":"ref41","article-title":"Gpt-4o mini: advancing cost-efficient intelligence","year":"2024"},{"key":"ref42","article-title":"Label Studio: Data labeling software","year":"2025"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref44","article-title":"Roberta: A robustly optimized BERT pretraining approach","volume":"abs\/1907.11692","author":"Liu","year":"2019","journal-title":"CoRR"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref46","article-title":"Pointer sentinel mixture models","volume-title":"Proc. ICLR","author":"Merity"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3428908"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49660.2025.10888942"},{"key":"ref49","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. ICLR","author":"Kingma"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9052995"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747556"},{"key":"ref52","first-page":"121","article-title":"Estimated audio-caption correspondences improve language-based audio retrieval","volume-title":"Proc. DCASE","author":"Primus"}],"event":{"name":"2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)","location":"Tahoe City, CA, USA","start":{"date-parts":[[2025,10,12]]},"end":{"date-parts":[[2025,10,15]]}},"container-title":["2025 IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11230875\/11230917\/11230997.pdf?arnumber=11230997","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,17]],"date-time":"2025-11-17T18:38:39Z","timestamp":1763404719000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11230997\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,12]]},"references-count":52,"URL":"https:\/\/doi.org\/10.1109\/waspaa66052.2025.11230997","relation":{},"subject":[],"published":{"date-parts":[[2025,10,12]]}}}