{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:10:35Z","timestamp":1776888635363,"version":"3.51.2"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,4,6]],"date-time":"2025-04-06T00:00:00Z","timestamp":1743897600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,4,6]]},"DOI":"10.1109\/icassp49660.2025.10888241","type":"proceedings-article","created":{"date-parts":[[2025,3,12]],"date-time":"2025-03-12T17:15:02Z","timestamp":1741799702000},"page":"1-5","source":"Crossref","is-referenced-by-count":7,"title":["LAVCap: LLM-based Audio-Visual Captioning using Optimal Transport"],"prefix":"10.1109","author":[{"given":"Kyeongha","family":"Rho","sequence":"first","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hyeongkeun","family":"Lee","sequence":"additional","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Valentio","family":"Iverson","sequence":"additional","affiliation":[{"name":"University of Waterloo,Canada"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Joon Son","family":"Chung","sequence":"additional","affiliation":[{"name":"KAIST,South Korea"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i11.26518"},{"key":"ref2","first-page":"211","article-title":"Audio captioning transformer","volume-title":"Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop","author":"Mei"},{"key":"ref3","article-title":"Automated audio captioning by fine-tuning bart with audioset tags","volume-title":"DCASE 2021 - 6th Workshop on Detection and Classification of Acoustic Scenes and Events","author":"Gontier"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096877"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446672"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3430813"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-65"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-acl.546"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2020.3030497"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746312"},{"key":"ref12","article-title":"BEATs: audio pre-training with acoustic tokenizers","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Chen"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref14","article-title":"High fidelity neural audio compression","author":"D\u00e9fossez","year":"2023","journal-title":"Transactions on Machine Learning Research"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.21437\/interspeech.2023-914"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2024-526"},{"key":"ref17","article-title":"Multi-granularity correspondence learning from long-term noisy videos","volume-title":"The Twelfth International Conference on Learning Representations","author":"Lin"},{"key":"ref18","first-page":"2292","article-title":"Sinkhorn distances: Lightspeed computation of optimal transport","volume-title":"Proceedings of the 26th International Conference on Neural Information Processing Systems","author":"Cuturi"},{"key":"ref19","first-page":"119","article-title":"Audiocaps: Generating captions for audios in the wild","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Kim"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446348"},{"key":"ref21","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford"},{"key":"ref22","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"ref23","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"The Tenth International Conference on Learning Representations","author":"Hu"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref25","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text Summarization Branches Out"},{"key":"ref26","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","author":"Banerjee"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"}],"event":{"name":"ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","location":"Hyderabad, India","start":{"date-parts":[[2025,4,6]]},"end":{"date-parts":[[2025,4,11]]}},"container-title":["ICASSP 2025 - 2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10887540\/10887541\/10888241.pdf?arnumber=10888241","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T05:23:28Z","timestamp":1774416208000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10888241\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,6]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/icassp49660.2025.10888241","relation":{},"subject":[],"published":{"date-parts":[[2025,4,6]]}}}