{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T22:32:04Z","timestamp":1769034724959,"version":"3.49.0"},"reference-count":46,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/cbmi66578.2025.11339348","type":"proceedings-article","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T20:38:56Z","timestamp":1768941536000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Dialogue-AV: A Dialogue-Attended Audiovisual Dataset"],"prefix":"10.1109","author":[{"given":"Lu\u00eds","family":"Vilaca","sequence":"first","affiliation":[{"name":"INESC TEC,Porto,Portugal"}]},{"given":"Paula","family":"Viana","sequence":"additional","affiliation":[{"name":"INESC TEC,Porto,Portugal"}]},{"given":"Yi","family":"Yu","sequence":"additional","affiliation":[{"name":"Graduate School of Advanced Science and Engineering, Hiroshima University,Hiroshima,Japan"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Flamingo: a visual language model for few-shot learning","volume-title":"Proceedings of the 36th International Conference on Neural Information Processing Systems","author":"Alayrac","year":"2022"},{"key":"ref2","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proceedings of the 39th International Conference on Machine Learning, ser. Proceedings of Machine Learning Research","volume":"162","author":"Li"},{"key":"ref3","article-title":"Macaw-LLM: Multi-Modal Language Modeling with Image, Audio, Video, and Text Integration","author":"Lyu","year":"2023","journal-title":"arXiv"},{"key":"ref4","article-title":"Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Li"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3479776"},{"key":"ref6","article-title":"Vast: a vision-audio-subtitle-text omni-modality foundation model and dataset","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Chen","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref8","doi-asserted-by":"crossref","first-page":"6787","DOI":"10.18653\/v1\/2021.emnlp-main.544","article-title":"VideoCLIP: Contrastive pre-training for zero-shot video-text understanding","volume-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","author":"Xu","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00498"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7298698"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref12","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Radford","year":"2021"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01427"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612132"},{"key":"ref16","article-title":"LanguageBind: Extending Video-Language Pretraining to N-modality by Language-based Semantic Alignment","author":"Zhu","year":"2023","journal-title":"arXiv"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"ref18","first-page":"23634","article-title":"Merlot: Multimodal neural script knowledge models","volume-title":"Advances in Neural Information Processing Systems","volume":"34","author":"Zellers","year":"2021"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095969"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TASLP.2024.3419446"},{"key":"ref22","first-page":"119","article-title":"AudioCaps: Generating captions for audios in the wild","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)","author":"Kim","year":"2019"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9414640"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548291"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"13484","DOI":"10.18653\/v1\/2023.acl-long.754","article-title":"Self-instruct: Aligning language models with self-generated instructions","volume-title":"Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","author":"Wang","year":"2023"},{"key":"ref32","volume-title":"Stanford alpaca: An instruction-following llama model","author":"Taori","year":"2023"},{"key":"ref33","volume-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","author":"Chiang","year":"2023"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2018.2848260"},{"issue":"8","key":"ref35","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00287"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"ref38","article-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021","journal-title":"arXiv preprint"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref40","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies","author":"Chen","year":"2011"},{"key":"ref41","article-title":"Beats: audio pre-training with acoustic tokenizers","volume-title":"Proceedings of the 40th International Conference on Machine Learning","author":"Chen"},{"key":"ref42","article-title":"Llama 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023","journal-title":"arXiv preprint"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref44","article-title":"Openflamingo: An open-source framework for training large autoregressive vision-language models","author":"Awadalla","year":"2023","journal-title":"arXiv preprint"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"}],"event":{"name":"2025 International Conference on Content-Based Multimedia Indexing (CBMI)","location":"Dublin, Ireland","start":{"date-parts":[[2025,10,22]]},"end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 International Conference on Content-Based Multimedia Indexing (CBMI)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11339229\/11339242\/11339348.pdf?arnumber=11339348","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T07:15:23Z","timestamp":1768979723000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11339348\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":46,"URL":"https:\/\/doi.org\/10.1109\/cbmi66578.2025.11339348","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}