{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,16]],"date-time":"2026-02-16T18:16:39Z","timestamp":1771265799881,"version":"3.50.1"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/cbmi66578.2025.11339325","type":"proceedings-article","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T20:38:56Z","timestamp":1768941536000},"page":"1-8","source":"Crossref","is-referenced-by-count":1,"title":["MI-Cap: A Multi-Modal Interpretable Model for Video Captioning"],"prefix":"10.1109","author":[{"given":"Antoine","family":"Hanna-Asaad","sequence":"first","affiliation":[{"name":"ESTACA Paris-Saclay,ESTACA&#x0027;Lab,Montigny-le-Bretonneux,France"}]},{"given":"Decky","family":"Aspandi-Latif","sequence":"additional","affiliation":[{"name":"SAMOVAR, T&#x00E9;l&#x00E9;com SudParis, Institute Polytechnique de Paris,Palaiseau,France,91120"}]},{"given":"Titus","family":"Zaharia","sequence":"additional","affiliation":[{"name":"SAMOVAR, T&#x00E9;l&#x00E9;com SudParis, Institute Polytechnique de Paris,Palaiseau,France,91120"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3522295"},{"key":"ref2","first-page":"24206","article-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text","volume":"34","author":"Akbari","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/FG.2019.8756557"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TAFFC.2022.3156026"},{"key":"ref5","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee"},{"key":"ref6","article-title":"Video in sentences out","author":"Barbu","year":"2012","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"Valor: Vision-audio-language omni-perception pretraining model and dataset","volume":"08345","author":"Chen","year":"2023","journal-title":"arXiv preprint"},{"key":"ref8","article-title":"Vast: A vision-audio-subtitle-text omni-modality foundation model and dataset","volume":"36","author":"Chen","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10095889"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.127"},{"key":"ref12","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized statistical models","volume-title":"Proceedings of the thirteenth international conference on artificial intelligence and statistics","author":"Gutmann"},{"key":"ref13","author":"Kay","year":"2017","journal-title":"The kinetics human action video dataset"},{"key":"ref14","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International conference on machine learning","author":"Li"},{"key":"ref15","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text summarization branches out"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"issue":"11","key":"ref19","article-title":"Visualizing data using t-sne","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"Journal of machine learning research"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref22","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.3390\/fi14010002"},{"key":"ref24","first-page":"31450","article-title":"Causal interpretation of self-attention in pre-trained transformers","volume":"36","author":"Rohekar","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01228-7"},{"key":"ref26","author":"Singh","year":"2020","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2017.03.021"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.21437\/icslp.1994-538"},{"key":"ref29","article-title":"Vl-bert: Pre-training of generic visual-linguistic representations","author":"Su","year":"2019","journal-title":"arXiv preprint"},{"key":"ref30","first-page":"3319","article-title":"Axiomatic attribution for deep networks","volume-title":"International conference on machine learning","author":"Sundararajan"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2818048.2820013"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00273"},{"key":"ref37","article-title":"Git: A generative image-to-text transformer for vision and language","author":"Wang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045336"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01741"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.5220\/0011747300003417"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"2025 International Conference on Content-Based Multimedia Indexing (CBMI)","location":"Dublin, Ireland","start":{"date-parts":[[2025,10,22]]},"end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 International Conference on Content-Based Multimedia Indexing (CBMI)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11339229\/11339242\/11339325.pdf?arnumber=11339325","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T07:11:47Z","timestamp":1768979507000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11339325\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/cbmi66578.2025.11339325","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}