{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T23:08:58Z","timestamp":1778800138793,"version":"3.51.4"},"reference-count":79,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T00:00:00Z","timestamp":1776816000000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006595","name":"Executive Unit for Financing Higher Education Research Development and Innovation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006595","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Computer Vision and Image Understanding"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.cviu.2026.104772","type":"journal-article","created":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T15:43:11Z","timestamp":1777131791000},"page":"104772","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["EARS4SEE: A multimodal audio description system dedicated to blind and visually impaired users"],"prefix":"10.1016","volume":"268","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3170-4150","authenticated-orcid":false,"given":"Ruxandra","family":"Tapu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bogdan","family":"Mocanu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.cviu.2026.104772_b1","series-title":"How to select an audio description vendor","author":"3PlayMedia","year":"2024"},{"issue":"6","key":"10.1016\/j.cviu.2026.104772_b2","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3355390","article-title":"Video description: A survey of methods, datasets, and evaluation metrics","volume":"52","author":"Aafaq","year":"2019","journal-title":"ACM Comput. Surv."},{"key":"10.1016\/j.cviu.2026.104772_b3","series-title":"Pixtral 12b","author":"Agrawal","year":"2024"},{"key":"10.1016\/j.cviu.2026.104772_b4","series-title":"Llama 3 model card","author":"AI@Meta","year":"2024"},{"key":"10.1016\/j.cviu.2026.104772_b5","doi-asserted-by":"crossref","first-page":"23716","DOI":"10.52202\/068431-1723","article-title":"Flamingo: a visual language model for few-shot learning","volume":"35","author":"Alayrac","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.cviu.2026.104772_b6","series-title":"Cloud qwen2.5-VL-7B-instruct, hugging face","author":"Alibaba Cloud","year":"2025"},{"key":"10.1016\/j.cviu.2026.104772_b7","doi-asserted-by":"crossref","unstructured":"Bain,\u00a0M., Huh,\u00a0J., Han,\u00a0T., Zisserman,\u00a0A., 2023. WhisperX: Time-accurate speech transcription of long-form audio. In: Proceedings of INTERSPEECH 2023. 2023.","DOI":"10.21437\/Interspeech.2023-78"},{"key":"10.1016\/j.cviu.2026.104772_b8","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing","article-title":"A video is worth 4096 tokens: Verbalize story videos to understand them in zero-shot","author":"Bhattacharya","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b9","doi-asserted-by":"crossref","unstructured":"Brown,\u00a0A., Coto,\u00a0E., Zisserman,\u00a0A., 2021b. Automated video labeling: Identifying faces by corroborative evidence. In: Proceedings of the 2021 IEEE 4th International Conference on Multimedia Information Processing and Retrieval. MIPR, pp. 77\u201383.","DOI":"10.1109\/MIPR51284.2021.00019"},{"key":"10.1016\/j.cviu.2026.104772_b10","doi-asserted-by":"crossref","unstructured":"Brown,\u00a0A., Kalogeiton,\u00a0V., Zisserman,\u00a0A., 2021a. Face, body, voice: Video person-clustering with multiple modalities. In: Proceedings of the ICCV 2021 Workshop on AI for Creative Video Editing and UnderstandIng.","DOI":"10.1109\/ICCVW54120.2021.00357"},{"key":"10.1016\/j.cviu.2026.104772_b11","doi-asserted-by":"crossref","unstructured":"Chen,\u00a0L., Tong,\u00a0Z., Song,\u00a0Y., Wu,\u00a0G., Wang,\u00a0L., 2023a. Efficient video action detection with token dropout and context refinement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. ICCV, pp. 10388\u201310399.","DOI":"10.1109\/ICCV51070.2023.00953"},{"key":"10.1016\/j.cviu.2026.104772_b12","doi-asserted-by":"crossref","unstructured":"Chen,\u00a0L., Tong,\u00a0Z., Song,\u00a0Y., Wu,\u00a0G., Wang,\u00a0L., 2023b. Efficient video action detection with token dropout and context refinement. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. ICCV, pp. 10388\u201310399.","DOI":"10.1109\/ICCV51070.2023.00953"},{"key":"10.1016\/j.cviu.2026.104772_b13","series-title":"Vicuna: An open-source chatbot impressing GPT-4 with 90% ChatGPT quality","author":"Chiang","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b14","series-title":"PaLM: Scaling language modeling with pathways","author":"Chowdhery","year":"2022"},{"key":"10.1016\/j.cviu.2026.104772_b15","series-title":"LLM-AD: Large language model based audio description system","author":"Chu","year":"2024"},{"key":"10.1016\/j.cviu.2026.104772_b16","doi-asserted-by":"crossref","unstructured":"Deng,\u00a0J., Guo,\u00a0J., Ververas,\u00a0E., Kotsia,\u00a0I., Zafeiriou,\u00a0S., 2020. RetinaFace: Single-shot multi-level face localisation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 5203\u20135212.","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"10.1016\/j.cviu.2026.104772_b17","doi-asserted-by":"crossref","unstructured":"Deng,\u00a0J., Guo,\u00a0J., Xue,\u00a0N., Zafeiriou,\u00a0S., 2019. ArcFace: Additive angular margin loss for deep face recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 4690\u20134699.","DOI":"10.1109\/CVPR.2019.00482"},{"key":"10.1016\/j.cviu.2026.104772_b18","unstructured":"Dosovitskiy,\u00a0A., Beyer,\u00a0L., Kolesnikov,\u00a0A., Weissenborn,\u00a0D., Zhai,\u00a0X., Unterthiner,\u00a0T., Dehghani,\u00a0M., Minderer,\u00a0M., Heigold,\u00a0G., Gelly,\u00a0S., Uszkoreit,\u00a0J., Houlsby,\u00a0N., 2020. An image is worth 16x16 words: Transformers for image recognition at scale. In: Proceedings of the International Conference on Learning Representations. ICLR."},{"issue":"4","key":"10.1016\/j.cviu.2026.104772_b19","doi-asserted-by":"crossref","first-page":"287","DOI":"10.1177\/0145482X1310700405","article-title":"Could audio-described films benefit from audio introductions? An audience response study","volume":"107","author":"Fresco","year":"2013","journal-title":"J. Vis. Impair. Blind."},{"key":"10.1016\/j.cviu.2026.104772_b20","series-title":"BERTopic: Neural topic modeling with a class-based TF-IDF procedure","author":"Grootendorst","year":"2022"},{"key":"10.1016\/j.cviu.2026.104772_b21","unstructured":"Gu,\u00a0A., Goel,\u00a0K., R\u00e9,\u00a0C., 2022. Efficiently modeling long sequences with structured state spaces. In: Proc. Int. Conf. Learn. Represent.. ICLR."},{"key":"10.1016\/j.cviu.2026.104772_b22","doi-asserted-by":"crossref","unstructured":"Han,\u00a0T., Bain,\u00a0M., Nagrani,\u00a0A., Varol,\u00a0G., Xie,\u00a0W., Zisserman,\u00a0A., 2023a. AutoAD II: The sequel \u2013 Who, when, and what in movie audio description. In: Proceedings of the IEEE International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV51070.2023.01255"},{"key":"10.1016\/j.cviu.2026.104772_b23","doi-asserted-by":"crossref","unstructured":"Han,\u00a0T., Bain,\u00a0M., Nagrani,\u00a0A., Varol,\u00a0G., Xie,\u00a0W., Zisserman,\u00a0A., 2023b. AutoAD II: The sequel\u2014Who, when, and what in movie audio description. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV51070.2023.01255"},{"key":"10.1016\/j.cviu.2026.104772_b24","doi-asserted-by":"crossref","unstructured":"Han,\u00a0T., Bain,\u00a0M., Nagrani,\u00a0A., Varol,\u00a0G., Xie,\u00a0W., Zisserman,\u00a0A., 2023c. AutoAD: Movie description in context. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR52729.2023.01815"},{"key":"10.1016\/j.cviu.2026.104772_b25","doi-asserted-by":"crossref","unstructured":"Han,\u00a0T., Bain,\u00a0M., Nagrani,\u00a0A., Varol,\u00a0G., Xie,\u00a0W., Zisserman,\u00a0A., 2024. AutoAD III: The Prequel\u2014Back to the Pixels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 18164\u201318174.","DOI":"10.1109\/CVPR52733.2024.01720"},{"key":"10.1016\/j.cviu.2026.104772_b26","doi-asserted-by":"crossref","unstructured":"Hassani,\u00a0A., Walton,\u00a0S., Li,\u00a0J., Li,\u00a0S., Shi,\u00a0H., 2023. Neighborhood attention transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 6185\u20136194.","DOI":"10.1109\/CVPR52729.2023.00599"},{"key":"10.1016\/j.cviu.2026.104772_b27","doi-asserted-by":"crossref","unstructured":"Huang,\u00a0Q., Liu,\u00a0W., Lin,\u00a0D., 2018. Person search in videos with one portrait through visual and temporal links. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 425\u2013441.","DOI":"10.1007\/978-3-030-01261-8_26"},{"key":"10.1016\/j.cviu.2026.104772_b28","doi-asserted-by":"crossref","unstructured":"Huang,\u00a0Q., Xiong,\u00a0Y., Rao,\u00a0A., Wang,\u00a0J., Lin,\u00a0D., 2020. MovieNet: A holistic dataset for movie understanding. In: Proceedings of the European Conference on Computer Vision. ECCV, pp. 709\u2013727.","DOI":"10.1007\/978-3-030-58548-8_41"},{"key":"10.1016\/j.cviu.2026.104772_b29","doi-asserted-by":"crossref","unstructured":"Islam,\u00a0M.M., Hasan,\u00a0M., Athrey,\u00a0K.S., Braskich,\u00a0T., Bertasius,\u00a0G., 2023. Efficient movie scene detection using state-space transformers. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. CVPR, pp. 18749\u201318758.","DOI":"10.1109\/CVPR52729.2023.01798"},{"key":"10.1016\/j.cviu.2026.104772_b30","unstructured":"Korbar,\u00a0B., Zisserman,\u00a0A., 2022. Personalised CLIP OR: How to find your vacation videos. In: Proceedings of the British Machine Vision Conference. BMVC."},{"key":"10.1016\/j.cviu.2026.104772_b31","doi-asserted-by":"crossref","unstructured":"Krishna,\u00a0R., Hata,\u00a0K., Ren,\u00a0F., Fei-Fei,\u00a0L., Niebles,\u00a0J.C., 2017. Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision. ICCV, pp. 706\u2013715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"10.1016\/j.cviu.2026.104772_b32","series-title":"VideoChat: Chat-centric video understanding","author":"Li","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b33","series-title":"MM-Vid: Advancing video understanding with GPT-4V(ision)","author":"Lin","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b34","series-title":"MM-VID: Advancing video understanding with GPT-4V(ision)","author":"Lin","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b35","doi-asserted-by":"crossref","unstructured":"Lin,\u00a0K., Li,\u00a0L., Lin,\u00a0C.-C., Ahmed,\u00a0F., Gan,\u00a0Z., Liu,\u00a0Z., Lu,\u00a0Y., Wang,\u00a0L., 2022. SwinBERT: End-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"10.1016\/j.cviu.2026.104772_b36","doi-asserted-by":"crossref","unstructured":"Lu,\u00a0Z., Grauman,\u00a0K., 2013. Story-driven summarization for egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR.2013.350"},{"key":"10.1016\/j.cviu.2026.104772_b37","series-title":"UniViLM: A unified video and language pre-training model for multimodal understanding and generation","author":"Luo","year":"2020"},{"key":"10.1016\/j.cviu.2026.104772_b38","article-title":"DiffusionTrack: Diffusion model for multi-object tracking","volume":"vol. 38","author":"Luo","year":"2024"},{"issue":"11","key":"10.1016\/j.cviu.2026.104772_b39","doi-asserted-by":"crossref","first-page":"205","DOI":"10.21105\/joss.00205","article-title":"HDBSCAN: Hierarchical density-based clustering","volume":"2","author":"McInnes","year":"2017","journal-title":"J. Open Source Softw."},{"key":"10.1016\/j.cviu.2026.104772_b40","series-title":"Llama 3.2: Revolutionizing edge AI and vision with open-source models, AI at Meta","author":"Meta AI","year":"2024"},{"key":"10.1016\/j.cviu.2026.104772_b41","doi-asserted-by":"crossref","unstructured":"Mocanu,\u00a0B., Tapu,\u00a0R., Zaharia,\u00a0T., 2017. Single object tracking using offline trained deep regression networks. In: Proceedings of the 7th International Conference on Image Processing Theory, Tools, and Applications. IPTA, pp. 1\u20136.","DOI":"10.1109\/IPTA.2017.8310091"},{"key":"10.1016\/j.cviu.2026.104772_b42","series-title":"Proc. Asian Conf. Comput. Vis.","first-page":"485","article-title":"BaSSL: Boundary-aware self-supervised learning for video scene segmentation","author":"Mun","year":"2022"},{"key":"10.1016\/j.cviu.2026.104772_b43","series-title":"GPT-4 technical report","author":"OpenAI","year":"2023"},{"issue":"3","key":"10.1016\/j.cviu.2026.104772_b44","doi-asserted-by":"crossref","first-page":"424","DOI":"10.1075\/target.28.3.04per","article-title":"Gains and losses of watching audio-described films for sighted viewers","volume":"28","author":"Perego","year":"2016","journal-title":"Target - Int. J. Transl. Stud."},{"key":"10.1016\/j.cviu.2026.104772_b45","unstructured":"Radford,\u00a0A., Kim,\u00a0J.W., Hallacy,\u00a0C., Ramesh,\u00a0A., Goh,\u00a0G., Agarwal,\u00a0S., Sastry,\u00a0G., Askell,\u00a0A., Mishkin,\u00a0P., Clark,\u00a0J., et al., 2021. Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning. ICML, pp. 8748\u20138763."},{"key":"10.1016\/j.cviu.2026.104772_b46","doi-asserted-by":"crossref","unstructured":"Rao,\u00a0A., Xu,\u00a0L., Xiong,\u00a0Y., Xu,\u00a0G., Huang,\u00a0Q., Zhou,\u00a0B., Lin,\u00a0D., 2020. A local-to-global approach to multi-modal movie scene segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. CVPR, pp. 10143\u201310152.","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"10.1016\/j.cviu.2026.104772_b47","series-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing","article-title":"Sentence-BERT: Sentence embeddings using siamese BERT-networks","author":"Reimers","year":"2019"},{"key":"10.1016\/j.cviu.2026.104772_b48","doi-asserted-by":"crossref","first-page":"232","DOI":"10.26034\/cm.jostrans.2016.286","article-title":"Audio description services in Europe: an update","volume":"26","author":"Reviers","year":"2016","journal-title":"JoSTrans: J. Spec. Transl."},{"key":"10.1016\/j.cviu.2026.104772_b49","doi-asserted-by":"crossref","unstructured":"Seo,\u00a0P.H., Nagrani,\u00a0A., Arnab,\u00a0A., Schmid,\u00a0C., 2022. End-to-end generative pretraining for multimodal video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"10.1016\/j.cviu.2026.104772_b50","series-title":"The Visual Made Verbal: A Comprehensive Training Manual and Guide to the History and Applications of Audio Description","author":"Snyder","year":"2014"},{"key":"10.1016\/j.cviu.2026.104772_b51","doi-asserted-by":"crossref","unstructured":"Soldan,\u00a0M., Pardo,\u00a0A., Alc\u00e1zar,\u00a0J.L., Caba Heilbron,\u00a0F., Zhao,\u00a0C., Giancola,\u00a0S., Ghanem,\u00a0B., 2022. MAD: A Scalable Dataset for Language Grounding in Videos from Movie Audio Descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 5026\u20135035.","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"10.1016\/j.cviu.2026.104772_b52","series-title":"MovieChat: From dense token to sparse memory for long video understanding","author":"Song","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b53","first-page":"6978","article-title":"Temporal coherent object flow for multi-object tracking","volume":"vol. 39","author":"Song","year":"2025"},{"key":"10.1016\/j.cviu.2026.104772_b54","first-page":"2321","article-title":"Compact transformer tracker with correlative masked modeling","volume":"vol. 37","author":"Song","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b55","doi-asserted-by":"crossref","unstructured":"Song,\u00a0Z., Yu,\u00a0J., Chen,\u00a0Y.-P.P., Yang,\u00a0W., 2022. Transformer tracking with cyclic shifting window attention. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.. CVPR, pp. 8791\u20138800.","DOI":"10.1109\/CVPR52688.2022.00859"},{"key":"10.1016\/j.cviu.2026.104772_b56","series-title":"TransNet V2: An effective deep network architecture for fast shot transition detection","author":"Sou\u010dek","year":"2020"},{"key":"10.1016\/j.cviu.2026.104772_b57","doi-asserted-by":"crossref","unstructured":"Tan,\u00a0J., Wang,\u00a0H., Li,\u00a0J., Ou,\u00a0Z., Qian,\u00a0Z., 2024a. Neighbor relations matter in video scene detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. CVPR, pp. 18473\u201318482.","DOI":"10.1109\/CVPR52733.2024.01748"},{"issue":"4","key":"10.1016\/j.cviu.2026.104772_b58","doi-asserted-by":"crossref","DOI":"10.1145\/3630257","article-title":"Characters link shots: Character attention network for movie scene segmentation","volume":"20","author":"Tan","year":"2024","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.cviu.2026.104772_b59","doi-asserted-by":"crossref","unstructured":"Tapaswi,\u00a0M., B\u00e4uml,\u00a0M., Stiefelhagen,\u00a0R., 2012. Knock! Knock! Who is it? Probabilistic person identification in TV series. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR.2012.6247986"},{"key":"10.1016\/j.cviu.2026.104772_b60","doi-asserted-by":"crossref","unstructured":"Tapaswi,\u00a0M., Law,\u00a0M.T., Fidler,\u00a0S., 2019. Video face clustering with an unknown number of clusters. In: Proceedings of the IEEE International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV.2019.00513"},{"key":"10.1016\/j.cviu.2026.104772_b61","doi-asserted-by":"crossref","first-page":"99582","DOI":"10.1109\/ACCESS.2020.2997949","article-title":"Deep-AD: A multimodal temporal video segmentation framework for online video advertising","volume":"8","author":"Tapu","year":"2020","journal-title":"IEEE Access"},{"key":"10.1016\/j.cviu.2026.104772_b62","doi-asserted-by":"crossref","unstructured":"Tapu,\u00a0R., Zaharia,\u00a0T., 2013. Salient object detection based on spatiotemporal attention models. In: Proceedings of the IEEE International Conference on Consumer Electronics. ICCE, pp. 39\u201342.","DOI":"10.1109\/ICCE.2013.6486786"},{"key":"10.1016\/j.cviu.2026.104772_b63","series-title":"LLaMA 2: Open foundation and fine-tuned chat models","author":"Touvron","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b64","doi-asserted-by":"crossref","unstructured":"Vedantam,\u00a0R., Zitnick,\u00a0C.L., Parikh,\u00a0D., 2015. CIDEr: Consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. CVPR, pp. 4566\u20134575.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10.1016\/j.cviu.2026.104772_b65","doi-asserted-by":"crossref","unstructured":"Wang,\u00a0J., Jiang,\u00a0W., Ma,\u00a0L., Liu,\u00a0W., Xu,\u00a0Y., 2018. Bidirectional attentive fusion with context gating for dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR.","DOI":"10.1109\/CVPR.2018.00751"},{"key":"10.1016\/j.cviu.2026.104772_b66","series-title":"Contextual AD narration with interleaved multimodal sequence","author":"Wang","year":"2024"},{"key":"10.1016\/j.cviu.2026.104772_b67","doi-asserted-by":"crossref","unstructured":"Wang,\u00a0T., Zhang,\u00a0R., Lu,\u00a0Z., Zheng,\u00a0F., Cheng,\u00a0R., Luo,\u00a0P., 2021. End-to-end dense video captioning with parallel decoding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. ICCV.","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"10.1016\/j.cviu.2026.104772_b68","unstructured":"Wei,\u00a0J., Wang,\u00a0X., Schuurmans,\u00a0D., Bosma,\u00a0M., Xia,\u00a0F., Chi,\u00a0E., Le,\u00a0Q.V., Zhou,\u00a0D., et al., 2022. Chain-of-thought prompting elicits reasoning in large language models. In: Proceedings of the Conference on Neural Information Processing Systems. NeurIPS."},{"key":"10.1016\/j.cviu.2026.104772_b69","series-title":"Blindness and vision impairment. World Health Organization","author":"World Health Organization, (WHO)","year":"2025"},{"key":"10.1016\/j.cviu.2026.104772_b70","first-page":"2731","article-title":"Pale transformer: A general vision transformer backbone with pale-shaped attention","volume":"vol. 36","author":"Wu","year":"2022"},{"key":"10.1016\/j.cviu.2026.104772_b71","doi-asserted-by":"crossref","unstructured":"Xie,\u00a0J., Han,\u00a0T., Bain,\u00a0M., Nagrani,\u00a0A., Varol,\u00a0G., Xie,\u00a0W., Zisserman,\u00a0A., 2024a. AutoAD-Zero: A Training-Free Framework for Zero-Shot Audio Description. In: Proceedings of the Asian Conference on Computer Vision. ACCV, pp. 1\u201317.","DOI":"10.1007\/978-981-96-0908-6_5"},{"key":"10.1016\/j.cviu.2026.104772_b72","series-title":"AutoAD-Zero: A training-free framework for zero-shot audio description","author":"Xie","year":"2024"},{"key":"10.1016\/j.cviu.2026.104772_b73","doi-asserted-by":"crossref","unstructured":"Yin,\u00a0H., Vahdat,\u00a0A., Alvarez,\u00a0J.M., Mallya,\u00a0A., Kautz,\u00a0J., Molchanov,\u00a0P., 2022. A-ViT: Adaptive tokens for efficient vision transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 10809\u201310818.","DOI":"10.1109\/CVPR52688.2022.01054"},{"key":"10.1016\/j.cviu.2026.104772_b74","series-title":"VideoBLIP, software, GitHub repository","author":"Yu","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b75","doi-asserted-by":"crossref","unstructured":"Yun,\u00a0S., Ro,\u00a0Y., 2024. ShViT: Single-head vision transformer with memory-efficient macro design. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. CVPR, pp. 5756\u20135767.","DOI":"10.1109\/CVPR52733.2024.00550"},{"key":"10.1016\/j.cviu.2026.104772_b76","doi-asserted-by":"crossref","unstructured":"Zhang,\u00a0K., Chao,\u00a0W.L., Sha,\u00a0F., Grauman,\u00a0K., 2016. Video summarization with long short-term memory. In: Proceedings of the European Conference on Computer Vision. ECCV.","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"10.1016\/j.cviu.2026.104772_b77","doi-asserted-by":"crossref","unstructured":"Zhang,\u00a0H., Li,\u00a0X., Bing,\u00a0L., 2023. Video-LLaMA: An instruction-tuned audio-visual language model for video understanding. In: Proc. Conf. Empir. Methods Nat. Lang. Process. EMNLP.","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"10.1016\/j.cviu.2026.104772_b78","series-title":"MM-Narrator: Narrating long-form videos with multimodal in-context learning","author":"Zhang","year":"2023"},{"key":"10.1016\/j.cviu.2026.104772_b79","series-title":"Video instruction tuning with synthetic data","author":"Zhang","year":"2024"}],"container-title":["Computer Vision and Image Understanding"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314226001396?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1077314226001396?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T22:30:43Z","timestamp":1778797843000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1077314226001396"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":79,"alternative-id":["S1077314226001396"],"URL":"https:\/\/doi.org\/10.1016\/j.cviu.2026.104772","relation":{},"ISSN":["1077-3142"],"issn-type":[{"value":"1077-3142","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"EARS4SEE: A multimodal audio description system dedicated to blind and visually impaired users","name":"articletitle","label":"Article Title"},{"value":"Computer Vision and Image Understanding","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.cviu.2026.104772","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 The Author(s). Published by Elsevier Inc.","name":"copyright","label":"Copyright"}],"article-number":"104772"}}