{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,17]],"date-time":"2026-01-17T00:48:08Z","timestamp":1768610888666,"version":"3.49.0"},"publisher-location":"Cham","reference-count":31,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032144942","type":"print"},{"value":"9783032144959","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-14495-9_7","type":"book-chapter","created":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T15:22:34Z","timestamp":1768576954000},"page":"85-97","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Seeing Through Words: A Zero-Shot Multimodal Audio Description System with Foundation Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8010-9943","authenticated-orcid":false,"given":"Bogdan","family":"Mocanu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3170-4150","authenticated-orcid":false,"given":"Ruxandra","family":"Tapu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,1,17]]},"reference":[{"key":"7_CR1","unstructured":"World Health Organization: Blindness and vision impairment. https:\/\/www.who.int\/news-room\/fact-sheets\/detail\/blindness-and-visual-impairment. Accessed 30 July 2025"},{"key":"7_CR2","unstructured":"Snyder, J.: The visual made verbal. Am. Council Blind (2014)"},{"key":"7_CR3","doi-asserted-by":"publisher","first-page":"232","DOI":"10.26034\/cm.jostrans.2016.286","volume":"26","author":"N Reviers","year":"2016","unstructured":"Reviers, N.: Audio description services in Europe: an update. JoSTrans 26, 232\u2013247 (2016)","journal-title":"JoSTrans"},{"key":"7_CR4","unstructured":"PlayMedia: How to select an audio description vendor. https:\/\/www.3playmedia.com\/blog\/select-audio-description-vendor\/. Accessed 30 July 2025"},{"issue":"3","key":"7_CR5","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1075\/target.28.3.04per","volume":"28","author":"E Perego","year":"2016","unstructured":"Perego, E.: Gains and losses of watching audio-described films. Target 28(3), 424\u2013444 (2016)","journal-title":"Target"},{"issue":"4","key":"7_CR6","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1177\/0145482X1310700405","volume":"107","author":"PR Fresco","year":"2013","unstructured":"Fresco, P.R., Fryer, L.: Could audio-described films benefit from audio introductions? J. Vis. Impair. Blind. 107(4), 287\u2013295 (2013)","journal-title":"J. Vis. Impair. Blind."},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Han, T., Bain, M., Nagrani, A., et al.: AutoAD III: the prequel\u2014back to the pixels. In: CVPR, pp. 18164\u201318174 (2024)","DOI":"10.1109\/CVPR52733.2024.01720"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Xie, J., Han, T., Bain, M., et al.: AutoAD-zero: a training-free framework for zero-shot AD. In: ACCV, pp. 1\u201317 (2024)","DOI":"10.1007\/978-981-96-0908-6_5"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Soldan, M., Pardo, A., Alc\u00e1zar, J.L., et al.: MAD: a scalable dataset for language grounding in videos. In: CVPR, pp. 5026\u20135035 (2022)","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., et al., Dense-captioning events in videos. In: ICCV, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"issue":"6","key":"7_CR11","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3355390","volume":"52","author":"A Aafaq","year":"2019","unstructured":"Aafaq, A., Mian, A., Liu, W., et al.: Video description: a survey. ACM Comput. Surv. 52(6), 1\u201337 (2019)","journal-title":"ACM Comput. Surv."},{"key":"7_CR12","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhang, R., Lu, Z., et al., End-to-end dense video captioning with parallel decoding. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Wang, H., Tong, Z., Zheng, K., et al.: Contextual AD narration with multimodal sequence. arXiv:2403.12922 (2024)","DOI":"10.1109\/CVPR52734.2025.00784"},{"key":"7_CR14","unstructured":"Chiang, W.-L., Li, Z., Lin, Z., et al.: Vicuna: an open-source chatbot (2023)"},{"key":"7_CR15","unstructured":"Touvron, H., Martin, L., Stone, K., et al.: LLaMA 2: open foundation and fine-tuned models. arXiv:2307.09288 (2023)"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Song, E., Chai, W., Wang, G., et al.: MovieChat: sparse memory for long video understanding. arXiv:2307.16449 (2023)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"7_CR17","unstructured":"Lin, K., Ahmed, F., Li, L., et al.: MM-VID: advancing video understanding with GPT-4V. arXiv:2310.19773 (2023)"},{"key":"7_CR18","unstructured":"OpenAI. GPT-4 technical report (2023)"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Law, M.T., Fidler, S.: Video face clustering with unknown number of clusters. In: ICCV, pp. 5027\u20135036 (2019)","DOI":"10.1109\/ICCV.2019.00513"},{"key":"7_CR20","unstructured":"Chu, P., Wang, J., Abrantes, A.: LLM-AD: large language model based audio description. arXiv:2405.00983 (2024)"},{"key":"7_CR21","doi-asserted-by":"crossref","unstructured":"Zhang, C., Lin, K., Yang, Z., et al.: MM-narrator: narrating long-form videos with multimodal in-context learning. arXiv:2311.17435 (2023)","DOI":"10.1109\/CVPR52733.2024.01295"},{"key":"7_CR22","unstructured":"Sou\u010dek, T., Loko\u010d, J., TransNet V2: deep network for fast shot detection. ACM MM (2024)"},{"key":"7_CR23","unstructured":"Radford, A., Kim, J.W., Hallacy, C., et al.: Learning transferable visual models from language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"7_CR24","doi-asserted-by":"crossref","unstructured":"Bain, M., Huh, J., Han, T., Zisserman, A.: WhisperX: time-accurate speech transcription. In: INTERSPEECH, pp. 4489\u20134493 (2023)","DOI":"10.21437\/Interspeech.2023-78"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-BERT: sentence embeddings with Siamese BERT. In: EMNLP, pp. 3982\u20133992 (2019)","DOI":"10.18653\/v1\/D19-1410"},{"issue":"11","key":"7_CR26","doi-asserted-by":"publisher","first-page":"205","DOI":"10.21105\/joss.00205","volume":"2","author":"L McInnes","year":"2017","unstructured":"McInnes, L., Healy, J., Astels, S.: HDBSCAN: hierarchical density-based clustering. JOSS 2(11), 205 (2017)","journal-title":"JOSS"},{"key":"7_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Ververas, E., et al.: RetinaFace: multi-level face localisation in the wild. In: CVPR, pp. 5203\u20135212 (2020)","DOI":"10.1109\/CVPR42600.2020.00525"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Deng, J., Guo, J., Xue, N., Zafeiriou, S.: ArcFace: additive angular margin loss for deep face recognition. In: CVPR, pp. 4690\u20134699 (2019)","DOI":"10.1109\/CVPR.2019.00482"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Mocanu, B., Tapu, R., Zaharia, T.: Single object tracking with deep regression networks. In: IPTA, pp. 1\u20136 (2017)","DOI":"10.1109\/IPTA.2017.8310091"},{"key":"7_CR30","unstructured":"Meta AI, Llama 3.2: Edge AI and vision with open-source models. AI at Meta (2024)"},{"key":"7_CR31","doi-asserted-by":"publisher","first-page":"38","DOI":"10.4018\/jmdem.2011100103","volume":"2","author":"R Tapu","year":"2011","unstructured":"Tapu, R., Zaharia, T.: Video segmentation and structuring for indexing applications. Int. J. Multimed. Data Eng. Manag. 2, 38\u201358 (2011)","journal-title":"Int. J. Multimed. Data Eng. Manag."}],"container-title":["Lecture Notes in Computer Science","Advances in Visual Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-14495-9_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T15:22:38Z","timestamp":1768576958000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-14495-9_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9783032144942","9783032144959"],"references-count":31,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-14495-9_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"17 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ISVC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Symposium on Visual Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Las Vegas, NV","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"USA","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"isvc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isvc.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}