{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T18:03:40Z","timestamp":1779818620683,"version":"3.53.1"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819620739","type":"print"},{"value":"9789819620746","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2074-6_34","type":"book-chapter","created":{"date-parts":[[2024,12,31]],"date-time":"2024-12-31T11:05:25Z","timestamp":1735643125000},"page":"286-293","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["HORUS: Multimodal Large Language Models Framework for Video Retrieval at VBS 2025"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7707-2069","authenticated-orcid":false,"given":"Tai","family":"Nguyen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vo Ngoc Minh","family":"Anh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Duc Dat","family":"Pham","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tran Quang","family":"Vinh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Nhu Duong Thi","family":"Quynh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Le Anh","family":"Tien","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tan Duy","family":"Le","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Binh T.","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2025,1,1]]},"reference":[{"key":"34_CR1","doi-asserted-by":"crossref","unstructured":"Rossetto, L., Schuldt, H., Awad, G., Butt, A.A.: V3c - a research video collection. In: Kompatsiaris, I., Huet, B., Mezaris, V., Gurrin, C., Cheng, W.H., Vrochidis, S. (eds.) MultiMedia Modeling, pp. 349\u2013360. Springer International Publishing, Cham (2019)","DOI":"10.1007\/978-3-030-05710-7_29"},{"key":"34_CR2","unstructured":"Truong, Q.T., et al.: Marine video kit: a new marine video dataset for content-based analysis and retrieval. In: MultiMedia Modeling - 29th International Conference, MMM 2023, Bergen, Norway, January 9\u201312, 2023. Lecture Notes in Computer Science, Springer (2023)"},{"issue":"6","key":"34_CR3","doi-asserted-by":"publisher","first-page":"3481","DOI":"10.1007\/s00530-023-01143-5","volume":"29","author":"J Loko\u010d","year":"2023","unstructured":"Loko\u010d, J., et al.: Interactive video retrieval in the age of effective joint embedding deep models: lessons from the 11th VBS. Multimedia Syst. 29(6), 3481\u20133504 (2023)","journal-title":"Multimedia Syst."},{"key":"34_CR4","doi-asserted-by":"crossref","unstructured":"Amato, G., et al.: VISIONE 5.0: enhanced user interface and AI models for VBS2024. In: International Conference on Multimedia Modeling, pp. 332\u2013339. Springer (2024)","DOI":"10.1007\/978-3-031-53302-0_29"},{"key":"34_CR5","doi-asserted-by":"crossref","unstructured":"Amato, G., et al.: VISIONE at video browser showdown 2023. In: International conference on multimedia modeling, pp. 615\u2013621. Springer (2023)","DOI":"10.1007\/978-3-031-27077-2_48"},{"issue":"1","key":"34_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/s13735-021-00225-2","volume":"11","author":"S Heller","year":"2022","unstructured":"Heller, S., et al.: Interactive video retrieval evaluation at a distance: comparing sixteen interactive video search systems in a remote setting at the 10th video browser showdown. Int. J. Multimedia Inf. Retrieval 11(1), 1\u201318 (2022)","journal-title":"Int. J. Multimedia Inf. Retrieval"},{"key":"34_CR7","unstructured":"Oquab, M., et\u00a0al.: Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"34_CR8","doi-asserted-by":"crossref","unstructured":"Messina, N., et al.: Aladin: distilling fine-grained alignment scores for efficient image-text matching and retrieval. In: Proceedings of the 19th International Conference on Content-based Multimedia Indexing, pp. 64\u201370 (2022)","DOI":"10.1145\/3549555.3549576"},{"key":"34_CR9","unstructured":"Fang, H., Xiong, P., Xu, L., Chen, Y.: Clip2video: Mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)"},{"key":"34_CR10","doi-asserted-by":"crossref","unstructured":"Cormack, G.V., Clarke, C.L., Buettcher, S.: Reciprocal rank fusion outperforms condorcet and individual rank learning methods. In: Proceedings of the 32nd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 758\u2013759 (2009)","DOI":"10.1145\/1571941.1572114"},{"key":"34_CR11","doi-asserted-by":"crossref","unstructured":"Schall, K., Hezel, N., Jung, K., Barthel, K.U.: Vibro: video browsing with semantic and visual image embeddings. In: International Conference on Multimedia Modeling, pp. 665\u2013670. Springer (2023)","DOI":"10.1007\/978-3-031-27077-2_56"},{"key":"34_CR12","doi-asserted-by":"crossref","unstructured":"Loko\u010d, J., et\u00a0al.: Is the reign of interactive search eternal? Findings from the video browser showdown 2020. ACM Trans. Multimedia Comput. Commun. Appl. (TOMM) 17(3), 1\u201326 (2021)","DOI":"10.1145\/3445031"},{"key":"34_CR13","doi-asserted-by":"crossref","unstructured":"Barthel, K.U., Hezel, N., Jung, K., Schall, K.: Improved evaluation and generation of grid layouts using distance preservation quality and linear assignment sorting. In: Computer Graphics Forum, vol.\u00a042, pp. 261\u2013276. Wiley Online Library (2023)","DOI":"10.1111\/cgf.14718"},{"key":"34_CR14","unstructured":"Sou\u010dek, T., Loko\u010d, J.: Transnet v2: An effective deep network architecture for fast shot transition detection. arXiv preprint arXiv:2008.04838 (2020)"},{"key":"34_CR15","doi-asserted-by":"crossref","unstructured":"Chen, T.S., et\u00a0al.: Panda-70m: captioning 70m videos with multiple cross-modality teachers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13320\u201313331 (2024)","DOI":"10.1109\/CVPR52733.2024.01265"},{"key":"34_CR16","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"34_CR17","doi-asserted-by":"crossref","unstructured":"Carlsson, F., Eisen, P., Rekathati, F., Sahlgren, M.: Cross-lingual and multilingual clip. In: Proceedings of the Language Resources and Evaluation Conference, pp. 6848\u20136854. European Language Resources Association, Marseille, France (2022)","DOI":"10.63317\/573eyv53mcbd"},{"key":"34_CR18","doi-asserted-by":"crossref","unstructured":"Pham, K., Huynh, C., Lim, S.N., Shrivastava, A.: Composing object relations and attributes for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14354\u201314363 (2024)","DOI":"10.1109\/CVPR52733.2024.01361"},{"key":"34_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, D., Ye, M.: Cross-modal implicit relation reasoning and aligning for text-to-image person retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2787\u20132797 (2023)","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"34_CR20","unstructured":"Cheng, Z., et\u00a0al.: Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms. arXiv preprint arXiv:2406.07476 (2024)"},{"key":"34_CR21","unstructured":"Feng, F., Yang, Y., Cer, D., Arivazhagan, N., Wang, W.: Language-agnostic bert sentence embedding. arXiv preprint arXiv:2007.01852 (2020)"},{"key":"34_CR22","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-llama: An instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"34_CR23","unstructured":"Abid, A., Abdalla, A., Abid, A., Khan, D., Alfozan, A., Zou, J.: Gradio: Hassle-free sharing and testing of ml models in the wild. arXiv preprint arXiv:1906.02569 (2019)"},{"key":"34_CR24","doi-asserted-by":"crossref","unstructured":"Formal, T., Piwowarski, B., Clinchant, S.: Splade: sparse lexical and expansion model for first stage ranking. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 2288\u20132292 (2021)","DOI":"10.1145\/3404835.3463098"},{"key":"34_CR25","unstructured":"Jagerman, R., Zhuang, H., Qin, Z., Wang, X., Bendersky, M.: Query expansion by prompting large language models. arXiv preprint arXiv:2305.03653 (2023)"},{"key":"34_CR26","unstructured":"Bai, Z., et al.: Gqe: Generalized query expansion for enhanced text-video retrieval. arXiv preprint arXiv:2408.07249 (2024)"},{"key":"34_CR27","doi-asserted-by":"crossref","unstructured":"Varghese, R., Sambath, M.: YOLOv8: a novel object detection algorithm with enhanced performance and robustness. In: 2024 International Conference on Advances in Data Engineering and Intelligent Computing Systems (ADICS), pp.\u00a01\u20136. IEEE (2024)","DOI":"10.1109\/ADICS58448.2024.10533619"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2074-6_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,26]],"date-time":"2026-05-26T17:21:30Z","timestamp":1779816090000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2074-6_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819620739","9789819620746"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2074-6_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"1 January 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nara","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 January 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 January 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/mmm2025.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}