{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:50:44Z","timestamp":1771959044045,"version":"3.50.1"},"publisher-location":"Cham","reference-count":81,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726699","type":"print"},{"value":"9783031726705","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_10","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"166-185","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Elysium: Exploring Object-Level Perception in\u00a0Videos via\u00a0MLLM"],"prefix":"10.1007","author":[{"given":"Han","family":"Wang","sequence":"first","affiliation":[]},{"given":"Yongjie","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Yanjie","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yuxiang","family":"Nie","sequence":"additional","affiliation":[]},{"given":"Can","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"10_CR1","unstructured":"https:\/\/www.pexels.com"},{"key":"10_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"191","DOI":"10.1007\/978-3-319-16181-5_14","volume-title":"Computer Vision - ECCV 2014 Workshops","author":"M Kristan","year":"2015","unstructured":"Kristan, M., et al.: The visual object tracking VOT2014 challenge results. In: Agapito, L., Bronstein, M.M., Rother, C. (eds.) ECCV 2014. LNCS, vol. 8926, pp. 191\u2013217. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-16181-5_14"},{"key":"10_CR3","unstructured":"Kristan, M., et al.: The visual object tracking VOT2017 challenge results. In: ICCVW (2017)"},{"key":"10_CR4","unstructured":"Bai, J., et al.: Qwen-VL: a frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966 (2023)"},{"key":"10_CR5","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A.: Frozen in time: a joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1728\u20131738 (2021)","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"10_CR6","unstructured":"Banerjee, S., Lavie, A.: Meteor: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"10_CR7","doi-asserted-by":"publisher","unstructured":"Bertinetto, L., Valmadre, J., Henriques, J.F., Vedaldi, A., Torr, P.H.: Fully-convolutional Siamese networks for object tracking. In: Hua, G., Jegou, H. (eds.) Computer Vision\u2013ECCV 2016 Workshops: Amsterdam, The Netherlands, 8\u201310 and 15\u201316 October 2016, Proceedings, Part II 14, pp. 850\u2013865. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_56","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"10_CR8","doi-asserted-by":"crossref","unstructured":"Bewley, A., Ge, Z., Ott, L., Ramos, F., Upcroft, B.: Simple online and realtime tracking. In: 2016 IEEE International Conference on Image Processing (ICIP), pp. 3464\u20133468. IEEE (2016)","DOI":"10.1109\/ICIP.2016.7533003"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Bhat, G., Danelljan, M., Gool, L.V., Timofte, R.: Learning discriminative model prediction for tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6182\u20136191 (2019)","DOI":"10.1109\/ICCV.2019.00628"},{"key":"10_CR10","unstructured":"Brooks, T., et al.: Video generation models as world simulators (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"10_CR11","unstructured":"Chen, D., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies, pp. 190\u2013200 (2011)"},{"key":"10_CR12","unstructured":"Chen, J., et al.: MiniGPT-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478 (2023)"},{"key":"10_CR13","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal LLM\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"10_CR14","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* ChatGPT quality, March 2023. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jiang, C., Wang, L., Wu, G.: MixFormer: end-to-end tracking with iterative mixed attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13608\u201313618 (2022)","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Du, F., Liu, P., Zhao, W., Tang, X.: Correlation-guided attention for corner detection based visual tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6836\u20136845 (2020)","DOI":"10.1109\/CVPR42600.2020.00687"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: LaSOT: a high-quality benchmark for large-scale single object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5374\u20135383 (2019)","DOI":"10.1109\/CVPR.2019.00552"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: EVA: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"10_CR19","first-page":"35946","volume":"35","author":"C Feichtenhofer","year":"2022","unstructured":"Feichtenhofer, C., Li, Y., He, K., et al.: Masked autoencoders as spatiotemporal learners. Adv. Neural. Inf. Process. Syst. 35, 35946\u201335958 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Ghodrati, A., Li, Z., Snoek, C.G.: Actor and action video segmentation from a sentence. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5958\u20135966 (2018)","DOI":"10.1109\/CVPR.2018.00624"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Guo, D., Shao, Y., Cui, Y., Wang, Z., Zhang, L., Shen, C.: Graph attention tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9543\u20139552 (2021)","DOI":"10.1109\/CVPR46437.2021.00942"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Huang, B., Wang, X., Chen, H., Song, Z., Zhu, W.: VTimeLLM: empower LLM to grasp video moments. arXiv preprint arXiv:2311.18445 (2023)","DOI":"10.1109\/CVPR52733.2024.01353"},{"issue":"5","key":"10_CR23","doi-asserted-by":"publisher","first-page":"1562","DOI":"10.1109\/TPAMI.2019.2957464","volume":"43","author":"L Huang","year":"2019","unstructured":"Huang, L., Zhao, X., Huang, K.: GOT-10k: a large high-diversity benchmark for generic object tracking in the wild. IEEE Trans. Pattern Anal. Mach. Intell. 43(5), 1562\u20131577 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"10_CR25","doi-asserted-by":"crossref","unstructured":"Jang, Y., Song, Y., Yu, Y., Kim, Y., Kim, G.: TGIF-QA: toward spatio-temporal reasoning in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2758\u20132766 (2017)","DOI":"10.1109\/CVPR.2017.149"},{"key":"10_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/978-3-030-01225-0_6","volume-title":"Computer Vision \u2013 ECCV 2018","author":"I Jung","year":"2018","unstructured":"Jung, I., Son, J., Baek, M., Han, B.: Real-time MDNet. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 89\u2013104. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_6"},{"key":"10_CR27","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"10_CR28","doi-asserted-by":"crossref","unstructured":"Khoreva, A., Rohrbach, A., Schiele, B.: Video object segmentation with language referring expressions. In: Computer Vision\u2013ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, 2\u20136 December 2018, Revised Selected Papers, Part IV 14, pp. 123\u2013141. Springer, Cham (2019)","DOI":"10.1007\/978-3-030-20870-7_8"},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"10_CR30","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"10_CR31","first-page":"11846","volume":"34","author":"J Lei","year":"2021","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Detecting moments and highlights in videos via natural language queries. Adv. Neural. Inf. Process. Syst. 34, 11846\u201311858 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR32","doi-asserted-by":"crossref","unstructured":"Li, B., Wu, W., Wang, Q., Zhang, F., Xing, J., Yan, J.: SiamRPN++: evolution of Siamese visual tracking with very deep networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4282\u20134291 (2019)","DOI":"10.1109\/CVPR.2019.00441"},{"key":"10_CR33","doi-asserted-by":"crossref","unstructured":"Li, B., Yan, J., Wu, W., Zhu, Z., Hu, X.: High performance visual tracking with Siamese region proposal network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8971\u20138980 (2018)","DOI":"10.1109\/CVPR.2018.00935"},{"key":"10_CR34","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"10_CR35","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"10_CR36","unstructured":"Li, Z., et\u00a0al.: LEGO: language enhanced multi-modal grounding model. arXiv preprint arXiv:2401.06071 (2024)"},{"key":"10_CR37","doi-asserted-by":"crossref","unstructured":"Lin, B., Zhu, B., Ye, Y., Ning, M., Jin, P., Yuan, L.: Video-LLaVA: learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"10_CR38","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, 6\u201312 September 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10_CR39","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"10_CR40","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"10_CR41","unstructured":"Luo, R., et al.: Valley: video assistant with large language model enhanced ability. arXiv preprint arXiv:2306.07207 (2023)"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.S.: Video-ChatGPT: towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424 (2023)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"10_CR43","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"10_CR44","unstructured":"Honnibal, M., Ines\u00a0Montani, S.V.L., Boyd, A.: spaCy: industrial-strength natural language processing in Python (2020)"},{"key":"10_CR45","doi-asserted-by":"publisher","unstructured":"Mueller, M., Smith, N., Ghanem, B.: A benchmark and simulator for UAV tracking. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, 11\u201314 October 2016, Proceedings, Part I 14, pp. 445\u2013461. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_27","DOI":"10.1007\/978-3-319-46448-0_27"},{"key":"10_CR46","doi-asserted-by":"publisher","unstructured":"Muller, M., Bibi, A., Giancola, S., Alsubaihi, S., Ghanem, B.: TrackingNet: a large-scale dataset and benchmark for object tracking in the wild. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Proceedings of the European Conference on Computer Vision (ECCV), pp. 300\u2013317. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_19","DOI":"10.1007\/978-3-030-01246-5_19"},{"key":"10_CR47","unstructured":"Munasinghe, S., et al.: PG-Video-LLaVA: pixel grounding large video-language models. arXiv preprint arXiv:2311.13435 (2023)"},{"key":"10_CR48","unstructured":"Peng, Z., et al.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"10_CR50","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"10_CR51","doi-asserted-by":"crossref","unstructured":"Rasley, J., Rajbhandari, S., Ruwase, O., He, Y.: DeepSpeed: system optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 3505\u20133506 (2020)","DOI":"10.1145\/3394486.3406703"},{"key":"10_CR52","unstructured":"Ren, M., Kiros, R., Zemel, R.: Exploring models and data for image question answering. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"10_CR53","doi-asserted-by":"crossref","unstructured":"Ren, S., Yao, L., Li, S., Sun, X., Hou, L.: TimeChat: a time-sensitive multimodal large language model for long video understanding. arXiv preprint arXiv:2312.02051 (2023)","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"10_CR54","doi-asserted-by":"publisher","unstructured":"Roffo, G., Melzi, S., et\u00a0al.: The visual object tracking VOT2016 challenge results. In: Hua, G., Jegou, H. (eds.) Computer Vision\u2013ECCV 2016 Workshops: Amsterdam, The Netherlands, 8\u201310 and 15\u201316 October 2016, Proceedings, Part II, pp. 777\u2013823. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_54","DOI":"10.1007\/978-3-319-48881-3_54"},{"key":"10_CR55","doi-asserted-by":"publisher","unstructured":"Seo, S., Lee, J.Y., Han, B.: URVOS: unified referring video object segmentation network with a large-scale benchmark. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, 23\u201328 August 2020, Proceedings, Part XV 16, pp. 208\u2013223. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_13","DOI":"10.1007\/978-3-030-58555-6_13"},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"10_CR57","doi-asserted-by":"crossref","unstructured":"Song, E., et\u00a0al.: MovieChat: from dense token to sparse memory for long video understanding. arXiv preprint arXiv:2307.16449 (2023)","DOI":"10.1109\/CVPR52733.2024.01725"},{"key":"10_CR58","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural. Inf. Process. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR59","unstructured":"Touvron, H., et\u00a0al.: LlaMA: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"10_CR60","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"10_CR61","doi-asserted-by":"crossref","unstructured":"Wang, G., Luo, C., Sun, X., Xiong, Z., Zeng, W.: Tracking by instance detection: a meta-learning approach. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6288\u20136297 (2020)","DOI":"10.1109\/CVPR42600.2020.00632"},{"key":"10_CR62","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"10_CR63","unstructured":"Wang, W., et\u00a0al.: CogVLM: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"10_CR64","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"10_CR65","doi-asserted-by":"crossref","unstructured":"Wojke, N., Bewley, A., Paulus, D.: Simple online and realtime tracking with a deep association metric. In: 2017 IEEE International Conference on Image Processing (ICIP), pp. 3645\u20133649. IEEE (2017)","DOI":"10.1109\/ICIP.2017.8296962"},{"key":"10_CR66","doi-asserted-by":"crossref","unstructured":"Wu, Y., Lim, J., Yang, M.H.: Online object tracking: a benchmark. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2411\u20132418 (2013)","DOI":"10.1109\/CVPR.2013.312"},{"issue":"09","key":"10_CR67","doi-asserted-by":"publisher","first-page":"1834","DOI":"10.1109\/TPAMI.2014.2388226","volume":"37","author":"Y Wu","year":"2015","unstructured":"Wu, Y., Lim, J., Yang, M.H.: Object tracking benchmark. IEEE Trans. Pattern Anal. Mach. Intell. 37(09), 1834\u20131848 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR68","doi-asserted-by":"crossref","unstructured":"Xu, D., et al.: Video question answering via gradually refined attention over appearance and motion. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 1645\u20131653 (2017)","DOI":"10.1145\/3123266.3123427"},{"key":"10_CR69","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"10_CR70","unstructured":"Xu, N., et al.: YouTube-VOS: a large-scale video object segmentation benchmark. arXiv preprint arXiv:1809.03327 (2018)"},{"key":"10_CR71","doi-asserted-by":"crossref","unstructured":"Xu, Y., Wang, Z., Li, Z., Yuan, Y., Yu, G.: SiamFC++: towards robust and accurate visual tracking with target estimation guidelines. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12549\u201312556 (2020)","DOI":"10.1609\/aaai.v34i07.6944"},{"key":"10_CR72","first-page":"124","volume":"35","author":"A Yang","year":"2022","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. Adv. Neural. Inf. Process. Syst. 35, 124\u2013141 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"10_CR73","unstructured":"You, H., et al.: Ferret: refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704 (2023)"},{"key":"10_CR74","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"10_CR75","doi-asserted-by":"crossref","unstructured":"Yu, Z., et al.: ActivityNet-QA: a dataset for understanding complex web videos via question answering. In: AAAI, pp. 9127\u20139134 (2019)","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"10_CR76","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. arXiv preprint arXiv:2306.02858 (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"10_CR77","unstructured":"Zhang, R., et al.: LLaMA-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"10_CR78","unstructured":"Zhang, S., et al.: GPT4RoI: Instruction tuning large language model on region-of-interest. arXiv preprint arXiv:2307.03601 (2023)"},{"key":"10_CR79","doi-asserted-by":"crossref","unstructured":"Zhou, J., Wang, P., Sun, H.: Discriminative and robust online learning for Siamese visual tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 13017\u201313024 (2020)","DOI":"10.1609\/aaai.v34i07.7002"},{"key":"10_CR80","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"},{"key":"10_CR81","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Fei-Fei, L.: Visual7W: grounded question answering in images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:20:39Z","timestamp":1732828839000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":81,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}