{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:11:00Z","timestamp":1778083860873,"version":"3.51.4"},"publisher-location":"Cham","reference-count":69,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730320","type":"print"},{"value":"9783031730337","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73033-7_11","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:03:55Z","timestamp":1730333035000},"page":"186-204","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["ViLA: Efficient Video-Language Alignment for\u00a0Video Question Answering"],"prefix":"10.1007","author":[{"given":"Xijun","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junbang","family":"Liang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chun-Kai","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kenan","family":"Deng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Lou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ming C.","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shan","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"11_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR2","doi-asserted-by":"crossref","unstructured":"Amrani, E., Ben-Ari, R., Rotman, D., Bronstein, A.: Noise estimation using density estimation for self-supervised multimodal learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 6644\u20136652 (2021)","DOI":"10.1609\/aaai.v35i8.16822"},{"key":"11_CR3","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: VIVIT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"11_CR4","first-page":"32897","volume":"35","author":"H Bao","year":"2022","unstructured":"Bao, H., et al.: VLMO: unified vision-language pre-training with mixture-of-modality-experts. Adv. Neural. Inf. Process. Syst. 35, 32897\u201332912 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR5","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol.\u00a02, p.\u00a04 (2021)"},{"key":"11_CR6","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR7","unstructured":"Chen, K., Li, Y., Zhang, Z., Ren, X., Li, H.: SAS video-QA: self-adaptive sampling for efficient video question-answering. arXiv preprint arXiv:2307.04192 (2023)"},{"key":"11_CR8","unstructured":"Chen, X., et\u00a0al.: PaLI: a jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)"},{"key":"11_CR9","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"11_CR10","unstructured":"Dai, W., et al.: Instructblip: towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"11_CR11","doi-asserted-by":"crossref","unstructured":"Deng, C., Chen, Q., Qin, P., Chen, D., Wu, Q.: Prompt switch: efficient clip adaptation for text-video retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 15648\u201315658 (2023)","DOI":"10.1109\/ICCV51070.2023.01434"},{"key":"11_CR12","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Fan, C., Zhuo, T., Zhang, P., Li, X.: Heterogeneous memory enhanced multimodal attention model for video question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1999\u20132008 (2019)","DOI":"10.1109\/CVPR.2019.00210"},{"key":"11_CR14","unstructured":"Fang, H., Xiong, P., Xu, L., Chen, Y.: Clip2video: mastering video-text retrieval via image clip. arXiv preprint arXiv:2106.11097 (2021)"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: Eva: exploring the limits of masked visual representation learning at scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19358\u201319369 (2023)","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Gao, D., Zhou, L., Ji, L., Zhu, L., Yang, Y., Shou, M.Z.: Mist: multi-modal iterative spatial-temporal transformer for long-form video question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14773\u201314783 (2023)","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"11_CR17","doi-asserted-by":"crossref","unstructured":"Gupta, S., Hoffman, J., Malik, J.: Cross modal distillation for supervision transfer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2827\u20132836 (2016)","DOI":"10.1109\/CVPR.2016.309"},{"key":"11_CR18","doi-asserted-by":"crossref","unstructured":"Heigold, G., et al.: Video owl-vit: temporally-consistent open-world localization in video. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13802\u201313811 (2023)","DOI":"10.1109\/ICCV51070.2023.01269"},{"key":"11_CR19","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. In: Advances in Neural Information Processing Systems (NIPS) (2015)"},{"key":"11_CR20","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"11_CR21","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144 (2016)"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Jiang, J., Chen, Z., Lin, H., Zhao, X., Gao, Y.: Divide and conquer: question-guided spatio-temporal contextual attention for video question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11101\u201311108 (2020)","DOI":"10.1609\/aaai.v34i07.6766"},{"key":"11_CR23","doi-asserted-by":"crossref","unstructured":"Khani, M., Hamadanian, P., Nasr-Esfahany, A., Alizadeh, M.: Real-time video inference on edge devices via adaptive model streaming. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4572\u20134582 (2021)","DOI":"10.1109\/ICCV48922.2021.00453"},{"key":"11_CR24","doi-asserted-by":"crossref","unstructured":"Ko, D., Lee, J.S., Kang, W., Roh, B., Kim, H.J.: Large language models are temporal and causal reasoners for video question answering. arXiv preprint arXiv:2310.15747 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"11_CR25","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Bansal, M., Berg, T.L.: TVQA: localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018)","DOI":"10.18653\/v1\/D18-1167"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Berg, T.L., Bansal, M.: What is more likely to happen next? Video-and-language future event prediction. arXiv preprint arXiv:2010.07999 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.706"},{"key":"11_CR27","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"11_CR28","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"11_CR29","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR30","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Uniformerv2: Unlocking the potential of image vits for video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1632\u20131643 (2023)","DOI":"10.1109\/ICCV51070.2023.00157"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Li, L., Chen, Y.C., Cheng, Y., Gan, Z., Yu, L., Liu, J.: Hero: hierarchical encoder for video+ language omni-representation pre-training. arXiv preprint arXiv:2005.00200 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Lin, Y., Wei, C., Wang, H., Yuille, A., Xie, C.: SMAUG: sparse masked autoencoder for efficient video-language pre-training. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2459\u20132469 (2023)","DOI":"10.1109\/ICCV51070.2023.00233"},{"key":"11_CR33","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. arXiv preprint arXiv:2310.03744 (2023)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"11_CR34","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1007\/978-3-031-19781-9_19","volume-title":"ECCV 2022","author":"Y Liu","year":"2022","unstructured":"Liu, Y., Xiong, P., Xu, L., Cao, S., Jin, Q.: Ts2-net: token shift and selection transformer for text-video retrieval. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13674, pp. 319\u2013335. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19781-9_19"},{"key":"11_CR35","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: Clip4clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"11_CR36","unstructured":"MacFarland, S.: If a picture is worth a thousand words, what is a video worth? (2014). https:\/\/www.huffpost.com\/entry\/if-a-picture-video-production_b_4996655"},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Mullapudi, R.T., Chen, S., Zhang, K., Ramanan, D., Fatahalian, K.: Online model distillation for efficient video inference. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3573\u20133582 (2019)","DOI":"10.1109\/ICCV.2019.00367"},{"key":"11_CR38","doi-asserted-by":"crossref","unstructured":"Pan, B., et al.: Spatio-temporal graph for video captioning with knowledge distillation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10870\u201310879 (2020)","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"11_CR39","unstructured":"Pan, J., Lin, Z., Zhu, X., Shao, J., Li, H.: St-adapter: Parameter-efficient image-to-video transfer learning. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Advances in Neural Information Processing Systems. vol.\u00a035, pp. 26462\u201326477. Curran Associates, Inc. (2022), https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/a92e9165b22d4456fc6d87236e04c266-Paper-Conference.pdf"},{"key":"11_CR40","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: Egovlpv2: egocentric video-language pre-training with fusion in the backbone. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5285\u20135297 (2023)","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"11_CR41","doi-asserted-by":"crossref","unstructured":"Qing, Z., et al.: Disentangling spatial and temporal learning for efficient image-to-video transfer learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13934\u201313944 (2023)","DOI":"10.1109\/ICCV51070.2023.01281"},{"key":"11_CR42","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"issue":"6","key":"11_CR43","doi-asserted-by":"publisher","first-page":"328","DOI":"10.3390\/info14060328","volume":"14","author":"Z Shao","year":"2023","unstructured":"Shao, Z., Wan, J., Zong, L.: A video question answering model based on knowledge distillation. Information 14(6), 328 (2023)","journal-title":"Information"},{"key":"11_CR44","unstructured":"Shen, Y., Wang, X., Gao, P., Lin, M.: Auxiliary modality learning with generalized curriculum distillation (2023)"},{"key":"11_CR45","doi-asserted-by":"crossref","unstructured":"Shen, Y., Yang, L., Wang, X., Lin, M.C.: Small-shot multi-modal distillation for vision-based autonomous steering. In: 2023 IEEE International Conference on Robotics and Automation (ICRA), pp. 7763\u20137770. IEEE (2023)","DOI":"10.1109\/ICRA48891.2023.10160803"},{"key":"11_CR46","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models (2023). https:\/\/arxiv.org\/abs\/2302.13971"},{"key":"11_CR47","doi-asserted-by":"crossref","unstructured":"Wang, J., et\u00a0al.: All in one: Exploring unified video-language pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6598\u20136608 (2023)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"11_CR48","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: Masked video distillation: rethinking masked feature modeling for self-supervised video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6312\u20136322 (2023)","DOI":"10.1109\/CVPR52729.2023.00611"},{"key":"11_CR49","unstructured":"Wang, W., et\u00a0al.: Cogvlm: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: beit pretraining for all vision and vision-language tasks. arXiv preprint arXiv:2208.10442 (2022)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"11_CR51","unstructured":"Wang, X., Liu, D., Kan, M., Han, C., Wu, Z., Shan, S.: Triplet knowledge distillation. arXiv preprint arXiv:2305.15975 (2023)"},{"key":"11_CR52","unstructured":"Wang, Y., et\u00a0al.: Internvideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"11_CR53","doi-asserted-by":"crossref","unstructured":"Wang, Z., Sung, Y.L., Cheng, F., Bertasius, G., Bansal, M.: Unified coarse-to-fine alignment for video-text retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2816\u20132827 (2023)","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"11_CR54","unstructured":"Wu, B., Yu, S., Chen, Z., Tenenbaum, J.B., Gan, C.: Star: a benchmark for situated reasoning in real-world videos. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) (2021)"},{"key":"11_CR55","unstructured":"Wu, C., Yin, S., Qi, W., Wang, X., Tang, Z., Duan, N.: Visual ChatGPT: talking, drawing and editing with visual foundation models. arXiv preprint arXiv:2303.04671 (2023)"},{"key":"11_CR56","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: Next-QA: next phase of question-answering to explaining temporal actions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9777\u20139786 (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"11_CR57","unstructured":"Xue, H., et al.: Clip-VIP: adapting pre-trained image-text model to video-language representation alignment (2023)"},{"key":"11_CR58","doi-asserted-by":"crossref","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Just ask: learning to answer questions from millions of narrated videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1686\u20131697 (2021)","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"11_CR59","first-page":"124","volume":"35","author":"A Yang","year":"2022","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. Adv. Neural. Inf. Process. Syst. 35, 124\u2013141 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR60","unstructured":"Yang, Z., et al.: Mm-react: prompting ChatGPT for multimodal reasoning and action. arXiv preprint arXiv:2303.11381 (2023)"},{"key":"11_CR61","unstructured":"Yao, L., et al.: Filip: fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)"},{"key":"11_CR62","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: Hitea: hierarchical temporal-aware video-language pre-training. arXiv preprint arXiv:2212.14546 (2022)","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"11_CR63","doi-asserted-by":"crossref","unstructured":"Ye, Y., Zhao, Z., Li, Y., Chen, L., Xiao, J., Zhuang, Y.: Video question answering via attribute-augmented attention network learning. In: Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 829\u2013832 (2017)","DOI":"10.1145\/3077136.3080655"},{"key":"11_CR64","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models. arxiv 2022. arXiv preprint arXiv:2205.01917"},{"key":"11_CR65","unstructured":"Yu, S., Cho, J., Yadav, P., Bansal, M.: Self-chained image-language model for video localization and question answering. arXiv preprint arXiv:2305.06988 (2023)"},{"key":"11_CR66","doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: Merlot reserve: neural script knowledge through vision and language and sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16375\u201316387 (2022)","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"11_CR67","first-page":"23634","volume":"34","author":"R Zellers","year":"2021","unstructured":"Zellers, R., et al.: Merlot: multimodal neural script knowledge models. Adv. Neural. Inf. Process. Syst. 34, 23634\u201323651 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"11_CR68","unstructured":"Zhang, P., Zhuo, T., Li, X.: Learning to select key frames for video question answering. In: ACM Multimedia Conference (2018)"},{"key":"11_CR69","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xiang, T., Hospedales, T.M., Lu, H.: Deep mutual learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4320\u20134328 (2018)","DOI":"10.1109\/CVPR.2018.00454"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73033-7_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:34:36Z","timestamp":1730334876000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73033-7_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730320","9783031730337"],"references-count":69,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73033-7_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}