{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:11:08Z","timestamp":1778083868916,"version":"3.51.4"},"publisher-location":"Cham","reference-count":90,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031732539","type":"print"},{"value":"9783031732546","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T00:00:00Z","timestamp":1732752000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T00:00:00Z","timestamp":1732752000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73254-6_9","type":"book-chapter","created":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T07:23:33Z","timestamp":1732692213000},"page":"142-160","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Vamos: Versatile Action Models for\u00a0Video Understanding"],"prefix":"10.1007","author":[{"given":"Shijie","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minh Quan","family":"Do","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nakul","family":"Agarwal","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kwonjoon","family":"Lee","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chen","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,28]]},"reference":[{"key":"9_CR1","doi-asserted-by":"crossref","unstructured":"Aein, M.J., Aksoy, E.E., W\u00f6rg\u00f6tter, F.: Library of actions: implementing a generic robot execution framework by using manipulation action semantics. Int. J. Robot. Res. (2019)","DOI":"10.1177\/0278364919850295"},{"key":"9_CR2","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"9_CR3","unstructured":"Allen-Zhu, Z., Li, Y.: Physics of language models: part 1, context-free grammar. arXiv preprint arXiv:2305.13673 (2023)"},{"key":"9_CR4","unstructured":"Boix-Adser\u00e0, E., Saremi, O., Abbe, E., Bengio, S., Littwin, E., Susskind, J.: When can transformers reason with abstract symbols? arXiv preprint arXiv:2310.09753 (2023)"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Buch, S., Eyzaguirre, C., Gaidon, A., Wu, J., Fei-Fei, L., Niebles, J.C.: Revisiting the \u201cvideo\u201d in video-language understanding. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"9_CR6","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: Activitynet: a large-scale video benchmark for human activity understanding. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"9_CR8","unstructured":"Chen, G., et\u00a0al.: VideoLLM: modeling video sequence with large language models. arXiv preprint arXiv:2305.13292 (2023)"},{"key":"9_CR9","doi-asserted-by":"crossref","unstructured":"Chen, J., Guo, H., Yi, K., Li, B., Elhoseiny, M.: VisualGPT: data-efficient adaptation of pretrained language models for image captioning. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01750"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Chen, L.H., et\u00a0al.: Uniter: universal image-text representation learning. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"9_CR11","doi-asserted-by":"crossref","unstructured":"Damen, D., et\u00a0al.: Scaling egocentric vision: the epic-kitchens dataset. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"9_CR12","unstructured":"Dasgupta, I., et al.: Language models show human-like content effects on reasoning. arXiv preprint arXiv:2207.07051 (2022)"},{"key":"9_CR13","unstructured":"Ding, D., Hill, F., Santoro, A., Reynolds, M., Botvinick, M.: Attention over learned object embeddings enables complex visual reasoning. In: NeurIPS (2021)"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Epstein, D., Wu, J., Schmid, C., Sun, C.: Learning temporal dynamics from cycles in narrated video. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00151"},{"key":"9_CR15","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"9_CR16","unstructured":"Fu, T.J., et al.: Violet: end-to-end video-language transformers with masked visual-token modeling. arXiv preprint arXiv:2111.12681 (2021)"},{"key":"9_CR17","unstructured":"Grauman, K., et\u00a0al.: Ego4D: around the world in 3,000 hours of egocentric video. In: CVPR (2022)"},{"key":"9_CR18","unstructured":"Gruver, N., Finzi, M., Qiu, S., Wilson, A.G.: Large language models are zero-shot time series forecasters. arXiv preprint arXiv:2310.07820 (2023)"},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Gu, C., et\u00a0al.: Ava: a video dataset of spatio-temporally localized atomic visual actions. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Gupta, T., Kembhavi, A.: Visual programming: compositional visual reasoning without training. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Hongeng, S., Nevatia, R., Bremond, F.: Video-based event recognition: activity representation and probabilistic recognition methods. In: Computer Vision and Image Understanding (2004)","DOI":"10.1016\/j.cviu.2004.02.005"},{"key":"9_CR22","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"9_CR23","unstructured":"Hu, Z., et al.: Avis: autonomous visual information seeking with large language models. arXiv preprint arXiv:2306.08129 (2023)"},{"key":"9_CR24","unstructured":"Huang, D., Hilliges, O., Van\u00a0Gool, L., Wang, X.: Palm: predicting actions through language models@ ego4d long-term action anticipation challenge. arXiv preprint arXiv:2306.16545 (2023)"},{"key":"9_CR25","unstructured":"Ishibashi, T., Ono, K., Kugo, N., Sato, Y.: Technical report for ego4d long term action anticipation challenge. arXiv preprint arXiv:2307.01467 (2023)"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Ivanov, Y.A., Bobick, A.F.: Recognition of visual activities and interactions by stochastic parsing. IEEE Trans. Pattern Anal. Mach. Intell. (2000)","DOI":"10.1109\/34.868686"},{"key":"9_CR27","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. arXiv preprint arXiv:1611.01144 (2016)"},{"key":"9_CR28","unstructured":"Jayaraman, D., Ebert, F., Efros, A.A., Levine, S.: Time-agnostic prediction: predicting predictable video frames. arXiv preprint arXiv:1808.07784 (2018)"},{"key":"9_CR29","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML (2021)"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Jiang, P., Han, Y.: Reasoning with heterogeneous graph alignment for video question answering. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Jiang, Y.G., Bhattacharya, S., Chang, S.F., Shah, M.: High-level event recognition in unconstrained videos. Int. J. Multimed. Inf. Retrieval (2013)","DOI":"10.1007\/s13735-012-0024-2"},{"key":"9_CR32","doi-asserted-by":"crossref","unstructured":"Kalakonda, S.S., Maheshwari, S., Sarvadevabhatla, R.K.: Action-GPT: leveraging large-scale language models for improved and generalized zero shot action generation. arXiv preprint arXiv:2211.15603 (2022)","DOI":"10.1109\/ICME55011.2023.00014"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Ke, Y., Sukthankar, R., Hebert, M.: Event detection in crowded videos. In: ICCV (2007)","DOI":"10.1109\/ICCV.2007.4409011"},{"key":"9_CR34","unstructured":"K\u0131c\u0131man, E., Ness, R., Sharma, A., Tan, C.: Causal reasoning and large language models: opening a new frontier for causality. arXiv preprint arXiv:2305.00050 (2023)"},{"key":"9_CR35","doi-asserted-by":"crossref","unstructured":"Ko, D., Lee, J.S., Kang, W., Roh, B., Kim, H.J.: Large language models are temporal and causal reasoners for video question answering. In: EMNLP (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.261"},{"key":"9_CR36","unstructured":"Koh, P.W., et al.: Concept bottleneck models. In: ICML (2020)"},{"key":"9_CR37","unstructured":"Krishnan, R.M., Tang, Z., Yu, Z., Sun, C.: Spacewalk-18: a benchmark for multimodal and long-form procedural video understanding in novel domains. arXiv preprint arXiv:2311.18773 (2023)"},{"key":"9_CR38","unstructured":"Lester, J., Choudhury, T., Kern, N., Borriello, G., Hannaford, B.: A hybrid discriminative\/generative approach for modeling human activities. In: IJCAI (2005)"},{"key":"9_CR39","doi-asserted-by":"crossref","unstructured":"Li, J., Wei, P., Han, W., Fan, L.: IntentQA: context-aware video intent reasoning. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01099"},{"key":"9_CR40","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"9_CR41","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)"},{"key":"9_CR42","doi-asserted-by":"crossref","unstructured":"Li, K., et\u00a0al.: MVBench: a comprehensive multi-modal video understanding benchmark. arXiv preprint arXiv:2311.17005 (2023)","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"9_CR43","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"9_CR44","doi-asserted-by":"crossref","unstructured":"Li, R., Yang, S., Ross, D.A., Kanazawa, A.: AI choreographer: music conditioned 3D dance generation with AIST++. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01315"},{"key":"9_CR45","doi-asserted-by":"crossref","unstructured":"Li, X., et\u00a0al.: Oscar: object-semantics aligned pre-training for vision-language tasks. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"9_CR46","unstructured":"Lin, J., et al.: Vila: on pre-training for visual language models. arXiv preprint arXiv:2312.07533 (2023)"},{"key":"9_CR47","unstructured":"Lin, K.Q., et\u00a0al.: Egocentric video-language pretraining. In: NeurIPS (2022)"},{"key":"9_CR48","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. arXiv preprint arXiv:2304.08485 (2023)"},{"key":"9_CR49","doi-asserted-by":"crossref","unstructured":"Liu, M., Tang, S., Li, Y., Rehg, J.M.: Forecasting human-object interaction: joint prediction of motor attention and actions in first person video. In: ECCV (2020)","DOI":"10.1007\/978-3-030-58452-8_41"},{"key":"9_CR50","unstructured":"Mangalam, K., Akshulakov, R., Malik, J.: Egoschema: a diagnostic benchmark for very long-form video language understanding. arXiv preprint arXiv:2308.09126 (2023)"},{"key":"9_CR51","doi-asserted-by":"crossref","unstructured":"Min, J., Buch, S., Nagrani, A., Cho, M., Schmid, C.: MoReVQA: exploring modular reasoning models for video question answering. arXiv preprint arXiv:2404.06511 (2024)","DOI":"10.1109\/CVPR52733.2024.01257"},{"key":"9_CR52","unstructured":"Minderer, M., Sun, C., Villegas, R., Cole, F., Murphy, K.P., Lee, H.: Unsupervised learning of object structure and dynamics from videos. In: NeurIPS (2019)"},{"key":"9_CR53","unstructured":"Mirchandani, S., et al.: Large language models as general pattern machines. arXiv preprint arXiv:2307.04721 (2023)"},{"key":"9_CR54","doi-asserted-by":"crossref","unstructured":"Moon, S., et\u00a0al.: Anymal: an efficient and scalable any-modality augmented language model. arXiv preprint arXiv:2309.16058 (2023)","DOI":"10.18653\/v1\/2024.emnlp-industry.98"},{"key":"9_CR55","unstructured":"Nevatia, R., Hobbs, J., Bolles, B.: An ontology for video event representation. In: CVPR Workshop (2004)"},{"key":"9_CR56","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. 13 (2022). https:\/\/arxiv.org\/abs\/2203.02155"},{"key":"9_CR57","doi-asserted-by":"crossref","unstructured":"Pastra, K., Aloimonos, Y.: The minimalist grammar of action. Philos. Trans. R. Soc. B: Biol. Sci. (2012)","DOI":"10.1098\/rstb.2011.0123"},{"key":"9_CR58","doi-asserted-by":"crossref","unstructured":"Pei, M., Jia, Y., Zhu, S.C.: Parsing video events with goal inference and intent prediction. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126279"},{"key":"9_CR59","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. arXiv preprint arXiv:2103.00020 (2021)"},{"key":"9_CR60","doi-asserted-by":"crossref","unstructured":"Sadanand, S., Corso, J.J.: Action bank: a high-level representation of activity in video. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"9_CR61","unstructured":"Schick, T., et al.: Toolformer: language models can teach themselves to use tools. arXiv preprint arXiv:2302.04761 (2023)"},{"key":"9_CR62","doi-asserted-by":"crossref","unstructured":"Shao, Z., Yu, Z., Wang, M., Yu, J.: Prompting large language models with answer heuristics for knowledge-based visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14974\u201314983 (2023)","DOI":"10.1109\/CVPR52729.2023.01438"},{"key":"9_CR63","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Flava: a foundational language and vision alignment model. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"9_CR64","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., Schmid, C.: VideoBERT: a joint model for video and language representation learning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00756"},{"key":"9_CR65","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Menon, S., Vondrick, C.: ViperGPT: visual inference via Python execution for reasoning. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"9_CR66","doi-asserted-by":"crossref","unstructured":"Tong, S., Liu, Z., Zhai, Y., Ma, Y., LeCun, Y., Xie, S.: Eyes wide shut? Exploring the visual shortcomings of multimodal LLMs. arXiv preprint arXiv:2401.06209 (2024)","DOI":"10.1109\/CVPR52733.2024.00914"},{"key":"9_CR67","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"9_CR68","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: NeurIPS (2016)"},{"key":"9_CR69","doi-asserted-by":"crossref","unstructured":"Wang, A.J., et al.: All in one: exploring unified video-language pre-training. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"9_CR70","unstructured":"Wang, X., Farhadi, A., Gupta, A.: Actions $$\\tilde{\\,}$$ transformations. In: CVPR (2016)"},{"key":"9_CR71","unstructured":"Wang, Y., et\u00a0al.: Internvideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"9_CR72","unstructured":"Wang, Z., et\u00a0al.: Language models with image descriptors are strong few-shot video-language learners. In: NeurIPS (2022)"},{"key":"9_CR73","doi-asserted-by":"crossref","unstructured":"Wei, C., Liu, C., Qiao, S., Zhang, Z., Yuille, A., Yu, J.: De-diffusion makes text a strong cross-modal interface. arXiv preprint arXiv:2311.00618 (2023)","DOI":"10.1109\/CVPR52733.2024.01281"},{"key":"9_CR74","unstructured":"Wei, J., et\u00a0al.: Chain-of-thought prompting elicits reasoning in large language models. In: NeurIPS (2022)"},{"key":"9_CR75","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: Next-QA: next phase of question-answering to explaining temporal actions. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"9_CR76","doi-asserted-by":"crossref","unstructured":"Xiao, J., Yao, A., Liu, Z., Li, Y., Ji, W., Chua, T.S.: Video as conditional graph hierarchy for multi-granular question answering. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"9_CR77","doi-asserted-by":"crossref","unstructured":"Xiao, J., Zhou, P., Chua, T.S., Yan, S.: Video graph transformer for video question answering. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20059-5_3"},{"key":"9_CR78","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers. arXiv preprint arXiv:2104.10157 (2021)"},{"key":"9_CR79","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Zero-shot video question answering via frozen bidirectional language models. In: NeurIPS (2022)"},{"key":"9_CR80","doi-asserted-by":"crossref","unstructured":"Ye, Q., et al.: Hitea: hierarchical temporal-aware video-language pre-training. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01413"},{"key":"9_CR81","unstructured":"Ye, Q., et\u00a0al.: mplug-owl: modularization empowers large language models with multimodality. arXiv preprint arXiv:2304.14178 (2023)"},{"key":"9_CR82","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"9_CR83","unstructured":"Yu, S., Cho, J., Yadav, P., Bansal, M.: Self-chained image-language model for video localization and question answering. arXiv preprint arXiv:2305.06988 (2023)"},{"key":"9_CR84","unstructured":"Yuksekgonul, M., Wang, M., Zou, J.: Post-hoc concept bottleneck models. arXiv preprint arXiv:2205.15480 (2022)"},{"key":"9_CR85","doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: Merlot reserve: neural script knowledge through vision and language and sound. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"9_CR86","unstructured":"Zellers, R., et al.: Merlot: multimodal neural script knowledge models. In: NeurIPS (2021)"},{"key":"9_CR87","unstructured":"Zeng, A., et\u00a0al.: Socratic models: composing zero-shot multimodal reasoning with language. arXiv preprint arXiv:2204.00598 (2022)"},{"key":"9_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: A simple LLM framework for long-range video question-answering. arXiv preprint arXiv:2312.17235 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"9_CR89","unstructured":"Zhang, R., et al.: Llama-adapter: efficient fine-tuning of language models with zero-init attention. arXiv preprint arXiv:2303.16199 (2023)"},{"key":"9_CR90","unstructured":"Zhao, Q., et al.: AntGPT: can large language models help long-term action anticipation from videos? In: ICLR (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73254-6_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T02:52:52Z","timestamp":1733107972000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73254-6_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,28]]},"ISBN":["9783031732539","9783031732546"],"references-count":90,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73254-6_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,28]]},"assertion":[{"value":"28 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}