{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:29:51Z","timestamp":1778257791071,"version":"3.51.4"},"publisher-location":"Cham","reference-count":68,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729423","type":"print"},{"value":"9783031729430","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72943-0_13","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:39:52Z","timestamp":1732801192000},"page":"220-238","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Emergent Visual-Semantic Hierarchies in\u00a0Image-Text Representations"],"prefix":"10.1007","author":[{"given":"Morris","family":"Alper","sequence":"first","affiliation":[]},{"given":"Hadar","family":"Averbuch-Elor","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"13_CR1","unstructured":"Alper, M., Averbuch-Elor, H.: Kiki or bouba? Sound symbolism in vision-and-language models. In: Proceedings of Advances in Neural Information Processing Systems (NeurIPS) (2023)"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Alper, M., Fiman, M., Averbuch-Elor, H.: Is BERT blind? Exploring the effect of vision-and-language pretraining on visual language understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00655"},{"key":"13_CR3","unstructured":"Athiwaratkun, B., Wilson, A.G.: Hierarchical density order embeddings. arXiv preprint arXiv:1804.09843 (2018)"},{"key":"13_CR4","doi-asserted-by":"crossref","unstructured":"Atigh, M.G., Schoep, J., Acar, E., Van\u00a0Noord, N., Mettes, P.: Hyperbolic image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4453\u20134462 (2022)","DOI":"10.1109\/CVPR52688.2022.00441"},{"key":"13_CR5","doi-asserted-by":"crossref","unstructured":"Bertinetto, L., Mueller, R., Tertikas, K., Samangooei, S., Lord, N.A.: Making better mistakes: leveraging class hierarchies with deep networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12506\u201312515 (2020)","DOI":"10.1109\/CVPR42600.2020.01252"},{"key":"13_CR6","unstructured":"Carlsson, F., Eisen, P., Rekathati, F., Sahlgren, M.: Cross-lingual and multilingual clip. In: Proceedings of the Thirteenth Language Resources and Evaluation Conference, pp. 6848\u20136854 (2022)"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Chang, H.S., Wang, Z., Vilnis, L., McCallum, A.: Distributional inclusion vector embedding for unsupervised hypernymy detection. arXiv preprint arXiv:1710.00880 (2017)","DOI":"10.18653\/v1\/N18-1045"},{"key":"13_CR8","unstructured":"Chefer, H., et al.: The hidden language of diffusion models. arXiv preprint arXiv:2306.00966 (2023)"},{"key":"13_CR9","doi-asserted-by":"crossref","unstructured":"Chen, G., et al.: mCLIP: multilingual clip via cross-lingual transfer. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 13028\u201313043 (2023)","DOI":"10.18653\/v1\/2023.acl-long.728"},{"key":"13_CR10","doi-asserted-by":"crossref","unstructured":"Chen, L., Jiang, Z., Xiao, J., Liu, W.: Human-like controllable image captioning with verb-specific semantic roles. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 16846\u201316856 (2021)","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"13_CR11","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Cherti, M., et al.: Reproducible scaling laws for contrastive language-image learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2818\u20132829 (2023)","DOI":"10.1109\/CVPR52729.2023.00276"},{"key":"13_CR13","unstructured":"Chuang, C.Y., Robinson, J., Lin, Y.C., Torralba, A., Jegelka, S.: Debiased contrastive learning. In: Advance in Neural Information Processing System, vol. 33, pp. 8765\u20138775 (2020)"},{"key":"13_CR14","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"13_CR15","doi-asserted-by":"crossref","unstructured":"Couairon, G., Douze, M., Cord, M., Schwenk, H.: Embedding arithmetic of multimodal queries for image retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4950\u20134958 (2022)","DOI":"10.1109\/CVPRW56347.2022.00542"},{"key":"13_CR16","unstructured":"Dai, B., Lin, D.: Contrastive learning for image captioning. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"13_CR17","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning (2023)"},{"key":"13_CR18","doi-asserted-by":"publisher","unstructured":"Dash, S., Chowdhury, M.F.M., Gliozzo, A., Mihindukulasooriya, N., Fauceglia, N.R.: Hypernym detection using strict partial order networks. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34. no. 05, pp. 7626\u20137633 (2020). https:\/\/doi.org\/10.1609\/aaai.v34i05.6263","DOI":"10.1609\/aaai.v34i05.6263"},{"key":"13_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"712","DOI":"10.1007\/978-3-030-58601-0_42","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C Deng","year":"2020","unstructured":"Deng, C., Ding, N., Tan, M., Wu, Q.: Length-controllable image captioning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12358, pp. 712\u2013729. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58601-0_42"},{"key":"13_CR20","unstructured":"Desai, K., Nickel, M., Rajpurohit, T., Johnson, J., Vedantam, S.R.: Hyperbolic image-text representations. In: International Conference on Machine Learning, pp. 7694\u20137731. PMLR (2023)"},{"key":"13_CR21","doi-asserted-by":"crossref","unstructured":"Dhall, A., Makarova, A., Ganea, O., Pavllo, D., Greeff, M., Krause, A.: Hierarchical image classification using entailment cone embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops, pp. 836\u2013837 (2020)","DOI":"10.1109\/CVPRW50498.2020.00426"},{"key":"13_CR22","doi-asserted-by":"crossref","unstructured":"Dhingra, B., Shallue, C.J., Norouzi, M., Dai, A.M., Dahl, G.E.: Embedding text in hyperbolic spaces. arXiv preprint arXiv:1806.04313 (2018)","DOI":"10.18653\/v1\/W18-1708"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Ermolov, A., Mirvakhabova, L., Khrulkov, V., Sebe, N., Oseledets, I.: Hyperbolic vision transformers: Combining improvements in metric learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7409\u20137419 (2022)","DOI":"10.1109\/CVPR52688.2022.00726"},{"key":"13_CR24","unstructured":"Ganea, O., B\u00e9cigneul, G., Hofmann, T.: Hyperbolic entailment cones for learning hierarchical embeddings. In: International Conference on Machine Learning, pp. 1646\u20131655. PMLR (2018)"},{"key":"13_CR25","doi-asserted-by":"crossref","unstructured":"Hong, J., Hayder, Z., Han, J., Fang, P., Harandi, M., Petersson, L.: Hyperbolic audio-visual zero-shot learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7873\u20137883 (2023)","DOI":"10.1109\/ICCV51070.2023.00724"},{"key":"13_CR26","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"13_CR27","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4565\u20134574 (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"13_CR28","doi-asserted-by":"crossref","unstructured":"Kamath, A., Hessel, J., Chang, K.W.: Text encoders are performance bottlenecks in contrastive vision-language models. arXiv preprint arXiv:2305.14897 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.301"},{"key":"13_CR29","doi-asserted-by":"crossref","unstructured":"Kiela, D., Rimell, L., Vulic, I., Clark, S.: Exploiting image generality for lexical entailment detection. In: Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics (ACL 2015), pp. 119\u2013124. ACL; East Stroudsburg, PA (2015)","DOI":"10.3115\/v1\/P15-2020"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Kornblith, S., Li, L., Wang, Z., Nguyen, T.: Guiding image captioning models toward more specific captions. arXiv preprint arXiv:2307.16686 (2023)","DOI":"10.1109\/ICCV51070.2023.01400"},{"key":"13_CR31","unstructured":"Krizhevsky, A.: Learning multiple layers of features from tiny images. University of Toronto, Technical report (2009)"},{"key":"13_CR32","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems, vol. 25 (2012)"},{"key":"13_CR33","unstructured":"Lee, S., Zhang, Y., Wu, S., Wu, J.: Language-informed visual concept learning. arXiv preprint arXiv:2312.03587 (2023)"},{"key":"13_CR34","doi-asserted-by":"crossref","unstructured":"Li, A., Luo, T., Lu, Z., Xiang, T., Wang, L.: Large-scale few-shot learning: knowledge transfer with class hierarchy. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7212\u20137220 (2019)","DOI":"10.1109\/CVPR.2019.00738"},{"key":"13_CR35","doi-asserted-by":"crossref","unstructured":"Li, L., Zhang, Y., Wang, S.: The Euclidean space is evil: hyperbolic attribute editing for few-shot image generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22714\u201322724 (2023)","DOI":"10.1109\/ICCV51070.2023.02076"},{"key":"13_CR36","unstructured":"Liang, V.W., Zhang, Y., Kwon, Y., Yeung, S., Zou, J.Y.: Mind the gap: understanding the modality gap in multi-modal contrastive representation learning. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) Advances in Neural Information Processing Systems, vol.\u00a035, pp. 17612\u201317625. Curran Associates, Inc. (2022)"},{"key":"13_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014 Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"13_CR38","doi-asserted-by":"crossref","unstructured":"Long, T., van Noord, N.: Cross-modal scalable hierarchical clustering in hyperbolic space. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16655\u201316664 (2023)","DOI":"10.1109\/ICCV51070.2023.01527"},{"key":"13_CR39","doi-asserted-by":"crossref","unstructured":"Luo, R., Price, B., Cohen, S., Shakhnarovich, G.: Discriminability objective for training descriptive captions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6964\u20136974 (2018)","DOI":"10.1109\/CVPR.2018.00728"},{"key":"13_CR40","doi-asserted-by":"crossref","unstructured":"Mettes, P., Atigh, M.G., Keller-Ressel, M., Gu, J., Yeung, S.: Hyperbolic deep learning in computer vision: a survey. arXiv preprint arXiv:2305.06611 (2023)","DOI":"10.1007\/s11263-024-02043-5"},{"key":"13_CR41","doi-asserted-by":"crossref","unstructured":"Nguyen, K.A., K\u00f6per, M., Walde, S.S.I., Vu, N.T.: Hierarchical embeddings for hypernymy detection and directionality. arXiv preprint arXiv:1707.07273 (2017)","DOI":"10.18653\/v1\/D17-1022"},{"key":"13_CR42","unstructured":"Nickel, M., Kiela, D.: Poincar\u00e9 embeddings for learning hierarchical representations. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"13_CR43","unstructured":"Nickel, M., Kiela, D.: Learning continuous hierarchies in the Lorentz model of hyperbolic geometry. In: International Conference on Machine Learning, pp. 3779\u20133788. PMLR (2018)"},{"key":"13_CR44","unstructured":"Novack, Z., McAuley, J., Lipton, Z.C., Garg, S.: CHILS: zero-shot image classification with hierarchical label sets. In: International Conference on Machine Learning, pp. 26342\u201326362. PMLR (2023)"},{"key":"13_CR45","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: Advance in Neural Information Processing System, vol. 35, pp. 27730\u201327744 (2022)"},{"key":"13_CR46","doi-asserted-by":"crossref","unstructured":"Phoo, C.P., Hariharan, B.: Coarsely-labeled data for better few-shot transfer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9052\u20139061 (2021)","DOI":"10.1109\/ICCV48922.2021.00892"},{"key":"13_CR47","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"13_CR48","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Making monolingual sentence embeddings multilingual using knowledge distillation. arXiv preprint arXiv:2004.09813 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.365"},{"key":"13_CR49","unstructured":"Santurkar, S., Tsipras, D., Madry, A.: Breeds: benchmarks for subpopulation shift. arXiv preprint arXiv:2008.04859 (2020)"},{"key":"13_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1007\/978-3-642-25878-7_34","volume-title":"Graph Drawing","author":"R Sarkar","year":"2012","unstructured":"Sarkar, R.: Low distortion Delaunay embedding of trees in hyperbolic plane. In: van Kreveld, M., Speckmann, B. (eds.) GD 2011. LNCS, vol. 7034, pp. 355\u2013366. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-25878-7_34"},{"key":"13_CR51","doi-asserted-by":"crossref","unstructured":"Shah, A., Sra, S., Chellappa, R., Cherian, A.: Max-margin contrastive learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 8220\u20138230 (2022)","DOI":"10.1609\/aaai.v36i8.20796"},{"key":"13_CR52","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"13_CR53","unstructured":"Suzuki, R., Takahama, R., Onoda, S.: Hyperbolic disk embeddings for directed acyclic graphs. In: International Conference on Machine Learning, pp. 6066\u20136075. PMLR (2019)"},{"key":"13_CR54","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Shalev, Y., Schwartz, I., Wolf, L.: Zerocap: zero-shot image-to-text generation for visual-semantic arithmetic. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17918\u201317928 (2022)","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"13_CR55","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models (2023)"},{"key":"13_CR56","doi-asserted-by":"crossref","unstructured":"Trager, M., Perera, P., Zancato, L., Achille, A., Bhatia, P., Soatto, S.: Linear spaces of meanings: compositional structures in vision-language models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 15395\u201315404 (2023)","DOI":"10.1109\/ICCV51070.2023.01412"},{"key":"13_CR57","unstructured":"Vendrov, I., Kiros, R., Fidler, S., Urtasun, R.: Order-embeddings of images and language. In: Bengio, Y., LeCun, Y. (eds.) ICLR (2016)"},{"issue":"4","key":"13_CR58","doi-asserted-by":"publisher","first-page":"781","DOI":"10.1162\/COLI_a_00301","volume":"43","author":"I Vuli\u0107","year":"2017","unstructured":"Vuli\u0107, I., Gerz, D., Kiela, D., Hill, F., Korhonen, A.: HyperLex: a large-scale evaluation of graded lexical entailment. Comput. Linguist. 43(4), 781\u2013835 (2017)","journal-title":"Comput. Linguist."},{"key":"13_CR59","doi-asserted-by":"crossref","unstructured":"Vuli\u0107, I., Mrk\u0161i\u0107, N.: Specialising word vectors for lexical entailment. arXiv preprint arXiv:1710.06371 (2017)","DOI":"10.18653\/v1\/N18-1103"},{"key":"13_CR60","unstructured":"Xiong, B., et al.: Geometric relational embeddings: a survey. arXiv preprint arXiv:2304.11949 (2023)"},{"key":"13_CR61","doi-asserted-by":"crossref","unstructured":"Xu, Z., et al.: Challenges of zero-shot recognition with vision-language models: granularity and correctness. arXiv preprint arXiv:2306.16048 (2023)","DOI":"10.1109\/CVPRW63382.2024.00189"},{"key":"13_CR62","doi-asserted-by":"crossref","unstructured":"Yi, K., Shen, X., Gou, Y., Elhoseiny, M.: Exploring hierarchical graph representation for large-scale zero-shot image classification. ECCV (2022)","DOI":"10.1007\/978-3-031-20044-1_7"},{"key":"13_CR63","unstructured":"Yuksekgonul, M., Bianchi, F., Kalluri, P., Jurafsky, D., Zou, J.: When and why vision-language models behave like bags-of-words, and what to do about it? In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"13_CR64","doi-asserted-by":"crossref","unstructured":"Zhai, X., et al.: Lit: Zero-shot transfer with locked-image text tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18123\u201318133 (2022)","DOI":"10.1109\/CVPR52688.2022.01759"},{"key":"13_CR65","doi-asserted-by":"crossref","unstructured":"Zhang, C., Gao, J.: Hype-han: hyperbolic hierarchical attention network for semantic embedding. In: Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence, pp. 3990\u20133996 (2021)","DOI":"10.24963\/ijcai.2020\/552"},{"key":"13_CR66","doi-asserted-by":"publisher","unstructured":"Zhang, C., Van\u00a0Durme, B., Li, Z., Stengel-Eskin, E.: Visual commonsense in pretrained unimodal and multimodal models. In: Carpuat, M., de\u00a0Marneffe, M.C., Meza\u00a0Ruiz, I.V. (eds.) Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 5321\u20135335. Association for Computational Linguistics, Seattle, United States (2022).https:\/\/doi.org\/10.18653\/v1\/2022.naacl-main.390, https:\/\/aclanthology.org\/2022.naacl-main.390","DOI":"10.18653\/v1\/2022.naacl-main.390"},{"key":"13_CR67","doi-asserted-by":"publisher","unstructured":"Zhang, H., Hu, Z., Deng, Y., Sachan, M., Yan, Z., Xing, E.: Learning concept taxonomies from multi-modal data. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1791\u20131801. Association for Computational Linguistics, Berlin, Germany (2016). https:\/\/doi.org\/10.18653\/v1\/P16-1169","DOI":"10.18653\/v1\/P16-1169"},{"key":"13_CR68","unstructured":"Zhu, G., et\u00a0al.: Scene graph generation: a comprehensive survey. arXiv preprint arXiv:2201.00443 (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72943-0_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:18:22Z","timestamp":1732803502000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72943-0_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031729423","9783031729430"],"references-count":68,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72943-0_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}