{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T20:20:22Z","timestamp":1776889222281,"version":"3.51.2"},"publisher-location":"Cham","reference-count":65,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729720","type":"print"},{"value":"9783031729737","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72973-7_27","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T14:03:04Z","timestamp":1730383384000},"page":"464-481","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Controllable Contextualized Image Captioning: Directing the\u00a0Visual Narrative Through User-Defined Highlights"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-0326-9004","authenticated-orcid":false,"given":"Shunqi","family":"Mao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8492-9711","authenticated-orcid":false,"given":"Chaoyi","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0527-9919","authenticated-orcid":false,"given":"Hang","family":"Su","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1105-0818","authenticated-orcid":false,"given":"Hwanjun","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9664-1774","authenticated-orcid":false,"given":"Igor","family":"Shalyminov","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3706-8896","authenticated-orcid":false,"given":"Weidong","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"27_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (NeurIPS). vol.\u00a035, pp. 23716\u201323736 (2022)"},{"key":"27_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition (CVPR), pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Aneja, J., Agrawal, H., Batra, D., Schwing, A.G.: Sequential latent spaces for modeling the intention during diverse image captioning. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 4260\u20134269 (2019)","DOI":"10.1109\/ICCV.2019.00436"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Biten, A., Gomez, L., Rusinol, M., Karatzas, D.: Good news, everyone! context driven entity-aware captioning for news images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12458\u201312467 (2019)","DOI":"10.1109\/CVPR.2019.01275"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Burns, A., et al.: A suite of generative tasks for multi-level multimodal webpage understanding. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP) (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.119"},{"key":"27_CR6","doi-asserted-by":"crossref","unstructured":"Carlsson, F., \u00d6hman, J., Liu, F., Verlinden, S., Nivre, J., Sahlgren, M.: Fine-grained controllable text generation using non-residual prompting. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 6837\u20136857 (2022)","DOI":"10.18653\/v1\/2022.acl-long.471"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., Yi, X., Sun, M., Li, W., Yang, C., Guo, Z.: Sentiment-controllable chinese poetry generation. In: Proceedings of the Twenty-Eighth International Joint Conference on Artificial Intelligence (IJCAI), pp. 4925\u20134931 (2019)","DOI":"10.24963\/ijcai.2019\/684"},{"key":"27_CR8","unstructured":"Chowdhery, A., et\u00a0al.: Palm: Scaling language modeling with pathways. arXiv preprint arXiv:2204.02311 (2022)"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"27_CR10","unstructured":"Dai, W., et al.: InstructBLIP: Towards general-purpose vision-language models with instruction tuning. arXiv preprint arXiv:2305.06500 (2023)"},{"key":"27_CR11","unstructured":"Dathathri, S., et al.: Plug and play language models: a simple approach to controlled text generation. In: Proceedings of the International Conference on Learning Representations (ICLR) (2020)"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Ding, N., Deng, C., Tan, M., Du, Q., Ge, Z., Wu, Q.: Image captioning with controllable and adaptive length levels. IEEE Trans. Patt. Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3328298"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Fei, J., Wang, T., Zhang, J., He, Z., Wang, C., Zheng, F.: Transferable decoding with visual entities for zero-shot image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 3136\u20133146 (2023)","DOI":"10.1109\/ICCV51070.2023.00291"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"Guo, M., et al.: LongT5: efficient text-to-text transformer for long sequences. In: North American Chapter of the Association for Computational Linguistics (NAACL), pp. 724\u2013736 (2022)","DOI":"10.18653\/v1\/2022.findings-naacl.55"},{"issue":"5","key":"27_CR15","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1145\/253769.253798","volume":"40","author":"A Gupta","year":"1997","unstructured":"Gupta, A., Jain, R.: Visual information retrieval. Commun. ACM 40(5), 70\u201379 (1997)","journal-title":"Commun. ACM"},{"key":"27_CR16","doi-asserted-by":"crossref","unstructured":"He, X.: Parallel refinements for lexically constrained text generation with BART. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 8653\u20138666 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.681"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2022)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"27_CR18","doi-asserted-by":"crossref","unstructured":"Hu, X., et al.: Scaling up vision-language pre-training for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 17980\u201317989 (2022)","DOI":"10.1109\/CVPR52688.2022.01745"},{"key":"27_CR19","unstructured":"Huang, S., et al.: Language is not all you need: Aligning perception with language models. In: Oh, A., Neumann, T., Globerson, A., Saenko, K., Hardt, M., Levine, S. (eds.) Advances in Neural Information Processing Systems (NeurIPS). vol.\u00a036, pp. 72096\u201372109 (2023)"},{"key":"27_CR20","unstructured":"Kalarani, A.R., Bhattacharyya, P., Chhaya, N., Shekhar, S.: Let\u2019s not quote out of context: unified vision-language pretraining for context assisted image captioning. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL) (2023)"},{"key":"27_CR21","unstructured":"Khalifa, M., Elsahar, H., Dymetman, M.: A distributional approach to controlled text generation. In: Proceedings of the International Conference on Learning Representations (ICLR) (2021)"},{"key":"27_CR22","unstructured":"Koh, J.Y., Salakhutdinov, R., Fried, D.: Grounding language models to images for multimodal inputs and outputs. In: Proceedings of the International Conference on Machine Learning (ICML) (2023)"},{"key":"27_CR23","doi-asserted-by":"crossref","unstructured":"Krause, J., Johnson, J., Krishna, R., Fei-Fei, L.: A hierarchical approach for generating descriptive image paragraphs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 317\u2013325 (2017)","DOI":"10.1109\/CVPR.2017.356"},{"key":"27_CR24","doi-asserted-by":"crossref","unstructured":"Li, C., et al.: mPLUG: effective and efficient vision-language learning by cross-modal skip-connections. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 7241\u20137259 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"27_CR25","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: Proceedings of the International Conference on Machine Learning (ICML) (2023)"},{"key":"27_CR26","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"27_CR27","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (NeurIPS). vol.\u00a036, pp. 34892\u201334916 (2023)"},{"key":"27_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Y., Iter, D., Xu, Y., Wang, S., Xu, R., Zhu, C.: G-Eval: Nlg evaluation using gpt-4 with better human alignment. arXiv preprint arXiv:2303.16634 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.153"},{"key":"27_CR29","doi-asserted-by":"crossref","unstructured":"Liu, Y., Jia, Q., Zhu, K.: Length control in abstractive summarization by pretraining information selection. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 6885\u20136895 (2022)","DOI":"10.18653\/v1\/2022.acl-long.474"},{"key":"27_CR30","doi-asserted-by":"crossref","unstructured":"Makino, T., Iwakura, T., Takamura, H., Okumura, M.: Global optimization under length constraint for neural text summarization. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 1039\u20131048 (2019)","DOI":"10.18653\/v1\/P19-1099"},{"key":"27_CR31","doi-asserted-by":"crossref","unstructured":"Nguyen, K., Biten, A.F., Mafla, A., Gomez, L., Karatzas, D.: Show, interpret and tell: Entity-aware contextualised image captioning in wikipedia. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 1940\u20131948 (2022)","DOI":"10.1609\/aaai.v37i2.25285"},{"key":"27_CR32","doi-asserted-by":"publisher","unstructured":"Nguyen, V.Q., Suganuma, M., Okatani, T.: GRIT: faster and better image captioning transformer using dual visual features. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 167\u2013184. Springer (2022).https:\/\/doi.org\/10.1007\/978-3-031-20059-5_10","DOI":"10.1007\/978-3-031-20059-5_10"},{"key":"27_CR33","unstructured":"OpenAI: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"27_CR35","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., jing Zhu, W.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the Association for Computational Linguistic (ACL), pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"27_CR36","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Puduppully, R., Dong, L., Lapata, M.: Data-to-text generation with content selection and planning. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 6908\u20136915 (2019)","DOI":"10.1609\/aaai.v33i01.33016908"},{"key":"27_CR38","doi-asserted-by":"crossref","unstructured":"Qu, T., Tuytelaars, T., Moens, M.F.: Visually-aware context modeling for news image captioning. arXiv preprint arXiv:2308.08325 (2023)","DOI":"10.18653\/v1\/2024.naacl-long.162"},{"key":"27_CR39","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 8748\u20138763 (2021)"},{"key":"27_CR40","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)"},{"key":"27_CR41","doi-asserted-by":"crossref","unstructured":"Ramisa, A., Yan, F., Moreno-Noguer, F., Mikolajczyk, K.: Breakingnews: article annotation by image and text processing. In: Proceedings of the IEEE Transactions on Pattern Analysis and Machine Intelligence (PAMI), pp. 1072\u20131085 (2018)","DOI":"10.1109\/TPAMI.2017.2721945"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Ribeiro, L.F.R., Zhang, Y., Gurevych, I.: Structural adapters in pretrained language models for AMR-to-Text generation. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 4269\u20134282 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.351"},{"issue":"01","key":"27_CR43","doi-asserted-by":"publisher","first-page":"842","DOI":"10.1109\/TAFFC.2021.3073809","volume":"14","author":"Y Ruan","year":"2023","unstructured":"Ruan, Y., Ling, Z.: Emotion-regularized conditional variational autoencoder for emotional response generation. IEEE Trans. Affect. Comput. 14(01), 842\u2013848 (2023)","journal-title":"IEEE Trans. Affect. Comput."},{"key":"27_CR44","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics, pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"27_CR45","doi-asserted-by":"crossref","unstructured":"Song, H., Wang, Y., Zhang, K., Zhang, W.N., Liu, T.: BoB: BERT over BERT for training persona-based dialogue models from limited personalized data. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 167\u2013177 (2021)","DOI":"10.18653\/v1\/2021.acl-long.14"},{"key":"27_CR46","doi-asserted-by":"crossref","unstructured":"Song, H., Wang, Y., Zhang, W.N., Liu, X., Liu, T.: Generate, delete and rewrite: a three-stage framework for improving persona consistency of dialogue generation. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 5821\u20135831 (2020)","DOI":"10.18653\/v1\/2020.acl-main.516"},{"key":"27_CR47","doi-asserted-by":"crossref","unstructured":"Song, Z., Zheng, X., Liu, L., Xu, M., Huang, X.: Generating responses with a specific emotion in dialog. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 3685\u20133695 (2019)","DOI":"10.18653\/v1\/P19-1359"},{"key":"27_CR48","unstructured":"Su, Y., et al.: Language models can see: Plugging visual controls in text generation. arXiv preprint arXiv:2205.02655 (2022)"},{"key":"27_CR49","unstructured":"Touvron, H., et\u00a0al.: LLaMA 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"27_CR50","doi-asserted-by":"crossref","unstructured":"Tran, A., Mathews, A., Xie, L.: Transform and tell: entity-aware news image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13035\u201313045 (2020)","DOI":"10.1109\/CVPR42600.2020.01305"},{"key":"27_CR51","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S.M.A., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. In: Advances in Neural Information Processing Systems (NeurIPS). vol.\u00a034, pp. 200\u2013212 (2021)"},{"key":"27_CR52","unstructured":"Wang, J., et al.: GIT: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100 (2022)"},{"key":"27_CR53","doi-asserted-by":"crossref","unstructured":"Wang, N., Xie, J., Wu, J., Jia, M., Li, L.: Controllable image captioning via prompting. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI). vol.\u00a037, pp. 2617\u20132625 (2023)","DOI":"10.1609\/aaai.v37i2.25360"},{"key":"27_CR54","unstructured":"Wang, P., et al.: Large language models are not fair evaluators. arXiv preprint arXiv:2305.17926 (2023)"},{"key":"27_CR55","unstructured":"Wang, P., et al.: Ofa: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: Proceedings of the International Conference on Machine Learning (ICML), pp. 23318\u201323340 (2022)"},{"key":"27_CR56","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Knowledge prompt makes composed pre-trained models zero-shot news captioner. In: Proceedings of the IEEE International Conference on Multimedia and Expo (ICME), pp. 28779\u20132884 (2023)","DOI":"10.1109\/ICME55011.2023.00489"},{"key":"27_CR57","doi-asserted-by":"crossref","unstructured":"Yang, P., Li, L., Luo, F., Liu, T., Sun, X.: Enhancing topic-to-essay generation with external commonsense knowledge. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 2002\u20132012 (2019)","DOI":"10.18653\/v1\/P19-1193"},{"key":"27_CR58","doi-asserted-by":"crossref","unstructured":"Yang, X., Karaman, S., Tetreault, J., Jaimes, A.: Journalistic guidelines aware news image captioning. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 5162\u20135175 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.419"},{"key":"27_CR59","doi-asserted-by":"crossref","unstructured":"Zeng, Z., Zhang, H., Lu, R., Wang, D., Chen, B., Wang, Z.: ConZIC: controllable zero-shot image captioning by sampling-based polishing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23465\u201323476 (2023)","DOI":"10.1109\/CVPR52729.2023.02247"},{"key":"27_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, J., Fang, S., Mao, Z., Zhang, Z., Zhang, Y.: Fine-tuning with multi-modal entity prompts for news image captioning. In: Proceedings of the ACM International Conference on Multimedia (MM), pp. 4365\u20134373. MM \u201922 (2022)","DOI":"10.1145\/3503161.3547883"},{"key":"27_CR61","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wu, X.: Boosting entity-aware image captioning with multi-modal knowledge graph. IEEE Trans. Multi. 1\u201312 (2023)","DOI":"10.1109\/TMM.2023.3301279"},{"key":"27_CR62","unstructured":"Zhao, Y., et al.: Controllable dense captioner with multimodal embedding bridging. arXiv preprint arXiv:2401.17910 (2024)"},{"key":"27_CR63","doi-asserted-by":"crossref","unstructured":"Zhong, P., Zhang, C., Wang, H., Liu, Y., Miao, C.: Towards persona-based empathetic conversational models. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6556\u20136566 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.531"},{"key":"27_CR64","doi-asserted-by":"crossref","unstructured":"Zhou, M., Luo, G., Rohrbach, A., Yu, Z.: Focus! relevant and sufficient context selection for news image captioning. In: Findings of the Association for Computational Linguistics: Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 6078\u20136088 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.450"},{"key":"27_CR65","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72973-7_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,15]],"date-time":"2025-02-15T15:00:04Z","timestamp":1739631604000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72973-7_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9783031729720","9783031729737"],"references-count":65,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72973-7_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}