{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,11]],"date-time":"2026-04-11T02:13:48Z","timestamp":1775873628699,"version":"3.50.1"},"publisher-location":"Cham","reference-count":64,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726699","type":"print"},{"value":"9783031726705","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_12","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"204-222","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Getting it Right: Improving Spatial Consistency in\u00a0Text-to-Image Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0961-9569","authenticated-orcid":false,"given":"Agneet","family":"Chatterjee","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6893-6647","authenticated-orcid":false,"given":"Gabriela Ben Melech","family":"Stan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-2860-6198","authenticated-orcid":false,"given":"Estelle","family":"Aflalo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0217-0778","authenticated-orcid":false,"given":"Sayak","family":"Paul","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8518-2696","authenticated-orcid":false,"given":"Dhruba","family":"Ghosh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5593-2804","authenticated-orcid":false,"given":"Tejas","family":"Gokhale","sequence":"additional","affiliation":[]},{"given":"Ludwig","family":"Schmidt","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1055-6657","authenticated-orcid":false,"given":"Hannaneh","family":"Hajishirzi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5907-9898","authenticated-orcid":false,"given":"Vasudev","family":"Lal","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7549-723X","authenticated-orcid":false,"given":"Chitta","family":"Baral","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0126-8976","authenticated-orcid":false,"given":"Yezhou","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"12_CR1","doi-asserted-by":"publisher","unstructured":"Avrahami, O., et al.: SpaText: spatio-textual representation for controllable image generation. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR). IEEE, June 2023. https:\/\/doi.org\/10.1109\/cvpr52729.2023.01762","DOI":"10.1109\/cvpr52729.2023.01762"},{"key":"12_CR2","unstructured":"Beaumont, R.: Clip retrieval: easily compute clip embeddings and build a clip retrieval system with them (2022). https:\/\/github.com\/rom1504\/clip-retrieval"},{"key":"12_CR3","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., Soricut, R.: Conceptual 12M: pushing web-scale image-text pre-training to recognize long-tail visual concepts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568 (2021)","DOI":"10.1109\/CVPR46437.2021.00356"},{"issue":"4","key":"12_CR4","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"12_CR5","unstructured":"Chen, J., et al.: PixArt-$$alpha$$: fast training of diffusion transformer for photorealistic text-to-image synthesis. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Chen, M., Laina, I., Vedaldi, A.: Training-free layout control with cross-attention guidance. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 5343\u20135353 (2024)","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"12_CR7","unstructured":"Chen, X., et\u00a0al.: PaLI: a jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794 (2022)"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Cho, J., Zala, A., Bansal, M.: DALL-Eval: probing the reasoning skills and social biases of text-to-image generation models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3043\u20133054 (2023)","DOI":"10.1109\/ICCV51070.2023.00283"},{"key":"12_CR9","doi-asserted-by":"crossref","unstructured":"Chong, M.J., Forsyth, D.: Effectively unbiased FID and inception score and where to find them. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6070\u20136079 (2020)","DOI":"10.1109\/CVPR42600.2020.00611"},{"key":"12_CR10","doi-asserted-by":"publisher","unstructured":"Dayma, B., et al.: dall$$\\cdot $$e mini, July 2021. https:\/\/doi.org\/10.5281\/zenodo.5146400, https:\/\/github.com\/borisdayma\/dalle-mini","DOI":"10.5281\/zenodo.5146400"},{"key":"12_CR11","first-page":"16890","volume":"35","author":"M Ding","year":"2022","unstructured":"Ding, M., Zheng, W., Hong, W., Tang, J.: CogView2: faster and better text-to-image generation via hierarchical transformers. Adv. Neural. Inf. Process. Syst. 35, 16890\u201316902 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR12","doi-asserted-by":"publisher","unstructured":"Dobreva, R., Keller, F.: Investigating negation in pre-trained vision-and-language models. In: Bastings, J., et al. (eds.) Proceedings of the Fourth BlackboxNLP Workshop on Analyzing and Interpreting Neural Networks for NLP, pp. 350\u2013362. Association for Computational Linguistics, Punta Cana, Dominican Republic, November 2021. https:\/\/doi.org\/10.18653\/v1\/2021.blackboxnlp-1.27, https:\/\/aclanthology.org\/2021.blackboxnlp-1.27","DOI":"10.18653\/v1\/2021.blackboxnlp-1.27"},{"key":"12_CR13","unstructured":"Feng, W., et al.: Training-free structured diffusion guidance for compositional text-to-image synthesis (2023)"},{"key":"12_CR14","unstructured":"Feng, W., et al.: LayoutGPT: compositional visual planning and generation with large language models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"12_CR15","unstructured":"Gao, J., Hu, K., Xu, G., Xu, H.: Can pre-trained text-to-image models generate visual goals for reinforcement learning? In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"12_CR16","unstructured":"Ghosh, D., Hajishirzi, H., Schmidt, L.: GenEval: an object-focused framework for evaluating text-to-image alignment. In: Thirty-Seventh Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2023). https:\/\/openreview.net\/forum?id=Wbr51vK331"},{"key":"12_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1007\/978-3-030-58589-1_23","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Gokhale","year":"2020","unstructured":"Gokhale, T., Banerjee, P., Baral, C., Yang, Y.: VQA-LOL: visual question answering under the lens of logic. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12366, pp. 379\u2013396. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_23"},{"key":"12_CR18","doi-asserted-by":"publisher","unstructured":"Gokhale, T., Chaudhary, A., Banerjee, P., Baral, C., Yang, Y.: Semantically distributed robust optimization for vision-and-language inference. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Findings of the Association for Computational Linguistics: ACL 2022, pp. 1493\u20131513. Association for Computational Linguistics, Dublin, Ireland, May 2022. https:\/\/doi.org\/10.18653\/v1\/2022.findings-acl.118, https:\/\/aclanthology.org\/2022.findings-acl.118","DOI":"10.18653\/v1\/2022.findings-acl.118"},{"key":"12_CR19","unstructured":"Gokhale, T., et al.: Benchmarking spatial relationships in text-to-image generation. arXiv preprint arXiv:2212.10015 (2022)"},{"key":"12_CR20","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-or, D.: Prompt-to-prompt image editing with cross-attention control. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=_CDixzkzeyb"},{"key":"12_CR21","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"12_CR22","first-page":"78723","volume":"36","author":"K Huang","year":"2023","unstructured":"Huang, K., Sun, K., Xie, E., Li, Z., Liu, X.: T2I-CompBench: a comprehensive benchmark for open-world compositional text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 78723\u201378747 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Jayasumana, S., Ramalingam, S., Veit, A., Glasner, D., Chakrabarti, A., Kumar, S.: Rethinking FID: towards a better evaluation metric for image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9307\u20139315 (2024)","DOI":"10.1109\/CVPR52733.2024.00889"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Jing, L., Li, R., Chen, Y., Jia, M., Du, X.: FAITHSCORE: evaluating hallucinations in large vision-language models. arXiv preprint arXiv:2311.01477 (2023)","DOI":"10.18653\/v1\/2024.findings-emnlp.290"},{"key":"12_CR25","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Kondapaneni, N., Marks, M., Knott, M., Guimaraes, R., Perona, P.: Text-image alignment for diffusion-based perception. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13883\u201313893 (2024)","DOI":"10.1109\/CVPR52733.2024.01317"},{"key":"12_CR27","unstructured":"Kornblith, S., Norouzi, M., Lee, H., Hinton, G.: Similarity of neural network representations revisited. In: International Conference on Machine Learning, pp. 3519\u20133529. PMLR (2019)"},{"key":"12_CR28","unstructured":"kuprel: min-dalle (2022). https:\/\/github.com\/kuprel\/min-dalle"},{"key":"12_CR29","unstructured":"Lee, D., et al.: Karlo-v1.0.alpha on COYO-100M and CC15M (2022). https:\/\/github.com\/kakaobrain\/karlo"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"12_CR31","unstructured":"Lian, L., Li, B., Yala, A., Darrell, T.: LLM-grounded diffusion: enhancing prompt understanding of text-to-image diffusion models with large language models. arXiv preprint abs\/2305.13655 (2023). https:\/\/arxiv.org\/abs\/2305.13655"},{"key":"12_CR32","doi-asserted-by":"publisher","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, 6\u201312 September 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"12_CR33","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26296\u201326306 (2024)","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"12_CR34","doi-asserted-by":"publisher","unstructured":"Liu, N., Li, S., Du, Y., Torralba, A., Tenenbaum, J.B.: Compositional visual generation with composable diffusion models. In: Avidan, S., Brostow, G., Cisse, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022. ECCV 2022. LNCS, vol. 13677, pp. 423\u2013439. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_26","DOI":"10.1007\/978-3-031-19790-1_26"},{"key":"12_CR35","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2019). https:\/\/openreview.net\/forum?id=Bkg6RiCqY7"},{"key":"12_CR36","unstructured":"Luo, S., Tan, Y., Huang, L., Li, J., Zhao, H.: Latent consistency models: synthesizing high-resolution images with few-step inference (2023)"},{"key":"12_CR37","unstructured":"Nguyen, T., Raghu, M., Kornblith, S.: Do wide and deep networks learn the same things? Uncovering how neural network representations vary with width and depth (2021)"},{"key":"12_CR38","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models (2022)"},{"key":"12_CR39","unstructured":"OpenAI: Dalle-3 (2023). https:\/\/openai.com\/dall-e-3"},{"key":"12_CR40","unstructured":"OpenAI: GPT-4(v) (2023). https:\/\/cdn.openai.com\/papers\/GPTV_System_Card.pdf"},{"key":"12_CR41","doi-asserted-by":"crossref","unstructured":"Parmar, G., Zhang, R., Zhu, J.Y.: On aliased resizing and surprising subtleties in GAN evaluation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11410\u201311420 (2022)","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"12_CR42","doi-asserted-by":"crossref","unstructured":"Patel, M., Kim, C., Cheng, S., Baral, C., Yang, Y.: ECLIPSE: a resource-efficient text-to-image prior for image generations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9069\u20139078 (2024)","DOI":"10.1109\/CVPR52733.2024.00866"},{"key":"12_CR43","unstructured":"von Platen, P., et al.: Diffusers: state-of-the-art diffusion models (2022). https:\/\/github.com\/huggingface\/diffusers"},{"key":"12_CR44","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. In: The Twelfth International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=di52zR8xgf"},{"key":"12_CR45","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18\u201324 July 2021, Virtual Event. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021). http:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"issue":"140","key":"12_CR46","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. J. Mach. Learn. Res. 21(140), 1\u201367 (2020)","journal-title":"J. Mach. Learn. Res."},{"key":"12_CR47","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125, vol. 1(2), p. 3 (2022)"},{"key":"12_CR48","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18\u201324 July 2021, Virtual Event. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8821\u20138831. PMLR (2021). http:\/\/proceedings.mlr.press\/v139\/ramesh21a.html"},{"key":"12_CR49","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"12_CR50","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR51","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR52","unstructured":"Segalis, E., Valevski, D., Lumen, D., Matias, Y., Leviathan, Y.: A picture is worth a thousand words: principled recaptioning improves image generation. arXiv preprint arXiv:2310.16656 (2023)"},{"key":"12_CR53","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"12_CR54","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-A-Video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7623\u20137633 (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"12_CR55","unstructured":"Wu, W., et al.: Paragraph-to-image generation with information-enriched diffusion model. arXiv preprint arXiv:2311.14284 (2023)"},{"key":"12_CR56","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2945\u20132954, June 2023","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"12_CR57","doi-asserted-by":"crossref","unstructured":"Yang, Z., et\u00a0al.: ReCo: region-controlled text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14246\u201314255 (2023)","DOI":"10.1109\/CVPR52729.2023.01369"},{"key":"12_CR58","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. Trans. Mach. Learn. Res. (2022). https:\/\/openreview.net\/forum?id=Ee277P3AYC"},{"key":"12_CR59","doi-asserted-by":"crossref","unstructured":"Yun, S., Park, S.H., Seo, P.H., Shin, J.: IFSeg: image-free semantic segmentation via vision-language model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2967\u20132977, June 2023","DOI":"10.1109\/CVPR52729.2023.00290"},{"key":"12_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"12_CR61","unstructured":"Zhang, T., Zhang, Y., Vineet, V., Joshi, N., Wang, X.: Controllable text-to-image generation with GPT-4. arXiv preprint arXiv:2305.18583 (2023)"},{"key":"12_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et\u00a0al.: Recognize anything: a strong image tagging model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1724\u20131732 (2024)","DOI":"10.1109\/CVPRW63382.2024.00179"},{"key":"12_CR63","doi-asserted-by":"crossref","unstructured":"Zhao, W., Rao, Y., Liu, Z., Liu, B., Zhou, J., Lu, J.: Unleashing text-to-image diffusion models for visual perception. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5729\u20135739 (2023)","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"12_CR64","unstructured":"Zhong, M., et al.: Multi-LoRA composition for image generation. arXiv preprint arXiv:2402.16843 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:20:43Z","timestamp":1732828843000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":64,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}