{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T07:48:29Z","timestamp":1769154509181,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":17,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819626403","type":"print"},{"value":"9789819626410","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-2641-0_18","type":"book-chapter","created":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T01:00:20Z","timestamp":1743382820000},"page":"267-280","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Exploring Cross-Attention Maps in\u00a0Multi-modal Diffusion Transformers for\u00a0Training-Free Semantic Segmentation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9163-8031","authenticated-orcid":false,"given":"Rento","family":"Yamaguchi","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0431-183X","authenticated-orcid":false,"given":"Keiji","family":"Yanai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,3,29]]},"reference":[{"key":"18_CR1","doi-asserted-by":"crossref","unstructured":"Barsellotti, L., Amoroso, R., Cornia, M., Baraldi, L., Cucchiara, R.: Training-free open-vocabulary segmentation with offline diffusion-augmented prototype generation. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00354"},{"key":"18_CR2","unstructured":"Brooks, T., et al.: Video generation models as world simulators (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"18_CR3","doi-asserted-by":"crossref","unstructured":"Cordts, M., et al.: The cityscapes dataset for semantic urban scene understanding. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.350"},{"key":"18_CR4","unstructured":"Esser, P., et\u00a0al.: Scaling rectified flow transformers for high-resolution image synthesis. In: ICML (2024)"},{"issue":"2","key":"18_CR5","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The Pascal visual object classes (VOC) challenge. Int. J. Comput. Vis. 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vis."},{"key":"18_CR6","doi-asserted-by":"crossref","unstructured":"Honbu, Y., Yanai, K.: Training-free region prediction with stable diffusion. In: ACM MM (2024)","DOI":"10.1007\/978-3-031-53302-0_2"},{"key":"18_CR7","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: ICCV, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"18_CR8","unstructured":"Lipman, Y., Chen, R.T., Ben-Hamu, H., Nickel, M., Le, M.: Flow matching for generative modeling. In: ICLR (2022)"},{"key":"18_CR9","doi-asserted-by":"crossref","unstructured":"L\u00fcddecke, T., Ecker, A.: Image segmentation using text and image prompts. In: CVPR, pp. 7086\u20137096 (2022)","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"18_CR10","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: ICCV, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"18_CR11","unstructured":"von Platen, P., et al.: Diffusers: state-of-the-art diffusion models (2022). https:\/\/github.com\/huggingface\/diffusers"},{"key":"18_CR12","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML, pp. 8748\u20138763 (2021)"},{"key":"18_CR13","doi-asserted-by":"crossref","unstructured":"Rao, Y., et al.: DenseCLIP: language-guided dense prediction with context-aware prompting. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"18_CR14","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"18_CR15","doi-asserted-by":"crossref","unstructured":"Tian, J., Aggarwal, L., Colaco, A., Kira, Z., Gonzalez-Franco, M.: Diffuse, attend, and segment: unsupervised zero-shot segmentation using stable diffusion. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00341"},{"key":"18_CR16","doi-asserted-by":"crossref","unstructured":"Wu, W., Zhao, Y., Shou, M.Z., Zhou, H., Shen, C.: DiffuMask: synthesizing images with pixel-level annotations for semantic segmentation using diffusion models. In: ICCV, pp. 1206\u20131217 (2023)","DOI":"10.1109\/ICCV51070.2023.00117"},{"key":"18_CR17","first-page":"696","volume-title":"ECCV","author":"C Zhou","year":"2022","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from clip. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV, pp. 696\u2013712. Springer, Cham (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-2641-0_18","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,31]],"date-time":"2025-03-31T01:00:30Z","timestamp":1743382830000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-2641-0_18"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819626403","9789819626410"],"references-count":17,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-2641-0_18","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"29 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hanoi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}