{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T16:46:04Z","timestamp":1777653964573,"version":"3.51.4"},"publisher-location":"Cham","reference-count":55,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031938054","type":"print"},{"value":"9783031938061","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-93806-1_11","type":"book-chapter","created":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T18:29:10Z","timestamp":1748716150000},"page":"124-142","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["A Spitting Image: Modular Superpixel Tokenization in\u00a0Vision Transformers"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2353-9984","authenticated-orcid":false,"given":"Marius","family":"Aasan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8159-352X","authenticated-orcid":false,"given":"Odd","family":"Kolbj\u00f8rnsen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6149-971X","authenticated-orcid":false,"given":"Anne Schistad","family":"Solberg","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4321-9075","authenticated-orcid":false,"given":"Ad\u00edn Ramirez","family":"Rivera","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,20]]},"reference":[{"key":"11_CR1","doi-asserted-by":"publisher","unstructured":"Abnar, S., Zuidema, W.H.: Quantifying attention flow in transformers. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J.R. (eds.) Conference on Association for Computational Linguistics (ACL), pp. 4190\u20134197. Association for Computational Linguistics (2020). https:\/\/doi.org\/10.18653\/v1\/2020.acl-main.385","DOI":"10.18653\/v1\/2020.acl-main.385"},{"issue":"11","key":"11_CR2","doi-asserted-by":"publisher","first-page":"2274","DOI":"10.1109\/TPAMI.2012.120","volume":"34","author":"R Achanta","year":"2012","unstructured":"Achanta, R., Shaji, A., Smith, K., Lucchi, A., Fua, P., S\u00fcsstrunk, S.: Slic superpixels compared to state-of-the-art superpixel methods. IEEE Trans. Pattern Anal. Mach. Intell. 34(11), 2274\u20132282 (2012). https:\/\/doi.org\/10.1109\/TPAMI.2012.120","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"11_CR3","unstructured":"Adebayo, J., Gilmer, J., Muelly, M., Goodfellow, I.J., Hardt, M., Kim, B.: Sanity checks for saliency maps. In: Bengio, S., Wallach, H.M., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Advances in Neural Information Processing Systems (NeurIPS), pp. 9525\u20139536 (2018). https:\/\/proceedings.neurips.cc\/paper\/2018\/hash\/294a8ed24b1ad22ec2e7efea049b8737-Abstract.html"},{"key":"11_CR4","doi-asserted-by":"publisher","unstructured":"Beyer, L., H\u00e9naff, O.J., Kolesnikov, A., Zhai, X., van\u00a0den Oord, A.: Are we done with ImageNet? CoRR abs\/arXiv: 2006.07159 (2020). https:\/\/doi.org\/10.48550\/arXiv.2006.07159","DOI":"10.48550\/arXiv.2006.07159"},{"key":"11_CR5","unstructured":"Bolya, D., Fu, C.Y., Dai, X., Zhang, P., Feichtenhofer, C., Hoffman, J.: Token merging: your vit but faster. In: International Conference on Learning Representations (ICLR) (2023). https:\/\/openreview.net\/forum?id=JroZRaRw7Eu"},{"key":"11_CR6","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems (NeurIPS) (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html"},{"key":"11_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"11_CR8","doi-asserted-by":"publisher","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: IEEE\/CVF International Conference on Pattern Recognition (CVPR), pp. 9630\u20139640, IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00951","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"11_CR9","doi-asserted-by":"publisher","unstructured":"Chan, C.S., Kong, H., Guanqing, L.: A comparative study of faithfulness metrics for model interpretability methods. In: Conference on Association for Computational Linguistics (ACL), pp. 5029\u20135038, Association for Computational Linguistics, Dublin, Ireland (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.345","DOI":"10.18653\/v1\/2022.acl-long.345"},{"key":"11_CR10","doi-asserted-by":"publisher","unstructured":"Dalal, N., Triggs, B.: Histograms of oriented gradients for human detection. In: IEEE\/CVF International Conference on Pattern Recognition (CVPR), vol.\u00a01, pp. 886\u2013893 v (2005). https:\/\/doi.org\/10.1109\/CVPR.2005.177","DOI":"10.1109\/CVPR.2005.177"},{"key":"11_CR11","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: IEEE\/CVF International Conference on Pattern Recognition (CVPR), pp. 248\u2013255. IEEE Computer Society (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"11_CR12","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Burstein, J., Doran, C., Solorio, T. (eds.) Conference on North American Chapter of the Association for Computational Linguistics (NAACL), pp. 4171\u20134186, Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/v1\/n19-1423","DOI":"10.18653\/v1\/n19-1423"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"DeYoung, J., et al.: ERASER: a benchmark to evaluate rationalized NLP models. In: Conference on Association for Computational Linguistics (ACL), pp. 4443\u20134458, Association for Computational Linguistics, Online (Jul 2020). https:\/\/aclanthology.org\/2020.acl-main.408","DOI":"10.18653\/v1\/2020.acl-main.408"},{"key":"11_CR14","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2021). https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Fellbaum, C.: WordNet: An Electronic Lexical Database. Bradford Books (1998). https:\/\/mitpress.mit.edu\/9780262561167\/","DOI":"10.7551\/mitpress\/7287.001.0001"},{"key":"11_CR16","doi-asserted-by":"publisher","unstructured":"Griffin, G., Holub, A., Perona, P.: Caltech 256 (Apr 2022). https:\/\/doi.org\/10.22002\/D1.20087","DOI":"10.22002\/D1.20087"},{"key":"11_CR17","unstructured":"Hamilton, M., Zhang, Z., Hariharan, B., Snavely, N., Freeman, W.T.: Unsupervised semantic segmentation by distilling feature correspondences. In: International Conference on Learning Representations (ICLR) (2022). https:\/\/openreview.net\/forum?id=SaKO6z6Hl0c"},{"key":"11_CR18","doi-asserted-by":"publisher","unstructured":"Havtorn, J.D., Royer, A., Blankevoort, T., Bejnordi, B.E.: Msvit: dynamic mixed-scale tokenization for vision transformers. In: IEEE International Conference on Computer Vision (ICCV), pp. 838\u2013848 (October 2023). https:\/\/doi.org\/10.1109\/ICCVW60793.2023.00091","DOI":"10.1109\/ICCVW60793.2023.00091"},{"key":"11_CR19","unstructured":"Huang, H., Zhou, X., Cao, J., He, R., Tan, T.: Vision transformer with super token sampling (2022)"},{"key":"11_CR20","doi-asserted-by":"publisher","unstructured":"Johnson, M., et al.: Google\u2019s multilingual neural machine translation system: Enabling zero-shot translation. Trans. Assoc. Comput. Linguistics 5, 339\u2013351 (2017). https:\/\/doi.org\/10.1162\/tacl_a_00065","DOI":"10.1162\/tacl_a_00065"},{"key":"11_CR21","doi-asserted-by":"publisher","unstructured":"Kirillov, A., et al.: Segment anything (2023). https:\/\/doi.org\/10.1109\/ICCV51070.2023.00371","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"11_CR22","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"11_CR23","doi-asserted-by":"publisher","unstructured":"Ladick\u00fd, L., Russell, C., Kohli, P., Torr, P.H.: Associative hierarchical crfs for object class image segmentation. In: IEEE IEEE International Conference on Computer Vision (ICCV), pp. 739\u2013746 (2009). https:\/\/doi.org\/10.1109\/ICCV.2009.5459248","DOI":"10.1109\/ICCV.2009.5459248"},{"key":"11_CR24","doi-asserted-by":"publisher","unstructured":"Leung, T., Malik, J.: Representing and recognizing the visual appearance of materials using three-dimensional textons. Inter. J. Comput. Vis. 43(1), 29\u201344 (Jun 2001), ISSN 1573-1405, https:\/\/doi.org\/10.1023\/A:1011126920638","DOI":"10.1023\/A:1011126920638"},{"key":"11_CR25","doi-asserted-by":"publisher","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: IEEE International Conference on Computer Vision (ICCV), pp. 9992\u201310002. IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00986","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"11_CR26","unstructured":"Ma, X., Zhou, Y., Wang, H., Qin, C., Sun, B., Liu, C., Fu, Y.: Image as set of points. In: International Conference on Learning Representations (ICLR) (2023). https:\/\/openreview.net\/forum?id=awnvqZja69"},{"key":"11_CR27","doi-asserted-by":"publisher","unstructured":"Moore, A.P., Prince, S.J.D., Warrell, J., Mohammed, U., Jones, G.: Superpixel lattices. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR) (2008). https:\/\/doi.org\/10.1109\/CVPR.2008.4587471","DOI":"10.1109\/CVPR.2008.4587471"},{"key":"11_CR28","unstructured":"Oquab, M., et al.: Dinov2: learning robust visual features without supervision. Trans. Mach. Learn. Res. (2024). https:\/\/openreview.net\/forum?id=a68SUt6zFt"},{"issue":"7","key":"11_CR29","doi-asserted-by":"publisher","first-page":"629","DOI":"10.1109\/34.56205","volume":"12","author":"P Perona","year":"1990","unstructured":"Perona, P., Malik, J.: Scale-space and edge detection using anisotropic diffusion. IEEE Trans. Pattern Anal. Mach. Intell. 12(7), 629\u2013639 (1990). https:\/\/doi.org\/10.1109\/34.56205","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"11_CR30","doi-asserted-by":"publisher","unstructured":"Ribeiro, M.T., Singh, S., Guestrin, C.: \u201cwhy should I trust you?\u201d explaining the predictions of any classifier. In: ACM Conference on Knowledge Discovery and Data Mining (ACM SIGKDD), pp. 1135\u20131144 (2016). https:\/\/doi.org\/10.1145\/2939672.2939778","DOI":"10.1145\/2939672.2939778"},{"key":"11_CR31","doi-asserted-by":"publisher","unstructured":"Ronen, T., Levy, O., Golbert, A.: Vision transformers with mixed-resolution tokenization. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4612\u20134621 (2023). https:\/\/doi.org\/10.48550\/arXiv.2304.00287","DOI":"10.48550\/arXiv.2304.00287"},{"key":"11_CR32","unstructured":"Ryoo, M.S., Piergiovanni, A.J., Arnab, A., Dehghani, M., Angelova, A.: TokenLearner: adaptive space-time tokenization for videos. In: Ranzato, M., Beygelzimer, A., Dauphin, Y.N., Liang, P., Vaughan, J.W. (eds.) Advances in Neural Information Processing Systems (NeurIPS), pp. 12786\u201312797 (2021), https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/6a30e32e56fce5cf381895dfe6ca7b6f-Abstract.html"},{"key":"11_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1007\/978-3-540-69866-1_2","volume-title":"Complex Motion","author":"H Scharr","year":"2007","unstructured":"Scharr, H.: Optimal filters for extended optical flow. In: J\u00e4hne, B., Mester, R., Barth, E., Scharr, H. (eds.) IWCM 2004. LNCS, vol. 3417, pp. 14\u201329. Springer, Heidelberg (2007). https:\/\/doi.org\/10.1007\/978-3-540-69866-1_2"},{"key":"11_CR34","doi-asserted-by":"publisher","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics, ACL 2016, 7-12 August 2016, Berlin, Germany, Volume 1: Long Papers. The Association for Computer Linguistics (2016), https:\/\/doi.org\/10.18653\/v1\/p16-1162","DOI":"10.18653\/v1\/p16-1162"},{"issue":"8","key":"11_CR35","doi-asserted-by":"publisher","first-page":"888","DOI":"10.1109\/34.868688","volume":"22","author":"J Shi","year":"2000","unstructured":"Shi, J., Malik, J.: Normalized cuts and image segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 22(8), 888\u2013905 (2000)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"11_CR36","unstructured":"Steiner, A., Kolesnikov, A., Zhai, X., Wightman, R., Uszkoreit, J., Beyer, L.: How to train your vit? data, augmentation, and regularization in vision transformers. Trans. Mach. Learn. Res. (2022). https:\/\/openreview.net\/forum?id=4nPswr1KcP"},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Stutz, D., Hermans, A., Leibe, B.: Superpixels: an evaluation of the state-of-the-art. Comput. Vis. Image Underst. 166, 1\u201327 (2018), ISSN 1077-3142, https:\/\/www.sciencedirect.com\/science\/article\/pii\/S1077314217300589","DOI":"10.1016\/j.cviu.2017.03.007"},{"key":"11_CR38","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: Meila, M., Zhang, T. (eds.) International Conference on Machine Learning (ICML), Proceedings of Machine Learning Research, vol. 139, pp. 10347\u201310357. PMLR (2021). http:\/\/proceedings.mlr.press\/v139\/touvron21a.html"},{"key":"11_CR39","doi-asserted-by":"publisher","unstructured":"Touvron, H., Cord, M., J\u00e9gou, H.: Deit III: revenge of the vit. In: Avidan, S., Brostow, G.J., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) European Conf. Comput. Vis. (ECCV), LNCS. vol. 13684, pp. 516\u2013533, Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_30","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"11_CR40","doi-asserted-by":"publisher","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers. In: IEEE International Conference on Computer Vision (ICCV), pp. 32\u201342. IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00010","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"11_CR41","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Guyon, I., et al. (eds.) Advances in Neural Information Processing Systems (NeurIPS), pp. 5998\u20136008 (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"11_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"705","DOI":"10.1007\/978-3-540-88693-8_52","volume-title":"Computer Vision \u2013 ECCV 2008","author":"A Vedaldi","year":"2008","unstructured":"Vedaldi, A., Soatto, S.: Quick shift and kernel methods for mode seeking. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008. LNCS, vol. 5305, pp. 705\u2013718. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-88693-8_52"},{"key":"11_CR43","doi-asserted-by":"publisher","unstructured":"Wang, L., et al.: Learning to detect salient objects with image-level supervision. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR) (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.404","DOI":"10.1109\/CVPR.2017.404"},{"key":"11_CR44","doi-asserted-by":"publisher","unstructured":"Wang, W., et al.: Pyramid vision transformer: a versatile backbone for dense prediction without convolutions. In: IEEE International Conference on Computer Vision(ICCV), pp. 548\u2013558. IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00061","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"11_CR45","doi-asserted-by":"publisher","unstructured":"Wang, W., et al.: PVT v2: improved baselines with pyramid vision transformer. Comput. Visual Media, 1\u201310 (2022). https:\/\/doi.org\/10.1007\/s41095-022-0274-8","DOI":"10.1007\/s41095-022-0274-8"},{"key":"11_CR46","doi-asserted-by":"crossref","unstructured":"Wang, Y., Shen, X., Hu, S.X., Yuan, Y., Crowley, J.L., Vaufreydaz, D.: Self-supervised transformers for unsupervised object discovery using normalized cut. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01414"},{"issue":"10","key":"11_CR47","doi-asserted-by":"publisher","first-page":"4838","DOI":"10.1109\/TIP.2018.2836300","volume":"27","author":"X Wei","year":"2018","unstructured":"Wei, X., Yang, Q., Gong, Y., Ahuja, N., Yang, M.: Superpixel hierarchy. IEEE Trans. Image Process. 27(10), 4838\u20134849 (2018). https:\/\/doi.org\/10.1109\/TIP.2018.2836300","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR48","doi-asserted-by":"publisher","unstructured":"Xiaohan, Y., Yla-Jaaski, J., Huttunen, O., Vehkomaki, T., Sipila, O., Katila, T.: Image segmentation combining region growing and edge detection. In: IEEE International Conference on Pattern Recognition (ICPR), pp. 481\u2013484 (1992). https:\/\/doi.org\/10.1109\/ICPR.1992.202029","DOI":"10.1109\/ICPR.1992.202029"},{"key":"11_CR49","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers. In: Ranzato, M., Beygelzimer, A., Dauphin, Y.N., Liang, P., Vaughan, J.W. (eds.) Adv. Neural Inf. Process. Sys. (NeurIPS), pp. 12077\u201312090 (2021), https:\/\/proceedings.neurips.cc\/paper\/2021\/hash\/64f1f27bf1b4ec22924fd0acb550c235-Abstract.html"},{"key":"11_CR50","doi-asserted-by":"publisher","unstructured":"Yan, J., Yu, Y., Zhu, X., Lei, Z., Li, S.Z.: Object detection by labeling superpixels. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5107\u20135116 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7299146","DOI":"10.1109\/CVPR.2015.7299146"},{"key":"11_CR51","doi-asserted-by":"publisher","unstructured":"Yan, Q., Xu, L., Shi, J., Jia, J.: Hierarchical saliency detection. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1155\u20131162 (2013). https:\/\/doi.org\/10.1109\/CVPR.2013.153","DOI":"10.1109\/CVPR.2013.153"},{"key":"11_CR52","doi-asserted-by":"publisher","first-page":"4719","DOI":"10.1109\/TIP.2022.3187563","volume":"31","author":"T Yan","year":"2022","unstructured":"Yan, T., Huang, X., Zhao, Q.: Hierarchical superpixel segmentation by parallel crtrees labeling. IEEE Trans. Image Process. 31, 4719\u20134732 (2022). https:\/\/doi.org\/10.1109\/TIP.2022.3187563","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR53","doi-asserted-by":"crossref","unstructured":"Yang, C., Zhang, L., Lu, H., Ruan, X., Yang, M.H.: Saliency detection via graph-based manifold ranking. In: IEEE\/CVF International Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3166\u20133173, IEEE (2013)","DOI":"10.1109\/CVPR.2013.407"},{"key":"11_CR54","doi-asserted-by":"publisher","unstructured":"Yuan, L., et al.: Tokens-to-Token ViT: training vision transformers from scratch on imagenet. In: IEEE International Conference on Computer Vision (ICCV), pp. 538\u2013547. IEEE (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00060","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"11_CR55","doi-asserted-by":"publisher","unstructured":"Yun, S., Han, D., Chun, S., Oh, S.J., Yoo, Y., Choe, J.: Cutmix: regularization strategy to train strong classifiers with localizable features. In: IEEE International Conference on Computer Vision (ICCV), pp. 6022\u20136031. IEEE (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00612","DOI":"10.1109\/ICCV.2019.00612"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-93806-1_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,31]],"date-time":"2025-05-31T18:29:17Z","timestamp":1748716157000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-93806-1_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031938054","9783031938061"],"references-count":55,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-93806-1_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"20 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}