{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T13:55:28Z","timestamp":1743083728506,"version":"3.40.3"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031820205"},{"type":"electronic","value":"9783031820212"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-82021-2_22","type":"book-chapter","created":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T09:45:33Z","timestamp":1740822333000},"page":"311-322","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Manifold Representation of\u00a0the\u00a0Key in\u00a0Vision Transformers"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8867-9104","authenticated-orcid":false,"given":"Li","family":"Meng","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6331-702X","authenticated-orcid":false,"given":"Morten","family":"Goodwin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7591-1659","authenticated-orcid":false,"given":"Anis","family":"Yazidi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8371-927X","authenticated-orcid":false,"given":"Paal","family":"Engelstad","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,3,1]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Arar, M., Shamir, A., Bermano, A.H.: Learned queries for efficient local attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10841\u201310852 (2022)","DOI":"10.1109\/CVPR52688.2022.01057"},{"key":"22_CR2","unstructured":"Chen, K., et al.: MMDetection: open MMLab detection toolbox and benchmark. arXiv preprint arXiv:1906.07155 (2019)"},{"key":"22_CR3","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Fan, Q., Huang, H., Chen, M., Liu, H., He, R.: RMT: retentive networks meet vision transformers. arXiv preprint arXiv:2309.11523 (2023)","DOI":"10.1109\/CVPR52733.2024.00539"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Guo, Y., Stutz, D., Schiele, B.: Robustifying token attention for vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 17557\u201317568 (2023)","DOI":"10.1109\/ICCV51070.2023.01610"},{"key":"22_CR6","doi-asserted-by":"crossref","unstructured":"Han, D., Pan, X., Han, Y., Song, S., Huang, G.: Flatten transformer: vision transformer using focused linear attention. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5961\u20135971 (2023)","DOI":"10.1109\/ICCV51070.2023.00548"},{"key":"22_CR7","unstructured":"Han, K., Xiao, A., Wu, E., Guo, J., Xu, C., Wang, Y.: Transformer in transformer (2021)"},{"key":"22_CR8","unstructured":"Hao, Z., et al.: Learning efficient vision transformers via fine-grained manifold distillation. In: Advances in Neural Information Processing Systems, vol. 35, pp. 9164\u20139175 (2022)"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., Girshick, R.: Mask R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2961\u20132969 (2017)","DOI":"10.1109\/ICCV.2017.322"},{"issue":"8","key":"22_CR10","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"22_CR11","unstructured":"Huang, C., Talbott, W., Jaitly, N., Susskind, J.M.: Efficient representation learning via adaptive context pooling. In: International Conference on Machine Learning, pp. 9346\u20139355. PMLR (2022)"},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Hyeon-Woo, N., Yu-Ji, K., Heo, B., Han, D., Oh, S.J., Oh, T.H.: Scratching visual transformer\u2019s back with uniform attention. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5807\u20135818 (2023)","DOI":"10.1109\/ICCV51070.2023.00534"},{"key":"22_CR13","doi-asserted-by":"crossref","unstructured":"Jordan, M.I.: Serial order: a parallel distributed processing approach. Adv. Psychol.\u00a0121, 471\u2013495. Elsevier (1997)","DOI":"10.1016\/S0166-4115(97)80111-2"},{"key":"22_CR14","unstructured":"Katharopoulos, A., Vyas, A., Pappas, N., Fleuret, F.: Transformers are RNNs: Fast autoregressive transformers with linear attention. In: International Conference on Machine Learning, pp. 5156\u20135165. PMLR (2020)"},{"key":"22_CR15","doi-asserted-by":"crossref","unstructured":"Konstantinidis, D., Papastratis, I., Dimitropoulos, K., Daras, P.: Multi-manifold attention for vision transformers. IEEE Access (2023)","DOI":"10.1109\/ACCESS.2023.3329952"},{"key":"22_CR16","unstructured":"Korman, E.O.: Self-supervised representation learning on manifolds. In: ICLR 2021 Workshop on Geometrical and Topological Representation Learning (2021)"},{"issue":"7553","key":"22_CR17","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun, Y., Bengio, Y., Hinton, G.: Deep learning. Nature 521(7553), 436\u2013444 (2015)","journal-title":"Nature"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Li, B., et al.: Dropkey for vision transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22700\u201322709 (2023)","DOI":"10.1109\/CVPR52729.2023.02174"},{"key":"22_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2117\u20132125 (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"22_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer v2: scaling up capacity and resolution. In: International Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"22_CR21","doi-asserted-by":"publisher","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00986","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"22_CR22","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Marin, D., Chang, J.H.R., Ranjan, A., Prabhu, A., Rastegari, M., Tuzel, O.: Token pooling in vision transformers for image classification. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 12\u201321 (2023)","DOI":"10.1109\/WACV56688.2023.00010"},{"key":"22_CR24","unstructured":"Meng, L., Goodwin, M., Yazidi, A., Engelstad, P.: State representation learning using an unbalanced atlas. arXiv preprint arXiv:2305.10267 (2023)"},{"key":"22_CR25","unstructured":"Nguyen, T.M., et al.: Improving transformers with probabilistic attention keys. In: International Conference on Machine Learning, pp. 16595\u201316621. PMLR (2022)"},{"key":"22_CR26","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems, vol. 32, pp. 8024\u20138035. Curran Associates, Inc. (2019)"},{"key":"22_CR27","unstructured":"Renggli, C., Pinto, A.S., Houlsby, N., Mustafa, B., Puigcerver, J., Riquelme, C.: Learning to merge tokens in vision transformers. arXiv preprint arXiv:2202.12015 (2022)"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Rumelhart, D.E., Hinton, G.E., Williams, R.J., et\u00a0al.: Learning internal representations by error propagation (1985)","DOI":"10.21236\/ADA164453"},{"key":"22_CR29","doi-asserted-by":"crossref","unstructured":"Shi, D.: Transnext: robust foveal visual perception for vision transformers. arXiv preprint arXiv:2311.17132 (2023)","DOI":"10.1109\/CVPR52733.2024.01683"},{"key":"22_CR30","unstructured":"Tolstikhin, I.O., et al.: MLP-mixer: an all-MLP architecture for vision. In: Advances in Neural Information Processing Systems, vol. 34, pp. 24261\u201324272 (2021)"},{"key":"22_CR31","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., Jegou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, vol.\u00a0139, pp. 10347\u201310357 (2021)"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Jegou, H.: Deit III: revenge of the ViT. arXiv preprint arXiv:2204.07118 (2022)","DOI":"10.1007\/978-3-031-20053-3_30"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 32\u201342 (2021)","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"22_CR34","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 32\u201342 (2021)","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Valeriani, L., Doimo, D., Cuturello, F., Laio, A., Ansuini, A., Cazzaniga, A.: The geometry of hidden representations of large transformer models. arXiv preprint arXiv:2302.00294 (2023)","DOI":"10.1101\/2022.10.24.513504"},{"key":"22_CR36","doi-asserted-by":"publisher","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017). https:\/\/doi.org\/10.5555\/3295222.3295349","DOI":"10.5555\/3295222.3295349"},{"key":"22_CR37","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/978-3-031-20053-3_17","volume-title":"Computer Vision - ECCV 2022","author":"P Wang","year":"2022","unstructured":"Wang, P., et al.: KVT: K-NN attention for boosting vision transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13684, pp. 285\u2013302. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20053-3_17"},{"key":"22_CR38","unstructured":"Wang, S., Li, B.Z., Khabsa, M., Fang, H., Ma, H.: Linformer: self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)"},{"key":"22_CR39","doi-asserted-by":"crossref","unstructured":"Xia, Z., Pan, X., Song, S., Li, L.E., Huang, G.: Vision transformer with deformable attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4794\u20134803 (2022)","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"22_CR40","unstructured":"Zhou, D., et al.: Refiner: refining self-attention for vision transformers. arXiv preprint arXiv:2106.03714 (2021)"}],"container-title":["Lecture Notes in Computer Science","Advances in Computer Graphics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-82021-2_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,1]],"date-time":"2025-03-01T09:45:49Z","timestamp":1740822349000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-82021-2_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031820205","9783031820212"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-82021-2_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"1 March 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CGI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Computer Graphics International Conference","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Geneva","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 July 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 July 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"41","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cgi2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.cgs-network.org\/cgi24\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}