{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,8]],"date-time":"2025-05-08T12:43:25Z","timestamp":1746708205088,"version":"3.40.3"},"publisher-location":"Cham","reference-count":62,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729485"},{"type":"electronic","value":"9783031729492"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72949-2_20","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:22:17Z","timestamp":1730301737000},"page":"345-362","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["SpecFormer: Guarding Vision Transformer Robustness via\u00a0Maximum Singular Value Penalization"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3899-9791","authenticated-orcid":false,"given":"Xixu","family":"Hu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3120-5466","authenticated-orcid":false,"given":"Runkai","family":"Zheng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4833-0880","authenticated-orcid":false,"given":"Jindong","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3911-9055","authenticated-orcid":false,"given":"Cheuk Hang","family":"Leung","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4028-981X","authenticated-orcid":false,"given":"Qi","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8608-8482","authenticated-orcid":false,"given":"Xing","family":"Xie","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"20_CR1","first-page":"20014","volume":"34","author":"A Ali","year":"2021","unstructured":"Ali, A., et al.: XCiT: Cross-covariance image transformers. Adv. Neural. Inf. Process. Syst. 34, 20014\u201320027 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: ViViT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Bai, T., Luo, J., Zhao, J., Wen, B., Wang, Q.: Recent advances in adversarial training for adversarial robustness. In: IJCAI Survey Track (2021)","DOI":"10.24963\/ijcai.2021\/591"},{"key":"20_CR4","first-page":"26831","volume":"34","author":"Y Bai","year":"2021","unstructured":"Bai, Y., Mei, J., Yuille, A.L., Xie, C.: Are transformers more robust than CNNs? Adv. Neural. Inf. Process. Syst. 34, 26831\u201326843 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Ben-David, S., Blitzer, J., Crammer, K., Pereira, F.: Analysis of representations for domain adaptation. Adv. Neural. Inf. Process. Syst. 19 (2006)","DOI":"10.7551\/mitpress\/7503.003.0022"},{"key":"20_CR6","doi-asserted-by":"crossref","unstructured":"Bhojanapalli, S., Chakrabarti, A., Glasner, D., Li, D., Unterthiner, T., Veit, A.: Understanding robustness of transformers for image classification. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10231\u201310241 (2021)","DOI":"10.1109\/ICCV48922.2021.01007"},{"key":"20_CR7","unstructured":"Burden, R.L., Faires, J.D., Burden, A.M.: Numerical Analysis. Cengage Learning (2015)"},{"key":"20_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part I. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Carlini, N., Wagner, D.: Towards evaluating the robustness of neural networks. In: 2017 IEEE Symposium on Security and Privacy (SP), pp. 39\u201357. IEEE (2017)","DOI":"10.1109\/SP.2017.49"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Chen, P.Y., Sharma, Y., Zhang, H., Yi, J., Hsieh, C.J.: EAD: elastic-net attacks to deep neural networks via adversarial examples. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11302"},{"key":"20_CR11","unstructured":"Cisse, M., Bojanowski, P., Grave, E., Dauphin, Y., Usunier, N.: Parseval networks: improving robustness to adversarial examples. In: International Conference on Machine Learning, pp. 854\u2013863. PMLR (2017)"},{"key":"20_CR12","unstructured":"Croce, F., Hein, M.: Reliable evaluation of adversarial robustness with an ensemble of diverse parameter-free attacks. In: International Conference on Machine Learning, pp. 2206\u20132216. PMLR (2020)"},{"key":"20_CR13","unstructured":"Dasoulas, G., Scaman, K., Virmaux, A.: Lipschitz normalization for self-attention layers with application to graph neural networks. In: International Conference on Machine Learning, pp. 2456\u20132466. PMLR (2021)"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Debenedetti, E., Sehwag, V., Mittal, P.: A light recipe to train robust vision transformers. arXiv preprint arXiv:2209.07399 (2022)","DOI":"10.1109\/SaTML54575.2023.00024"},{"key":"20_CR15","unstructured":"Dehghani, M., et\u00a0al.: Scaling vision transformers to 22 billion parameters. arXiv preprint arXiv:2302.05442 (2023)"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"20_CR17","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2020)"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"d\u2019Ascoli, S., Touvron, H., Leavitt, M.L., Morcos, A.S., Biroli, G., Sagun, L.: ConViT: improving vision transformers with soft convolutional inductive biases. In: International Conference on Machine Learning, pp. 2286\u20132296. PMLR (2021)","DOI":"10.1088\/1742-5468\/ac9830"},{"key":"20_CR19","unstructured":"Federer, H.: Geometric Measure Theory. Classics in Mathematics. Springer, Heidelberg (1969)"},{"key":"20_CR20","unstructured":"Fu, Y., Zhang, S., Wu, S., Wan, C., Lin, Y.: Patch-fool: are vision transformers always robust against adversarial perturbations? In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"20_CR21","unstructured":"Goodfellow, I.J., Shlens, J., Szegedy, C.: Explaining and harnessing adversarial examples. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR23","unstructured":"Hein, M., Andriushchenko, M.: Formal guarantees on the robustness of a classifier against adversarial manipulation. Adv. Neural. Inf. Process. Syst. 30 (2017)"},{"key":"20_CR24","unstructured":"Howard, J.: ImageNette (2019). https:\/\/github.com\/fastai\/imagenette"},{"key":"20_CR25","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1007\/978-3-030-13453-2_2","volume-title":"ECML PKDD 2018 Workshops","author":"T Huster","year":"2019","unstructured":"Huster, T., Chiang, C.-Y.J., Chadha, R.: Limitations of the Lipschitz constant as a defense against adversarial examples. In: Alzate, C., et al. (eds.) ECML PKDD 2018. LNCS (LNAI), vol. 11329, pp. 16\u201329. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-13453-2_2"},{"key":"20_CR26","unstructured":"Kim, H., Papamakarios, G., Mnih, A.: The Lipschitz constant of self-attention. In: International Conference on Machine Learning, pp. 5562\u20135571. PMLR (2021)"},{"key":"20_CR27","unstructured":"Krizhevsky, A., Hinton, G., et\u00a0al.: Learning multiple layers of features from tiny images (2009)"},{"key":"20_CR28","unstructured":"Leino, K., Wang, Z., Fredrikson, M.: Globally-robust neural networks. In: International Conference on Machine Learning, pp. 6212\u20136222. PMLR (2021)"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Lovisotto, G., Finnie, N., Munoz, M., Mummadi, C.K., Metzen, J.H.: Give me your attention: dot-product attention considered harmful for adversarial patch robustness. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15234\u201315243 (2022)","DOI":"10.1109\/CVPR52688.2022.01480"},{"key":"20_CR31","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"20_CR32","unstructured":"Madry, A., Makelov, A., Schmidt, L., Tsipras, D., Vladu, A.: Towards deep learning models resistant to adversarial attacks. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"20_CR33","unstructured":"Mo, Y., Wu, D., Wang, Y., Guo, Y., Wang, Y.: When adversarial training meets vision transformers: recipes from training to architecture. In: Advances in Neural Information Processing Systems (NeurIPS) (2022)"},{"key":"20_CR34","doi-asserted-by":"publisher","unstructured":"Murdock, J.A.: Perturbations: Theory and Methods. Society for Industrial and Applied Mathematics (1999). https:\/\/doi.org\/10.1137\/1.9781611971095","DOI":"10.1137\/1.9781611971095"},{"key":"20_CR35","first-page":"23296","volume":"34","author":"MM Naseer","year":"2021","unstructured":"Naseer, M.M., Ranasinghe, K., Khan, S.H., Hayat, M., Shahbaz Khan, F., Yang, M.H.: Intriguing properties of vision transformers. Adv. Neural. Inf. Process. Syst. 34, 23296\u201323308 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Nguyen, A., Yosinski, J., Clune, J.: Deep neural networks are easily fooled: high confidence predictions for unrecognizable images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 427\u2013436 (2015)","DOI":"10.1109\/CVPR.2015.7298640"},{"key":"20_CR37","unstructured":"Pang, T., Lin, M., Yang, X., Zhu, J., Yan, S.: Robustness and accuracy could be reconcilable by (proper) definition. In: International Conference on Machine Learning, pp. 17258\u201317277. PMLR (2022)"},{"key":"20_CR38","unstructured":"Papernot, N., McDaniel, P., Sinha, A., Wellman, M.: Towards the science of security and privacy in machine learning. arXiv preprint arXiv:1611.03814 (2016)"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Papernot, N., McDaniel, P., Wu, X., Jha, S., Swami, A.: Distillation as a defense to adversarial perturbations against deep neural networks. In: 2016 IEEE Symposium on Security and Privacy (SP), pp. 582\u2013597. IEEE (2016)","DOI":"10.1109\/SP.2016.41"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Paul, S., Chen, P.Y.: Vision transformers are robust learners. In: Proceedings of the AAAI Conference on Artificial Intelligence, no. 2, pp. 2071\u20132081 (2022)","DOI":"10.1609\/aaai.v36i2.20103"},{"key":"20_CR41","unstructured":"Qi, X., Wang, J., Chen, Y., Shi, Y., Zhang, L.: LipsFormer: introducing Lipschitz continuity to vision transformers. In: International Conference on Learning Representations (ICLR) (2023)"},{"key":"20_CR42","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"20_CR43","unstructured":"Shao, R., Shi, Z., Yi, J., Chen, P.Y., Hsieh, C.J.: On the adversarial robustness of vision transformers. arXiv preprint arXiv:2103.15670 (2021)"},{"key":"20_CR44","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7262\u20137272 (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"20_CR47","unstructured":"Szegedy, C., et al.: Intriguing properties of neural networks. arXiv preprint arXiv:1312.6199 (2013)"},{"key":"20_CR48","unstructured":"Takase, S., Kiyono, S., Kobayashi, S., Suzuki, J.: On layer normalizations and residual connections in transformers. arXiv preprint arXiv:2206.00330 (2022)"},{"key":"20_CR49","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: International Conference on Machine Learning, pp. 10347\u201310357. PMLR (2021)"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 32\u201342 (2021)","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"20_CR51","unstructured":"Tram\u00e8r, F., Kurakin, A., Papernot, N., Goodfellow, I., Boneh, D., McDaniel, P.: Ensemble adversarial training: attacks and defenses. In: Ensemble adversarial training: Attacks and defenses (2018)"},{"key":"20_CR52","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural. Inf. Process. Syst. 30 (2017)"},{"key":"20_CR53","unstructured":"Wang, H., Deng, Y., Yoo, S., Lin, Y.: Exploring robust features for improving adversarial robustness. arXiv preprint arXiv:2309.04650 (2023)"},{"key":"20_CR54","unstructured":"Wang, H., Ma, S., Dong, L., Huang, S., Zhang, D., Wei, F.: DeepNet: scaling transformers to 1,000 layers. arXiv preprint arXiv:2203.00555 (2022)"},{"key":"20_CR55","unstructured":"Wang, Z., Bai, Y., Zhou, Y., Xie, C.: Can CNNs be more robust than transformers? arXiv preprint arXiv:2206.03452 (2022)"},{"key":"20_CR56","unstructured":"Xie, C., Wang, J., Zhang, Z., Ren, Z., Yuille, A.: Mitigating adversarial effects through randomization. In: International Conference on Learning Representations (ICLR) (2018)"},{"key":"20_CR57","unstructured":"Xu, T., Chen, W., Wang, P., Wang, F., Li, H., Jin, R.: CDTrans: cross-domain transformer for unsupervised domain adaptation. In: International Conference on Learning Representations (ICLR) (2022)"},{"key":"20_CR58","unstructured":"Yoshida, Y., Miyato, T.: Spectral norm regularization for improving the generalizability of deep learning. arXiv preprint arXiv:1705.10941 (2017)"},{"key":"20_CR59","unstructured":"Zhang, H., Yu, Y., Jiao, J., Xing, E., El\u00a0Ghaoui, L., Jordan, M.: Theoretically principled trade-off between robustness and accuracy. In: International Conference on Machine Learning, pp. 7472\u20137482. PMLR (2019)"},{"key":"20_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, H., Zhang, P., Hsieh, C.J.: RecurJac: an efficient recursive algorithm for bounding Jacobian matrix of neural networks and its applications. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 5757\u20135764 (2019)","DOI":"10.1609\/aaai.v33i01.33015757"},{"key":"20_CR61","doi-asserted-by":"crossref","unstructured":"Zheng, S., Song, Y., Leung, T., Goodfellow, I.: Improving the robustness of deep neural networks via stability training. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4480\u20134488 (2016)","DOI":"10.1109\/CVPR.2016.485"},{"key":"20_CR62","unstructured":"Zhou, D., et al.: Understanding the robustness in vision transformers. In: International Conference on Machine Learning, pp. 27378\u201327394. PMLR (2022)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72949-2_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:50:16Z","timestamp":1730303416000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72949-2_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031729485","9783031729492"],"references-count":62,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72949-2_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}