{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:41:21Z","timestamp":1755823281446,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"JSPS KAKENHI","award":["JP23H00490"],"award-info":[{"award-number":["JP23H00490"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612060","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"2684-2693","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Scale-space Tokenization for Improving the Robustness of Vision Transformers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-0570-7008","authenticated-orcid":false,"given":"Lei","family":"Xu","sequence":"first","affiliation":[{"name":"Tokyo Institute of Technology, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2342-3324","authenticated-orcid":false,"given":"Rei","family":"Kawakami","sequence":"additional","affiliation":[{"name":"Tokyo Institute of Technology, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9761-4142","authenticated-orcid":false,"given":"Nakamasa","family":"Inoue","sequence":"additional","affiliation":[{"name":"Tokyo Institute of Technology, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICEngTechnol.2017.8308186"},{"key":"e_1_3_2_1_2_1","first-page":"1","article-title":"Why do deep convolutional networks generalize so poorly to small image transformations","volume":"20","author":"Azulay Aharon","year":"2019","unstructured":"Aharon Azulay and Yair Weiss. 2019. Why do deep convolutional networks generalize so poorly to small image transformations? J. Mach. Learn. Res. 20 (2019), no. 184, pp. 1--25.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_3_1","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS).","author":"Bai Yutong","year":"2021","unstructured":"Yutong Bai, Jieru Mei, Alan L Yuille, and Cihang Xie. 2021. Are Transformers More Robust than CNNs?. In Proc. Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01007"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1282280.1282340"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCOM.1983.1095851"},{"key":"e_1_3_2_1_7_1","volume-title":"Proc. International Conference on Machine Learning (ICML).","author":"Ascoli St\u00e9phane","year":"2021","unstructured":"St\u00e9phane d'Ascoli, Hugo Touvron, Matthew Leavitt, Ari Morcos, Giulio Biroli, and Levent Sagun. 2021. Convit: Improving vision transformers with soft convolutional inductive biases. In Proc. International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547826"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2300479"},{"key":"e_1_3_2_1_11_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An image is worth 16x16 words: Transformers for image recognition at scale. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413808"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00257-z"},{"key":"e_1_3_2_1_15_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Geirhos Robert","year":"2019","unstructured":"Robert Geirhos, Patricia Rubisch, Claudio Michaelis, Matthias Bethge, Felix A. Wichmann, and Wieland Brendel. 2019. ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_16_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Goodfellow Ian J.","year":"2015","unstructured":"Ian J. Goodfellow, Jonathon Shlens, and Christian Szegedy. 2015. Explaining and Harnessing Adversarial Examples. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_24"},{"key":"e_1_3_2_1_18_1","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS).","author":"Han Kai","year":"2021","unstructured":"Kai Han, An Xiao, EnhuaWu, Jianyuan Guo, Chunjing Xu, and YunheWang. 2021. Transformer in transformer. In Proc. Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547815"},{"key":"e_1_3_2_1_21_1","volume-title":"Proc. International Conference on Computer Vision (ICCV).","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Steven Basart, Norman Mu, Saurav Kadavath, FrankWang, Evan Dorundo, Rahul Desai, Tyler Zhu, Samyak Parajuli, Mike Guo, et al. 2020. The Many Faces of Robustness: A Critical Analysis of Out-of-Distribution Generalization. In Proc. International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_22_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Hendrycks Dan","year":"2019","unstructured":"Dan Hendrycks and Thomas Dietterich. 2019. Benchmarking Neural Network Robustness to Common Corruptions and Perturbations. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Norman Mu, Ekin D Cubuk, Barret Zoph, Justin Gilmer, and Balaji Lakshminarayanan. 2020. AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Dan Hendrycks Kevin Zhao Steven Basart Jacob Steinhardt and Dawn Song. 2021. Natural Adversarial Examples. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01172"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3469877.3497692"},{"key":"e_1_3_2_1_27_1","volume-title":"Proc. International Conference on Machine Learning (ICML).","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. In Proc. International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3505244"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475332"},{"volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Diederik","key":"e_1_3_2_1_30_1","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF00336961"},{"key":"e_1_3_2_1_32_1","unstructured":"Alex Krizhevsky and Geoffrey Hinton. 2009. Learning Multiple Layers of Features from Tiny Images. Technical Report."},{"volume-title":"Proc. Conference on Neural Information Processing Systems (NIPS).","author":"Krizhevsky Alex","key":"e_1_3_2_1_33_1","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Proc. Conference on Neural Information Processing Systems (NIPS)."},{"volume-title":"Proc. Computer Vision and Pattern Recognition (CVPR).","author":"Lazebnik S.","key":"e_1_3_2_1_34_1","unstructured":"S. Lazebnik, C. Schmid, and J. Ponce. 2006. Beyond Bags of Features: Spatial Pyramid Matching for Recognizing Natural Scene Categories. In Proc. Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548770"},{"key":"e_1_3_2_1_36_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Li Yingwei","year":"2021","unstructured":"Yingwei Li, Qihang Yu, Mingxing Tan, Jieru Mei, Peng Tang, Wei Shen, Alan Yuille, and Cihang Xie. 2021. Shape-Texture Debiased Neural Network Training. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_37_1","volume-title":"LocalViT: Bringing Locality to Vision Transformers. CoRR","author":"Li Yawei","year":"2021","unstructured":"Yawei Li, Kai Zhang, Jiezhang Cao, Radu Timofte, and Luc Van Gool. 2021. LocalViT: Bringing Locality to Vision Transformers. CoRR (2021). arXiv:2104.05707"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1080\/757582976"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_40_1","first-page":"23415","article-title":"Distinctive Image Features from Scale-Invariant Keypoints","volume":"78","author":"Lowe David G.","year":"2004","unstructured":"David G. Lowe. 2004. Distinctive Image Features from Scale-Invariant Keypoints. Springer International Journal of Computer Vision 78 (2004), pp. 23415--23442.","journal-title":"Springer International Journal of Computer Vision"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485133"},{"key":"e_1_3_2_1_42_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Madry Aleksander","year":"2018","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2018. Towards Deep Learning Models Resistant to Adversarial Attacks. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_43_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Madry Aleksander","year":"2018","unstructured":"Aleksander Madry, Aleksandar Makelov, Ludwig Schmidt, Dimitris Tsipras, and Adrian Vladu. 2018. Towards deep learning models resistant to adversarial attacks. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00774"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01173"},{"key":"e_1_3_2_1_46_1","unstructured":"D. Marr and E. Hildreth. 1980. Theory of Edge Detection. In In Proc. Royal Society of London. Series B Biological Sciences."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20103"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547954"},{"key":"e_1_3_2_1_50_1","volume-title":"Proc. European Conference on Computer Vision (ECCV).","author":"Rusak Evgenia","year":"2020","unstructured":"Evgenia Rusak, Lukas Schott, Roland S Zimmermann, Julian Bitterwolf, Oliver Bringmann, Matthias Bethge, and Wieland Brendel. 2020. A SimpleWay to Make Neural Networks Robust Against Diverse Image Corruptions. In Proc. European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548340"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_53_1","volume-title":"Proc. International Conference on Machine Learning (ICML).","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. In Proc. International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_54_1","volume-title":"Proc. International Conference on Machine Learning (ICML).","author":"Touvron Hugo","year":"2020","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herv\u00e9 J\u00e9gou. 2020. Training data-efficient image transformers & distillation through attention. Proc. International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_55_1","volume-title":"Proc. Computer Vision and Pattern Recognition (CVPR).","author":"Tsung-Yi Lin","year":"2017","unstructured":"Lin Tsung-Yi, Piotr Doll\u00e1r, Ross Girshick, Kaiming He, Bharath Hariharan, and Serge Belongie. 2017. Feature Pyramid Networks for Object Detection. In Proc. Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_56_1","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS).","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. In Proc. Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_57_1","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS).","author":"Wang Haohan","year":"2019","unstructured":"Haohan Wang, Songwei Ge, Eric P Xing, and Zachary C Lipton. 2019. Learning Robust Global Representations by Penalizing Local Predictive Power. In Proc. Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547989"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10032-020-00360-2"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19778-9_18"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"e_1_3_2_1_63_1","volume-title":"Proc. International Conference on Learning Representations (ICLR).","author":"Xinlong Wang Chunhua Shen Bo Zhang","year":"2023","unstructured":"Bo Zhang Xinlong Wang Chunhua Shen Xiangxiang Chu, Zhi Tian. 2023. Conditional Positional Encodings for Vision Transformers. In Proc. International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413543"},{"key":"e_1_3_2_1_66_1","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS).","author":"Yin Dong","year":"2019","unstructured":"Dong Yin, Raphael Gontijo Lopes, Jonathon Shlens, Ekin Dogus Cubuk, and Justin Gilmer. 2019. A Fourier Perspective on Model Robustness in Computer Vision. In Proc. Advances in Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00062"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547913"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475731.3484955"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547801"},{"key":"e_1_3_2_1_72_1","volume-title":"Proc. International Conference on Machine Learning (ICML).","author":"Zhang Richard","year":"2019","unstructured":"Richard Zhang. 2019. Making Convolutional Networks Shift-Invariant Again. In Proc. International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475724.3483609"},{"volume-title":"Proc. International Conference on Machine Learning (ICML).","author":"Zhou Daquan","key":"e_1_3_2_1_74_1","unstructured":"Daquan Zhou, Zhiding Yu, Enze Xie, Chaowei Xiao, Animashree Anandkumar, Jiashi Feng, and Jose M. Alvarez. 2022. Understanding The Robustness in Vision Transformers. In Proc. International Conference on Machine Learning (ICML)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612060","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612060","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:35Z","timestamp":1755821015000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612060"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":74,"alternative-id":["10.1145\/3581783.3612060","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612060","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}