{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T01:10:02Z","timestamp":1755911402344,"version":"3.44.0"},"reference-count":80,"publisher":"Tsinghua University Press","issue":"3","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["623B2057,62220106003"],"award-info":[{"award-number":["623B2057,62220106003"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["Comp. Visual. Med."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.26599\/cvm.2025.9450474","type":"journal-article","created":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T13:48:42Z","timestamp":1751896122000},"page":"483-496","source":"Crossref","is-referenced-by-count":0,"title":["FastMAE: Efficient Masked Autoencoder with Offline Tokenizer"],"prefix":"10.26599","volume":"11","author":[{"given":"Meng-Hao","family":"Guo","sequence":"first","affiliation":[{"name":"Tsinghua University,Department of Computer Science,Beijing,China,100084"}]},{"given":"Chen","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Pennsylvania,Philadelphia,PA,USA,19104"}]},{"given":"Wei","family":"Liu","sequence":"additional","affiliation":[{"name":"Tencent Data Platform,Shenzhen,China,518057"}]},{"given":"Shi-Min","family":"Hu","sequence":"additional","affiliation":[{"name":"Tsinghua University,Department of Computer Science,Beijing,China,100084"}]}],"member":"11138","reference":[{"key":"ref1","article-title":"Unsupervised representation learning by predicting image rotations","author":"Gidaris","year":"2018","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref4","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020","journal-title":"arXiv preprint"},{"key":"ref5","article-title":"Bootstrap your own latent-a new approach to self-supervised learning","author":"Grill","year":"2020","journal-title":"arXiv preprint"},{"key":"ref6","article-title":"Beit: BERT pre-training of image transformers","author":"Bao","year":"2021","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref8","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018","journal-title":"arXiv preprint"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.12092"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_20"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"key":"ref12","article-title":"ConvMAE: Masked convolution meets masked autoencoders","author":"Gao","year":"2022","journal-title":"arXiv preprint"},{"key":"ref13","article-title":"HiViT: A simpler and more efficient design of hierarchical vision transformer","volume-title":"Proceedings of the 11th International Conference on Learning Representations","author":"Zhang","year":"2023"},{"key":"ref14","article-title":"Designing BERT for convolutional networks: Sparse and hierarchical masked modeling","author":"Tian","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","article-title":"Exploring long-sequence masked autoencoders","author":"Hu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3336525"},{"key":"ref17","article-title":"CMAE-V: Contrastive masked autoencoders for video action recognition","author":"Lu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01852-4"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_15"},{"key":"ref20","article-title":"iBOT: Image bert pre-training with online tokenizer","author":"Zhou","year":"2021","journal-title":"arXiv preprint"},{"key":"ref21","article-title":"Towards sustainable self-supervised learning","author":"Gao","year":"2022","journal-title":"arXiv preprint"},{"key":"ref22","article-title":"MILAN: Masked image pretraining on language assisted representation","author":"Hou","year":"2022","journal-title":"arXiv preprint"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01785"},{"key":"ref24","article-title":"Mugs: A multi-granular self-supervised learning framework","author":"Zhou","year":"2022","journal-title":"arXiv preprint"},{"key":"ref25","article-title":"Albert: A lite bert for self-supervised learning of language representations","author":"Lan","year":"2019","journal-title":"arXiv preprint"},{"key":"ref26","article-title":"ELECTRA: Pre-training text encoders as discriminators rather than generators","author":"Clark","year":"2020","journal-title":"arXiv preprint"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02240"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1394-4"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-024-0424-2"},{"key":"ref30","article-title":"Green hierarchical vision transformer for masked image modeling","author":"Huang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01855"},{"key":"ref32","article-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training","author":"Tong","year":"2022","journal-title":"arXiv preprint"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25130"},{"key":"ref34","article-title":"VICRegL: Self-supervised learning of local visual features","author":"Bardes","year":"2022","journal-title":"arXiv preprint"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0375-z"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-024-0430-4"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"ref38","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","author":"Caron","year":"2021","journal-title":"arXiv preprint"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01641"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0274-8"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-021-0229-5"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2022.3211006"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0364-2"},{"key":"ref47","article-title":"SegNeXt: Rethinking convolutional attention design for semantic segmentation","author":"Guo","year":"2022","journal-title":"arXiv preprint"},{"key":"ref48","article-title":"CoATNet: Marrying convolution and attention for all data sizes","author":"Dai","year":"2021","journal-title":"arXiv preprint"},{"key":"ref49","article-title":"Is attention better than matrix decomposition?","author":"Geng","year":"2021","journal-title":"arXiv preprint"},{"key":"ref50","first-page":"15908","article-title":"Transformer in transformer","volume-title":"Proceedings of the 35th International Conference on Neural Information Processing Systems","author":"Han","year":"2021"},{"key":"ref51","article-title":"ViTAE: Vision transformer advanced by exploring intrinsic inductive bias","author":"Xu","year":"2021","journal-title":"arXiv preprint"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref53","article-title":"R-bench: Graduate-level multi-disciplinary benchmarks for LLM & MLLM complex reasoning evaluation","author":"Guo","year":"2025","journal-title":"arXiv preprint"},{"key":"ref54","article-title":"RBench-V: A primary assessment for visual reasoning models with multi-modal outputs","author":"Guo","year":"2025","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020","journal-title":"arXiv preprint"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"ref57","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Touvron","year":"2021"},{"key":"ref58","article-title":"SegFormer: Simple and efficient design for semantic segmentation with transformers","author":"Xie","year":"2021","journal-title":"arXiv preprint"},{"key":"ref59","article-title":"UniFormer: Unified transformer for efficient spatiotemporal representation learning","author":"Li","year":"2022","journal-title":"arXiv preprint"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref61","article-title":"TransGAN: Two pure transformers can make one strong GAN, and that can scale up","author":"Jiang","year":"2021","journal-title":"arXiv preprint"},{"key":"ref62","volume-title":"Introduction to Algorithms","volume":"3","author":"Leiserson","year":"1994"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.02177"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"ref68","first-page":"8026","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Paszke","year":"2019"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-020-3097-4"},{"key":"ref70","article-title":"Decoupled weight decay regularization","author":"Loshchilov","year":"2017","journal-title":"arXiv preprint"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref73","article-title":"MMDetection: Open MMLab detection toolbox and benchmark","author":"Chen","year":"2019","journal-title":"arXiv preprint"},{"key":"ref74","article-title":"Corrupted image modeling for self-supervised visual pre-training","author":"Fang","year":"2022","journal-title":"arXiv preprint"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00212"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_7"},{"key":"ref78","first-page":"1298","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proceedings of the 39th International Conference on Machine Learning","author":"Baevski","year":"2022"},{"key":"ref79","article-title":"FastMIM: Expediting masked image modeling pre-training for vision","author":"Guo","year":"2022","journal-title":"arXiv preprint"},{"key":"ref80","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Radford","year":"2021"}],"container-title":["Computational Visual Media"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10750449\/11072531\/11072550.pdf?arnumber=11072550","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T00:52:03Z","timestamp":1755910323000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11072550\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":80,"journal-issue":{"issue":"3"},"URL":"https:\/\/doi.org\/10.26599\/cvm.2025.9450474","relation":{},"ISSN":["2096-0662","2096-0433"],"issn-type":[{"type":"electronic","value":"2096-0662"},{"type":"print","value":"2096-0433"}],"subject":[],"published":{"date-parts":[[2025,6]]}}}