{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,14]],"date-time":"2025-10-14T11:37:39Z","timestamp":1760441859365,"version":"3.41.2"},"reference-count":129,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"8","license":[{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,1]],"date-time":"2025-08-01T00:00:00Z","timestamp":1754006400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R & D Program of China","award":["2022ZD0116500"],"award-info":[{"award-number":["2022ZD0116500"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2042","62320106010"],"award-info":[{"award-number":["U21B2042","62320106010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"2035 Innovation Program of CAS"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. 
Intell."],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1109\/tpami.2025.3557001","type":"journal-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T17:54:32Z","timestamp":1744221272000},"page":"6200-6214","source":"Crossref","is-referenced-by-count":1,"title":["Bootstrap Masked Visual Modeling via Hard Patch Mining"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2333-1844","authenticated-orcid":false,"given":"Haochen","family":"Wang","sequence":"first","affiliation":[{"name":"New Laboratory of Pattern Recognition, State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6989-2711","authenticated-orcid":false,"given":"Junsong","family":"Fan","sequence":"additional","affiliation":[{"name":"New Laboratory of Pattern Recognition, State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1579-2357","authenticated-orcid":false,"given":"Yuxi","family":"Wang","sequence":"additional","affiliation":[{"name":"New Laboratory of Pattern Recognition, State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8999-2680","authenticated-orcid":false,"given":"Kaiyou","family":"Song","sequence":"additional","affiliation":[{"name":"Megvii Technology, Beijing, China"}]},{"given":"Tiancai","family":"Wang","sequence":"additional","affiliation":[{"name":"Megvii Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2138-4608","authenticated-orcid":false,"given":"Xiangyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Megvii Technology, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2648-3875","authenticated-orcid":false,"given":"Zhaoxiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"New Laboratory of Pattern Recognition, State Key Laboratory of Multimodal Artificial Intelligence Systems, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01000"},{"article-title":"DropPos: Pre-training vision transformers by reconstructing dropped positions","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref9","article-title":"Improving language understanding by generative pre-training","author":"Radford","year":"2018","journal-title":"OpenAI"},{"key":"ref10","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019","journal-title":"OpenAI"},{"article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown","key":"ref11"},{"article-title":"BEiT: BERT pre-training of image transformers","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Bao","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"article-title":"Masked autoencoders as spatiotemporal learners","volume-title":"Proc. Adv. Neural Inf. 
Process. Syst.","author":"Feichtenhofer","key":"ref15"},{"article-title":"VideoMAE: Masked autoencoders are data-efficient learners for self-supervised video pre-training","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Tong","key":"ref16"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Dosovitskiy","key":"ref17"},{"article-title":"Uniform masking: Enabling mae pre-training for pyramid-based vision transformers with locality","year":"2022","author":"Li","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01241"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00517"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"article-title":"The kinetics human action video dataset","year":"2017","author":"Kay","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"article-title":"Representation learning with contrastive predictive 
coding","year":"2018","author":"Oord","key":"ref28"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00421"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02285-3"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02229-x"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2019.00025"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/104"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"ref36","first-page":"843","article-title":"Unsupervised learning of video representations using LSTMs","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Srivastava"},{"article-title":"Deep predictive coding networks for video prediction and unsupervised learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Lotter","key":"ref37"},{"article-title":"Deep multi-scale video prediction beyond mean square error","volume-title":"Proc. Int. Conf. Learn. 
Representations","author":"Mathieu","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.18"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_51"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.13"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.320"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.638"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00267"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.607"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.79"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00840"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00129"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01426"},{"article-title":"Image BERT pre-training with online tokenizer","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Zhou","key":"ref56"},{"article-title":"Siamese masked autoencoders","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Gupta","key":"ref57"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390294"},{"key":"ref59","first-page":"3371","article-title":"Stacked denoising autoencoders: Learning useful representations in a deep network with a local denoising criterion","volume":"11","author":"Vincent","year":"2010","journal-title":"J. Mach. Learn. Res."},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_35"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.76"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.96"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1126\/science.1127647"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"article-title":"Discrete variational autoencoders","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Rolfe","key":"ref68"},{"article-title":"MixMIM: Mixed and masked image modeling for efficient visual representation learning","year":"2022","author":"Liu","key":"ref69"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25252"},{"article-title":"Masked frequency modeling for self-supervised visual pre-training","year":"2022","author":"Xie","key":"ref72"},{"key":"ref73","first-page":"1298","article-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. Int. Conf. Mach. 
Learn.","author":"Baevski"},{"article-title":"Masked image modeling with denoising contrast","year":"2022","author":"Yi","key":"ref74"},{"article-title":"Extreme masking for learning instance and distributed visual representations","year":"2022","author":"Wu","key":"ref75"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_15"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01826"},{"key":"ref79","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01003"},{"article-title":"MaskViT: Masked visual pre-training for video prediction","year":"2022","author":"Gupta","key":"ref81"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01516"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_18"},{"key":"ref84","first-page":"20026","article-title":"Adversarial masking for self-supervised learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Shi"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"article-title":"SemMAE: Semantic-guided masking for learning masked autoencoders","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Li","key":"ref86"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00983"},{"article-title":"MOFO: Motion focused self-supervision for video understanding","year":"2023","author":"Ahmadian","key":"ref89"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1002\/0470847832"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1613\/jair.301"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01394"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00248"},{"article-title":"Exploring target representations for masked autoencoders","year":"2022","author":"Liu","key":"ref95"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.00604"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01852-4"},{"article-title":"Accurate, large minibatch SGD: Training imagenet in 1 hour","year":"2017","author":"Goyal","key":"ref98"},{"article-title":"Mixup: Beyond empirical risk minimization","volume-title":"Proc. Int. Conf. Learn. 
Representations","author":"Zhang","key":"ref99"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00612"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"article-title":"Benchmarking detection transfer learning with vision transformers","year":"2021","author":"Li","key":"ref106"},{"article-title":"Detectron2","year":"2019","author":"Wu","key":"ref107"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.544"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-014-0733-5"},{"year":"2020","key":"ref111","article-title":"MMSegmentation: OpenMMLab semantic segmentation toolbox and benchmark"},{"article-title":"Decoupled weight decay regularization","year":"2017","author":"Loshchilov","key":"ref112"},{"article-title":"Unsupervised learning of visual features by contrasting cluster assignments","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Caron","key":"ref113"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.2307\/2331554"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref121","first-page":"12493","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Patrick"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.324"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.89"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018577"},{"article-title":"Learning from future: A novel self-training framework for semantic segmentation","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","author":"Du","key":"ref126"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01874"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3051099"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00990"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11068886\/10960509.pdf?arnumber=10960509","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T22:56:59Z","timestamp":1751929019000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10960509\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8]]},"references-count":129,"journal-issue":{"issue":"8"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3557001","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2025,8]]}}}