{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T12:28:30Z","timestamp":1780057710013,"version":"3.54.0"},"reference-count":90,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01071","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"11513-11524","source":"Crossref","is-referenced-by-count":2,"title":["Efficient Track Anything"],"prefix":"10.1109","author":[{"given":"Yunyang","family":"Xiong","sequence":"first","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chong","family":"Zhou","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaoyu","family":"Xiang","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lemeng","family":"Wu","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chenchen","family":"Zhu","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zechun","family":"Liu","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Saksham","family":"Suri","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Balakrishnan","family":"Varadarajan","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ramya Krishna","family":"Akula","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Forrest","family":"Iandola","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Raghuraman","family":"Krishnamoorthi","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bilge","family":"Soran","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vikas","family":"Chandra","sequence":"additional","affiliation":[{"name":"Meta AI Research"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_46"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15555-0_21"},{"key":"ref3","author":"Caelles","year":"2018","journal-title":"The 2018 davis challenge on video object segmentation"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.01587"},{"key":"ref5","author":"Cen","year":"2023","journal-title":"Sad: Segment any rgbd"},{"key":"ref6","volume-title":"Semantic segment anything","author":"Chen","year":"2023"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00361"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00551"},{"key":"ref11","first-page":"11781","article-title":"Rethinking space-time networks with improved memory coverage for efficient video object segmentation","volume":"34","author":"Cheng","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00127"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00304"},{"key":"ref14","author":"Cheng","year":"2023","journal-title":"Segment and track anything"},{"key":"ref15","author":"Choromanski","year":"2020","journal-title":"Rethinking attention with performers"},{"key":"ref16","article-title":"ELECTRA: Pre-training text encoders as discriminators rather than generators","author":"Clark","year":"2020","journal-title":"ICLR"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00680"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.2352\/ei.2025.37.14.coimg-132"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01850"},{"key":"ref20","author":"Ding","year":"2024","journal-title":"Sam2long: Enhancing sam 2 for long video segmentation with a training-free memory tree"},{"key":"ref21","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00585"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"ref24","article-title":"Video segmentation by nonlocal consensus voting","author":"Faktor","year":"2014","journal-title":"BMVC"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247883"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612680"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539893"},{"key":"ref29","author":"Han","year":"2023","journal-title":"Segment anything model (sam) meets glass: Mirror and transparent objects cannot be easily detected"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_18"},{"key":"ref31","author":"Homayounfar","year":"2021","journal-title":"Videoclick: Video object segmentation with a single click"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3611020\/mm1"},{"key":"ref33","first-page":"4651","article-title":"Perceiver: General perception with iterative attention","volume-title":"International conference on machine learning","author":"Jaegle"},{"key":"ref34","author":"Jiang","year":"2023","journal-title":"Restore anything pipeline: Segment anything meets image restoration"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref36","article-title":"Fractalnet: Ultra-deep neural networks without residuals","volume-title":"International Conference on Learning Representations","author":"Larsson"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1989.1.4.541"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126471"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/iccv.2013.273"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00139"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01386"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref46","article-title":"Decoupled weight decay regularization","volume-title":"International Conference on Learning Representations","author":"Loshchilov"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1038\/s41467-024-44824-z"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00932"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.223"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247743"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"ref52","author":"Pont-Tuset","year":"2017","journal-title":"The 2017 davis challenge on video object segmentation"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/jstars.2024.3490754"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00901"},{"key":"ref55","author":"Ravi","year":"2024","journal-title":"Sam 2: Segment anything in images and videos"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00743"},{"key":"ref57","first-page":"29441","article-title":"Hiera: A hierarchical vision transformer without the bells-and-whistles","volume-title":"International Conference on Machine Learning","author":"Ryali"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1117\/12.3047383"},{"key":"ref59","author":"Sun","year":"2023","journal-title":"Explain any concept: Segment anything meets concept-based explanation"},{"key":"ref60","author":"Tang","year":"2024","journal-title":"Segment any mesh: Zero-shot mesh part segmentation via lifting segment anything 2 to 3 d"},{"key":"ref61","author":"Tang","year":"2023","journal-title":"Can sam segment anything? when sam meets camouflaged object detection"},{"key":"ref62","author":"Tariq","year":"2023","journal-title":"Segment anything meets semantic communication"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299055"},{"key":"ref64","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"International conference on machine learning","author":"Touvron"},{"key":"ref65","first-page":"6000","article-title":"Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems","author":"Vaswani"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00225"},{"key":"ref67","author":"Wang","journal-title":"Linformer: Self-attention with linear complexity"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2015.7298961"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01276"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-025-00106-w"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17664"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01525"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247802"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"ref76","author":"Yang","year":"2023","journal-title":"Track anything: Segment anything meets videos"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2632"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3383592"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01387"},{"key":"ref80","author":"Yu","year":"2023","journal-title":"Inpaint anything: Segment anything meets image inpainting"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"ref82","author":"Zhang","year":"2023","journal-title":"Faster segment anything: Towards lightweight sam for mobile applications"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.87"},{"key":"ref84","author":"Zhang","year":"2023","journal-title":"Joint modeling of feature, correspondence, and a compressed memory for video object segmentation"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-47401-9_16"},{"key":"ref86","author":"Zhang","year":"2023","journal-title":"Deshadowanything: When segment anything model meets zero-shot shadow removal"},{"key":"ref87","author":"Zhang","year":"2024","journal-title":"Evf-sam: Early vision-language fusion for text-prompted segment anything model"},{"key":"ref88","author":"Zhao","year":"2023","journal-title":"Fast segment anything"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01291"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-025-00082-1"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444247.pdf?arnumber=11444247","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:29:21Z","timestamp":1777613361000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444247\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":90,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01071","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}