{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:01:54Z","timestamp":1777888914772,"version":"3.51.4"},"reference-count":69,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01038","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"11153-11163","source":"Crossref","is-referenced-by-count":0,"title":["Trackverse: a Large-Scale Object-Centric Video Dataset for Image-Level Representation Learning"],"prefix":"10.1109","author":[{"given":"Yibing","family":"Wei","sequence":"first","affiliation":[{"name":"Pedro Morgado University of Wisconsin-Madison"}]},{"given":"Samuel","family":"Church","sequence":"additional","affiliation":[{"name":"Pedro Morgado University of Wisconsin-Madison"}]},{"given":"Victor","family":"Suciu","sequence":"additional","affiliation":[{"name":"Pedro Morgado University of Wisconsin-Madison"}]},{"given":"Jinhong","family":"Lin","sequence":"additional","affiliation":[{"name":"Pedro Morgado University of Wisconsin-Madison"}]},{"given":"Cheng-En","family":"Wu","sequence":"additional","affiliation":[{"name":"Pedro Morgado University of Wisconsin-Madison"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Youtube-8m: A largescale video classification benchmark","author":"Abu-El-Haija","year":"2016","journal-title":"arXiv preprint"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.13"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref4","first-page":"1298","volume-title":"Data2vec: A general framework for self-supervised learning in speech, vision and language","author":"Baevski","year":"2022"},{"key":"ref5","author":"Bertasius","year":"2021","journal-title":"Is space-time attention all you need for video understanding"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00640"},{"key":"ref7","author":"Caron","year":"2020","journal-title":"Unsupervised learning of visual features by contrasting cluster assignments"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref9","volume-title":"Pyscenedetect","author":"Castellano"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01343"},{"key":"ref11","first-page":"1597","volume-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01531-2"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_26"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2022.103406"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2496141"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00331"},{"key":"ref19","article-title":"Masked autoencoders as spatiotemporal learners","author":"Feichtenhofer","year":"2022","journal-title":"CVPR"},{"key":"ref20","author":"Gordon","year":"2020","journal-title":"Watching the world go by: Representation learning from unlabeled videos"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01842"},{"key":"ref23","author":"Grill","year":"2020","journal-title":"Bootstrap your own latent: A new approach to self-supervised learning"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00550"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00550"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1162\/089976603321192121"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298744"},{"key":"ref30","first-page":"1954519560","volume":"33","author":"Jabri","year":"2020","journal-title":"Space-time correspondence as a contrastive random walk"},{"key":"ref31","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International conference on machine learning","author":"Jia","year":"2021"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref33","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017","journal-title":"arXiv preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2516982"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref36","author":"Leal-Taix\u00e9","year":"2015","journal-title":"MOTChallenge 2015: Towards a benchmark for multi-target tracking"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref38","article-title":"Decoupled weight decay regularization","volume-title":"International Conference on Learning Representations","author":"Loshchilov","year":"2018"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"ref40","first-page":"12475","article-title":"Audiovisual instance discrimination with cross-modal agreement","volume-title":"Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","author":"Morgado","year":"2021"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.01229"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.29007\/1mjd"},{"key":"ref43","first-page":"9960","article-title":"Selfsupervised learning through the eyes of a child","volume":"33","author":"Orhan","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref44","article-title":"Self-supervised video pretraining yields human-aligned visual representations","author":"Parthasarathy","year":"2022","journal-title":"arXiv preprint"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.638"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.85"},{"key":"ref47","author":"Pont-Tuset","year":"2017","journal-title":"The 2017 davis challenge on video object segmentation"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"ref49","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021","journal-title":"arXiv preprint"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00129"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01516"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2018.8462891"},{"key":"ref53","article-title":"Pixel-level correspondence for self-supervised learning from video","author":"Sharma","year":"2022","journal-title":"arXiv preprint"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1212.0402"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00212"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01382"},{"key":"ref57","article-title":"Is imagenet worth 1 video? learning strong image encoders from 1 long unlabelled video","author":"Venkataramanan","year":"2023","journal-title":"arXiv preprint"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.320"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72933-1_1"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1162\/089976602317318938"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01003"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00498"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00174"},{"key":"ref66","article-title":"ibot: Image bert pre-training with online tokenizer","author":"Zhou","year":"2021","journal-title":"arXiv preprint"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_21"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11445545.pdf?arnumber=11445545","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:02:16Z","timestamp":1777611736000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11445545\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":69,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01038","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}