{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T09:29:11Z","timestamp":1780392551073,"version":"3.54.1"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031733468","type":"print"},{"value":"9783031733475","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,29]],"date-time":"2024-10-29T00:00:00Z","timestamp":1730160000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73347-5_21","type":"book-chapter","created":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T09:15:43Z","timestamp":1730106943000},"page":"367-385","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":33,"title":["DINO-Tracker: Taming DINO for\u00a0Self-supervised Point Tracking in\u00a0a\u00a0Single Video"],"prefix":"10.1007","author":[{"given":"Narek","family":"Tumanyan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Assaf","family":"Singer","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shai","family":"Bagon","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tali","family":"Dekel","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,10,29]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Aflalo, A., Bagon, S., Kashti, T., Eldar, Y.C.: Deepcut: unsupervised segmentation using graph neural networks clustering. In: 2023 IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW), pp. 32\u201341 (2022)","DOI":"10.1109\/ICCVW60793.2023.00010"},{"key":"21_CR2","unstructured":"Amir, S., Gandelsman, Y., Bagon, S., Dekel, T.: Deep vit features as dense visual descriptors. In: ECCVW What is Motion For? (2022)"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Bian, Z., Jabri, A., Efros, A.A., Owens, A.: Learning pixel trajectories with multiscale contrastive random walks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6508\u20136519 (2022)","DOI":"10.1109\/CVPR52688.2022.00640"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Biggs, B., Roddick, T., Fitzgibbon, A., Cipolla, R.: Creatures great and SMAL: recovering the shape and motion of animals from video. In: ACCV (2018)","DOI":"10.1007\/978-3-030-20873-8_1"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Black, M.J., Anandan, P.: A framework for the robust estimation of optical flow. In: 1993 (4th) International Conference on Computer Vision, pp. 231\u2013236 (1993)","DOI":"10.1109\/ICCV.1993.378214"},{"key":"21_CR6","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1023\/B:VISI.0000045324.43199.43","volume":"61","author":"A Bruhn","year":"2005","unstructured":"Bruhn, A., Weickert, J., Schn\u00f6rr, C.: Lucas\/kanade meets horn\/schunck: combining local and global optic flow methods. Int. J. Comput. Vis. 61, 211\u2013231 (2005)","journal-title":"Int. J. Comput. Vis."},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4724\u20134733 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"21_CR9","doi-asserted-by":"crossref","unstructured":"Chang, J., Wei, D., III, J.W.F.: A video representation using temporal superpixels. In: 2013 IEEE Conference on Computer Vision and Pattern Recognition, pp. 2051\u20132058 (2013)","DOI":"10.1109\/CVPR.2013.267"},{"key":"21_CR10","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. arXiv preprint arXiv:2002.05709 (2020)"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"Dekel, T., Oron, S., Rubinstein, M., Avidan, S., Freeman, W.T.: Best-buddies similarity for robust template matching. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2021\u20132029 (2015)","DOI":"10.1109\/CVPR.2015.7298813"},{"key":"21_CR12","unstructured":"Doersch, C., et al.: Tap-vid: a benchmark for tracking any point in a video. In: NeurIPS Datasets Track (2022)"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Doersch, C., et al.: Tapir: tracking any point with per-frame initialization and temporal refinement. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00923"},{"key":"21_CR14","unstructured":"Dosovitskiy, A., et al.: An image is worth 16$$\\,\\times \\,$$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, A., et al.: FlowNet: learning optical flow with convolutional networks. In: IEEE International Conference on Computer Vision (ICCV), pp. 2758\u20132766 (2015)","DOI":"10.1109\/ICCV.2015.316"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Gupta, K., et al.: ASIC: aligning sparse image collections. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00382"},{"key":"21_CR17","unstructured":"Hamilton, M., Zhang, Z., Hariharan, B., Snavely, N., Freeman, W.T.: Unsupervised semantic segmentation by distilling feature correspondences. In: International Conference on Learning Representations (2022)"},{"key":"21_CR18","doi-asserted-by":"crossref","unstructured":"Harley, A.W., Fang, Z., Fragkiadaki, K.: Particle video revisited: tracking through occlusions using point trajectories. In: ECCV (2022)","DOI":"10.1007\/978-3-031-20047-2_4"},{"issue":"1","key":"21_CR19","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1016\/0004-3702(81)90024-2","volume":"17","author":"BK Horn","year":"1981","unstructured":"Horn, B.K., Schunck, B.G.: Determining optical flow. Artif. Intell. 17(1), 185\u2013203 (1981)","journal-title":"Artif. Intell."},{"key":"21_CR20","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: International Conference on Learning Representations (2022)"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: FlowFormer: a transformer architecture for optical flow. arXiv abs\/2203.16194 (2022)","DOI":"10.1007\/978-3-031-19790-1_40"},{"key":"21_CR22","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1214\/aoms\/1177703732","volume":"35","author":"PJ Huber","year":"1964","unstructured":"Huber, P.J.: Robust estimation of a location parameter. Ann. Math. Stat. 35, 492\u2013518 (1964)","journal-title":"Ann. Math. Stat."},{"key":"21_CR23","doi-asserted-by":"crossref","unstructured":"Ilg, E., Mayer, N., Saikia, T., Keuper, M., Dosovitskiy, A., Brox, T.: FlowNet 2.0: evolution of optical flow estimation with deep networks. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1647\u20131655 (2016)","DOI":"10.1109\/CVPR.2017.179"},{"key":"21_CR24","unstructured":"Jabri, A., Owens, A., Efros, A.A.: Space-time correspondence as a contrastive random walk. In: Advances in Neural Information Processing Systems (2020)"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Karaev, N., Rocco, I., Graham, B., Neverova, N., Vedaldi, A., Rupprecht, C.: CoTracker: it is better to track together. arXiv:2307.07635 (2023)","DOI":"10.1007\/978-3-031-73033-7_2"},{"key":"21_CR26","unstructured":"Li, X., Liu, S., De\u00a0Mello, S., Wang, X., Kautz, J., Yang, M.H.: Joint-task self-supervised learning for temporal correspondence. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"21_CR27","doi-asserted-by":"publisher","first-page":"978","DOI":"10.1109\/TPAMI.2010.147","volume":"33","author":"C Liu","year":"2011","unstructured":"Liu, C., Yuen, J., Torralba, A.: Sift flow: dense correspondence across scenes and its applications. IEEE Trans. Pattern Anal. Mach. Intell. 33, 978\u2013994 (2011)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"91\u2013110","key":"21_CR28","first-page":"2","volume":"2","author":"G Lowe","year":"2004","unstructured":"Lowe, G.: Sift-the scale invariant feature transform. Int. J. 2(91\u2013110), 2 (2004)","journal-title":"Int. J."},{"key":"21_CR29","unstructured":"Lucas, B.D., Kanade, T.: An iterative image registration technique with an application to stereo vision. In: IJCAI 1981, Proceedings of the 7th International Joint Conference on Artificial Intelligence, vol. 2, pp. 674\u2013679. Morgan Kaufmann Publishers Inc. (1981)"},{"issue":"11","key":"21_CR30","first-page":"2579","volume":"9","author":"L Van der Maaten","year":"2008","unstructured":"Van der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11), 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."},{"key":"21_CR31","doi-asserted-by":"crossref","unstructured":"Mariotti, O., Aodha, O.M., Bilen, H.: Improving semantic correspondence with viewpoint-guided spherical maps. arXiv:2312.13216 (2023)","DOI":"10.1109\/CVPR52733.2024.01846"},{"key":"21_CR32","doi-asserted-by":"crossref","unstructured":"Melas-Kyriazi, L., Rupprecht, C., Laina, I., Vedaldi, A.: Deep spectral methods: a surprisingly strong baseline for unsupervised semantic segmentation and localization. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8354\u20138365 (2022)","DOI":"10.1109\/CVPR52688.2022.00818"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Neoral, M., \u0160er\u00fdch, J., Matas, J.: MFT: long-term tracking of every pixel. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 6837\u20136847 (2024)","DOI":"10.1109\/WACV57701.2024.00669"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Ofri-Amar, D., Geyer, M., Kasten, Y., Dekel, T.: Neural congealing: aligning images to a joint semantic atlas. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 19403\u201319412 (2023)","DOI":"10.1109\/CVPR52729.2023.01859"},{"key":"21_CR35","unstructured":"Oquab, M., et al.: DINOv2: learning robust visual features without supervision. arXiv:2304.07193 (2023)"},{"key":"21_CR36","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van Gool, L.: The 2017 davis challenge on video object segmentation. arXiv:1704.00675 (2017)"},{"key":"21_CR37","unstructured":"Rocco, I., Cimpoi, M., Arandjelovi\u0107, R., Torii, A., Pajdla, T., Sivic, J.: Neighbourhood consensus networks. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"21_CR38","doi-asserted-by":"crossref","unstructured":"Rubinstein, M., Liu, C.: Towards longer long-range motion trajectories. In: British Machine Vision Conference (2012)","DOI":"10.5244\/C.26.53"},{"key":"21_CR39","doi-asserted-by":"crossref","unstructured":"Salehi, M., Gavves, E., Snoek, C.G.M., Asano, Y.M.: Time does tell: self-supervised time-tuning of dense image representations. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01516"},{"key":"21_CR40","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1007\/s11263-008-0136-6","volume":"80","author":"P Sand","year":"2006","unstructured":"Sand, P., Teller, S.J.: Particle video: long-range motion estimation using point trajectories. Int. J. Comput. Vis. 80, 72\u201391 (2006)","journal-title":"Int. J. Comput. Vis."},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Shtedritski, A., Vedaldi, A., Rupprecht, C.: Learning universal semantic correspondences with no supervision and automatic data curation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops, pp. 933\u2013943 (2023)","DOI":"10.1109\/ICCVW60793.2023.00100"},{"key":"21_CR42","doi-asserted-by":"crossref","unstructured":"Sun, D., Yang, X., Liu, M.Y., Kautz, J.: PWC-Net: CNNs for optical flow using pyramid, warping, and cost volume. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8934\u20138943 (2017)","DOI":"10.1109\/CVPR.2018.00931"},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Sun, X., Harley, A.W., Guibas, L.J.: Refining pre-trained motion models. In: Proceedings of the IEEE International Conference on Robotics and Automation (2024)","DOI":"10.1109\/ICRA57147.2024.10610900"},{"key":"21_CR44","doi-asserted-by":"crossref","unstructured":"Teed, Z., Deng, J.: Raft: recurrent all-pairs field transforms for optical flow. In: European Conference on Computer Vision (ECCV), pp. 402\u2013419 (2020)","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"21_CR45","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3630096","volume":"43","author":"N Tumanyan","year":"2023","unstructured":"Tumanyan, N., Bar-Tal, O., Amir, S., Bagon, S., Dekel, T.: Disentangling structure and appearance in ViT feature space. ACM Trans. Graph. 43, 1\u20136 (2023)","journal-title":"ACM Trans. Graph."},{"key":"21_CR46","doi-asserted-by":"crossref","unstructured":"Tumanyan, N., Bar-Tal, O., Bagon, S., Dekel, T.: Splicing ViT features for semantic appearance transfer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10748\u201310757 (2022)","DOI":"10.1109\/CVPR52688.2022.01048"},{"key":"21_CR47","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Shrivastava, A., Fathi, A., Guadarrama, S., Murphy, K.: Tracking emerges by colorizing videos. In: Proceedings of the European Conference on Computer Vision (ECCV) (2018)","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"21_CR48","doi-asserted-by":"crossref","unstructured":"Wang, Q., et al.: Tracking everything everywhere all at once. In: International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.01813"},{"key":"21_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"757","DOI":"10.1007\/978-3-030-58452-8_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Q Wang","year":"2020","unstructured":"Wang, Q., Zhou, X., Hariharan, B., Snavely, N.: Learning feature descriptors using camera pose supervision. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 757\u2013774. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_44"},{"key":"21_CR50","doi-asserted-by":"crossref","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00267"},{"key":"21_CR51","doi-asserted-by":"crossref","unstructured":"Xu, H., Zhang, J., Cai, J., Rezatofighi, H., Tao, D.: GMFlow: learning optical flow via global matching. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 8111\u20138120 (2021)","DOI":"10.1109\/CVPR52688.2022.00795"},{"key":"21_CR52","doi-asserted-by":"crossref","unstructured":"Xu, J., Ranftl, R., Koltun, V.: Accurate optical flow via direct cost volume processing. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1289\u20131297 (2017)","DOI":"10.1109\/CVPR.2017.615"},{"key":"21_CR53","doi-asserted-by":"crossref","unstructured":"Xu, J., Wang, X.: Rethinking self-supervised correspondence learning: a video frame-level similarity perspective. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10075\u201310085 (2021)","DOI":"10.1109\/ICCV48922.2021.00992"},{"key":"21_CR54","doi-asserted-by":"publisher","first-page":"107861","DOI":"10.1016\/j.patcog.2021.107861","volume":"114","author":"M Zhai","year":"2021","unstructured":"Zhai, M., Xiang, X., Lv, N., Kong, X.: Optical flow and scene flow estimation: a survey. Pattern Recogn. 114, 107861 (2021)","journal-title":"Pattern Recogn."},{"key":"21_CR55","unstructured":"Zhang, J., et al.: A tale of two features: stable diffusion complements DINO for zero-shot semantic correspondence. arXiv:2305.15347 (2023)"},{"key":"21_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, J., et al.: Telling left from right: identifying geometry-aware semantic correspondence. arXiv:2311.17034 (2023)","DOI":"10.1109\/CVPR52733.2024.00297"},{"key":"21_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. arXiv:2302.05543 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"21_CR58","doi-asserted-by":"crossref","unstructured":"Zhao, W., Liu, S., Guo, H., Wang, W., Liu, Y.: ParticleSfM: exploiting dense point trajectories for localizing moving cameras in the wild. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19824-3_31"},{"key":"21_CR59","doi-asserted-by":"crossref","unstructured":"Zheng, Y., Harley, A.W., Shen, B., Wetzstein, G., Guibas, L.J.: PointOdyssey: a large-scale synthetic dataset for long-term point tracking. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01818"},{"key":"21_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, T., Krahenbuhl, P., Aubry, M., Huang, Q., Efros, A.A.: Learning dense correspondence via 3D-guided cycle consistency. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 117\u2013126 (2016)","DOI":"10.1109\/CVPR.2016.20"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73347-5_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T10:30:50Z","timestamp":1732962650000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73347-5_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,29]]},"ISBN":["9783031733468","9783031733475"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73347-5_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,29]]},"assertion":[{"value":"29 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}