{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T20:52:47Z","timestamp":1769633567446,"version":"3.49.0"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031198205","type":"print"},{"value":"9783031198212","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-19821-2_6","type":"book-chapter","created":{"date-parts":[[2022,10,22]],"date-time":"2022-10-22T12:12:59Z","timestamp":1666440779000},"page":"97-115","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Semantic-Aware Fine-Grained Correspondence"],"prefix":"10.1007","author":[{"given":"Yingdong","family":"Hu","sequence":"first","affiliation":[]},{"given":"Renhao","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Kaifeng","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Gao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,10,23]]},"reference":[{"key":"6_CR1","unstructured":"Bachman, P., Hjelm, R.D., Buchwalter, W.: Learning representations by maximizing mutual information across views. arXiv preprint arXiv:1906.00910 (2019)"},{"key":"6_CR2","doi-asserted-by":"crossref","unstructured":"Bai, Y., Chen, X., Kirillov, A., Yuille, A., Berg, A.C.: Point-level region contrast for object detection pre-training. arXiv preprint arXiv:2202.04639 (2022)","DOI":"10.1109\/CVPR52688.2022.01559"},{"issue":"2","key":"6_CR3","doi-asserted-by":"publisher","first-page":"309","DOI":"10.1111\/tops.12527","volume":"13","author":"DH Ballard","year":"2021","unstructured":"Ballard, D.H., Zhang, R.: The hierarchical evolution in human vision modeling. Top. Cogn. Sci. 13(2), 309\u2013328 (2021)","journal-title":"Top. Cogn. Sci."},{"key":"6_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"850","DOI":"10.1007\/978-3-319-48881-3_56","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"L Bertinetto","year":"2016","unstructured":"Bertinetto, L., Valmadre, J., Henriques, J.F., Vedaldi, A., Torr, P.H.S.: Fully-convolutional siamese networks for object tracking. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9914, pp. 850\u2013865. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_56"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Caelles, S., Maninis, K.K., Pont-Tuset, J., Leal-Taix\u00e9, L., Cremers, D., Van Gool, L.: One-shot video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 221\u2013230 (2017)","DOI":"10.1109\/CVPR.2017.565"},{"key":"6_CR6","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. arXiv preprint arXiv:2006.09882 (2020)"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"6_CR9","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning PMLR, pp. 1597\u20131607 (2020)"},{"key":"6_CR10","unstructured":"Chen, T., Luo, C., Li, L.: Intriguing properties of contrastive losses. arXiv preprint arXiv:2011.02803 (2020)"},{"key":"6_CR11","unstructured":"Chen, X., Fan, H., Girshick, R., He, K.: Improved baselines with momentum contrastive learning. arXiv preprint arXiv:2003.04297 (2020)"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Chen, X., He, K.: Exploring simple siamese representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15750\u201315758 (2021)","DOI":"10.1109\/CVPR46437.2021.01549"},{"key":"6_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1007\/978-3-030-20893-6_22","volume-title":"Computer Vision \u2013 ACCV 2018","author":"YC Chen","year":"2019","unstructured":"Chen, Y.C., et al.: Deep semantic matching with foreground detection and cycle-consistency. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11363, pp. 347\u2013362. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20893-6_22"},{"key":"6_CR14","unstructured":"Cheng, H.K., Tai, Y.W., Tang, C.K.: Rethinking space-time networks with improved memory coverage for efficient video object segmentation. arXiv preprint arXiv:2106.05210 (2021)"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"6_CR16","unstructured":"Dosovitskiy, A., et al.: An image is worth 16 $$\\times $$ 16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Dosovitskiy, Aet al.: Flownet: Learning optical flow with convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2758\u20132766 (2015)","DOI":"10.1109\/ICCV.2015.316"},{"key":"6_CR18","unstructured":"Gordon, D., Ehsani, K., Fox, D., Farhadi, A.: Watching the world go by: Representation learning from unlabeled videos. arXiv preprint arXiv:2003.07990 (2020)"},{"key":"6_CR19","unstructured":"Gould, S., et al.: Peripheral-foveal vision for real-time object recognition and tracking in video. In: IJCAI, vol. 7, pp. 2115\u20132121 (2007)"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Grabner, H., Matas, J., Van Gool, L., Cattin, P.: Tracking the invisible: learning where the object might be. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 1285\u20131292. IEEE (2010)","DOI":"10.1109\/CVPR.2010.5539819"},{"key":"6_CR21","unstructured":"Grill, J.B., et al.: Bootstrap your own latent: a new approach to self-supervised learning. arXiv preprint arXiv:2006.07733 (2020)"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Hadsell, R., Chopra, S., LeCun, Y.: Dimensionality reduction by learning an invariant mapping. In: 2006 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2006), vol. 2, pp. 1735\u20131742. IEEE (2006)","DOI":"10.1109\/CVPR.2006.100"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Hariharan, B., Arbel\u00e1ez, P., Girshick, R., Malik, J.: Hypercolumns for object segmentation and fine-grained localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 447\u2013456 (2015)","DOI":"10.1109\/CVPR.2015.7298642"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9729\u20139738 (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"6_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"749","DOI":"10.1007\/978-3-319-46448-0_45","volume-title":"Computer Vision \u2013 ECCV 2016","author":"D Held","year":"2016","unstructured":"Held, D., Thrun, S., Savarese, S.: Learning to track at 100 FPS with deep regression networks. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 749\u2013765. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_45"},{"key":"6_CR27","unstructured":"Henaff, O.: Data-efficient image recognition with contrastive predictive coding. In: International Conference on Machine Learning PMLR, pp. 4182\u20134192 (2020)"},{"issue":"3","key":"6_CR28","doi-asserted-by":"publisher","first-page":"583","DOI":"10.1109\/TPAMI.2014.2345390","volume":"37","author":"JF Henriques","year":"2014","unstructured":"Henriques, J.F., Caseiro, R., Martins, P., Batista, J.: High-speed tracking with kernelized correlation filters. IEEE Trans. Pattern Anal. Mach. Intell. 37(3), 583\u2013596 (2014)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Huang, S., Wang, Q., Zhang, S., Yan, S., He, X.: Dynamic context correspondence network for semantic alignment. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2010\u20132019 (2019)","DOI":"10.1109\/ICCV.2019.00210"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Ilg, E., Mayer, N., Saikia, T., Keuper, M., Dosovitskiy, A., Brox, T.: Flownet 2.0: evolution of optical flow estimation with deep networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2462\u20132470 (2017)","DOI":"10.1109\/CVPR.2017.179"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Iqbal, U., Garbade, M., Gall, J.: Pose for action-action for pose. In: 2017 12th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2017), pp. 438\u2013445. IEEE (2017)","DOI":"10.1109\/FG.2017.61"},{"key":"6_CR32","unstructured":"Jabri, A., Owens, A., Efros, A.A.: Space-time correspondence as a contrastive random walk. Adv. Neural Inf. Process. Syst. 33, 19545\u201319560 (2020)"},{"key":"6_CR33","doi-asserted-by":"crossref","unstructured":"Jhuang, H., Gall, J., Zuffi, S., Schmid, C., Black, M.J.: Towards understanding action recognition. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3192\u20133199 (2013)","DOI":"10.1109\/ICCV.2013.396"},{"key":"6_CR34","doi-asserted-by":"crossref","unstructured":"Lai, Z., Lu, E., Xie, W.: Mast: a memory-augmented self-supervised tracker. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6479\u20136488 (2020)","DOI":"10.1109\/CVPR42600.2020.00651"},{"key":"6_CR35","unstructured":"Lai, Z., Xie, W.: Self-supervised learning for video correspondence flow. arXiv preprint arXiv:1905.00875 (2019)"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Li, B., Yan, J., Wu, W., Zhu, Z., Hu, X.: High performance visual tracking with siamese region proposal network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8971\u20138980 (2018)","DOI":"10.1109\/CVPR.2018.00935"},{"key":"6_CR37","unstructured":"Li, X., Liu, S., De Mello, S., Wang, X., Kautz, J., Yang, M.H.: Joint-task self-supervised learning for temporal correspondence. arXiv preprint arXiv:1909.11895 (2019)"},{"key":"6_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"28","DOI":"10.1007\/978-3-540-88690-7_3","volume-title":"Computer Vision \u2013 ECCV 2008","author":"C Liu","year":"2008","unstructured":"Liu, C., Yuen, J., Torralba, A., Sivic, J., Freeman, W.T.: SIFT flow: dense correspondence across different scenes. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008. LNCS, vol. 5304, pp. 28\u201342. Springer, Heidelberg (2008). https:\/\/doi.org\/10.1007\/978-3-540-88690-7_3"},{"key":"6_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhu, L., Yamada, M., Yang, Y.: Semantic correspondence as an optimal transport problem. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4463\u20134472 (2020)","DOI":"10.1109\/CVPR42600.2020.00452"},{"key":"6_CR40","first-page":"1601","volume":"27","author":"JL Long","year":"2014","unstructured":"Long, J.L., Zhang, N., Darrell, T.: Do convnets learn correspondence? Adv. Neural Inf. Process. Syst. 27, 1601\u20131609 (2014)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"6_CR41","doi-asserted-by":"crossref","unstructured":"Min, J., Cho, M.: Convolutional hough matching networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2940\u20132950 (2021)","DOI":"10.1109\/CVPR46437.2021.00296"},{"key":"6_CR42","doi-asserted-by":"crossref","unstructured":"Min, J., Lee, J., Ponce, J., Cho, M.: Hyperpixel flow: semantic correspondence with multi-layer neural features. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3395\u20133404 (2019)","DOI":"10.1109\/ICCV.2019.00349"},{"key":"6_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"346","DOI":"10.1007\/978-3-030-58555-6_21","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Min","year":"2020","unstructured":"Min, J., Lee, J., Ponce, J., Cho, M.: Learning to compose hypercolumns for visual correspondence. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 346\u2013363. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_21"},{"key":"6_CR44","doi-asserted-by":"crossref","unstructured":"Misra, I., Maaten, L.V.D.: Self-supervised learning of pretext-invariant representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6707\u20136717 (2020)","DOI":"10.1109\/CVPR42600.2020.00674"},{"key":"6_CR45","doi-asserted-by":"crossref","unstructured":"Oh, S.W., Lee, J.Y., Xu, N., Kim, S.J.: Video object segmentation using space-time memory networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9226\u20139235 (2019)","DOI":"10.1109\/ICCV.2019.00932"},{"key":"6_CR46","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"6_CR47","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van Gool, L., Gross, M., Sorkine-Hornung, A.: a benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"6_CR48","unstructured":"Pinheiro, P.O., Almahairi, A., Benmalek, R.Y., Golemo, F., Courville, A.: Unsupervised learning of dense visual representations. arXiv preprint arXiv:2011.05499 (2020)"},{"key":"6_CR49","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van Gool, L.: The 2017 davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)"},{"key":"6_CR50","unstructured":"Purushwalkam, S., Gupta, A.: Demystifying contrastive self-supervised learning: Invariances, augmentations and dataset biases. arXiv preprint arXiv:2007.13916 (2020)"},{"key":"6_CR51","doi-asserted-by":"crossref","unstructured":"Ranjan, A., Black, M.J.: Optical flow estimation using a spatial pyramid network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4161\u20134170 (2017)","DOI":"10.1109\/CVPR.2017.291"},{"key":"6_CR52","doi-asserted-by":"crossref","unstructured":"Rocco, I., Arandjelovic, R., Sivic, J.: Convolutional neural network architecture for geometric matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6148\u20136157 (2017)","DOI":"10.1109\/CVPR.2017.12"},{"key":"6_CR53","doi-asserted-by":"crossref","unstructured":"Rocco, I., Arandjelovi\u0107, R., Sivic, J.: End-to-end weakly-supervised semantic alignment. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6917\u20136925 (2018)","DOI":"10.1109\/CVPR.2018.00723"},{"key":"6_CR54","unstructured":"Rocco, I., Cimpoi, M., Arandjelovi\u0107, R., Torii, A., Pajdla, T., Sivic, J.: Neighbourhood consensus networks. arXiv preprint arXiv:1810.10510 (2018)"},{"key":"6_CR55","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Adv. Neural Inf. Process. Syst. 27 (2014)"},{"key":"6_CR56","doi-asserted-by":"crossref","unstructured":"Song, J., Wang, L., Van Gool, L., Hilliges, O.: Thin-slicing network: a deep structured model for pose estimation in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4220\u20134229 (2017)","DOI":"10.1109\/CVPR.2017.590"},{"key":"6_CR57","doi-asserted-by":"crossref","unstructured":"Sun, D., Yang, X., Liu, M.Y., Kautz, J.: Pwc-net: Cnns for optical flow using pyramid, warping, and cost volume. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8934\u20138943 (2018)","DOI":"10.1109\/CVPR.2018.00931"},{"key":"6_CR58","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58536-5_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Teed","year":"2020","unstructured":"Teed, Z., Deng, J.: RAFT: recurrent all-pairs field transforms for optical flow. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 402\u2013419. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_24"},{"key":"6_CR59","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"776","DOI":"10.1007\/978-3-030-58621-8_45","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Tian","year":"2020","unstructured":"Tian, Y., Krishnan, D., Isola, P.: Contrastive multiview coding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 776\u2013794. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_45"},{"key":"6_CR60","doi-asserted-by":"crossref","unstructured":"Truong, P., Danelljan, M., Timofte, R.: GLU-net: global-local universal network for dense flow and correspondences. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6258\u20136268 (2020)","DOI":"10.1109\/CVPR42600.2020.00629"},{"key":"6_CR61","doi-asserted-by":"crossref","unstructured":"Valmadre, J., Bertinetto, L., Henriques, J., Vedaldi, A., Torr, P.H.: End-to-end representation learning for correlation filter based tracking. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2805\u20132813 (2017)","DOI":"10.1109\/CVPR.2017.531"},{"key":"6_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"268","DOI":"10.1007\/978-3-030-58607-2_16","volume-title":"Computer Vision \u2013 ECCV 2020","author":"W Van Gansbeke","year":"2020","unstructured":"Van Gansbeke, W., Vandenhende, S., Georgoulis, S., Proesmans, M., Van Gool, L.: SCAN: learning to classify images without labels. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12355, pp. 268\u2013285. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58607-2_16"},{"key":"6_CR63","unstructured":"Van Gansbeke, W., Vandenhende, S., Georgoulis, S., Van Gool, L.: Revisiting contrastive methods for unsupervised learning of visual representations. arXiv preprint arXiv:2106.05967 (2021)"},{"key":"6_CR64","doi-asserted-by":"crossref","unstructured":"Voigtlaender, P., Chai, Y., Schroff, F., Adam, H., Leibe, B., Chen, L.C.: Feelvos: fast end-to-end embedding learning for video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9481\u20139490 (2019)","DOI":"10.1109\/CVPR.2019.00971"},{"key":"6_CR65","doi-asserted-by":"crossref","unstructured":"Voigtlaender, P., Leibe, B.: Online adaptation of convolutional neural networks for video object segmentation. arXiv preprint arXiv:1706.09364 (2017)","DOI":"10.5244\/C.31.116"},{"key":"6_CR66","doi-asserted-by":"crossref","unstructured":"Vondrick, C., Shrivastava, A., Fathi, A., Guadarrama, S., Murphy, K.: Tracking emerges by colorizing videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 391\u2013408 (2018)","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"6_CR67","unstructured":"Wang, N., Yeung, D.Y.: Learning a deep compact image representation for visual tracking. Adv. Neural Inf. Process. Syst. (2013)"},{"key":"6_CR68","doi-asserted-by":"crossref","unstructured":"Wang, N., Song, Y., Ma, C., Zhou, W., Liu, W., Li, H.: Unsupervised deep tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1308\u20131317 (2019)","DOI":"10.1109\/CVPR.2019.00140"},{"key":"6_CR69","doi-asserted-by":"crossref","unstructured":"Wang, Q., Zhang, L., Bertinetto, L., Hu, W., Torr, P.H.: Fast online object tracking and segmentation: a unifying approach. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1328\u20131338 (2019)","DOI":"10.1109\/CVPR.2019.00142"},{"key":"6_CR70","doi-asserted-by":"crossref","unstructured":"Wang, X., Jabri, A., Efros, A.A.: Learning correspondence from the cycle-consistency of time. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2566\u20132576 (2019)","DOI":"10.1109\/CVPR.2019.00267"},{"key":"6_CR71","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, R., Shen, C., Kong, T., Li, L.: Dense contrastive learning for self-supervised visual pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3024\u20133033 (2021)","DOI":"10.1109\/CVPR46437.2021.00304"},{"key":"6_CR72","doi-asserted-by":"crossref","unstructured":"Wu, Z., Xiong, Y., Yu, S., Lin, D.: Unsupervised feature learning via non-parametric instance-level discrimination. arXiv preprint arXiv:1805.01978 (2018)","DOI":"10.1109\/CVPR.2018.00393"},{"key":"6_CR73","doi-asserted-by":"crossref","unstructured":"Xie, E., et al.: Detco: unsupervised contrastive learning for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8392\u20138401 (2021)","DOI":"10.1109\/ICCV48922.2021.00828"},{"key":"6_CR74","doi-asserted-by":"crossref","unstructured":"Xie, Z., Lin, Y., Zhang, Z., Cao, Y., Lin, S., Hu, H.: Propagate yourself: exploring pixel-level consistency for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16684\u201316693 (2021)","DOI":"10.1109\/CVPR46437.2021.01641"},{"key":"6_CR75","doi-asserted-by":"crossref","unstructured":"Xu, J., Wang, X.: Rethinking self-supervised correspondence learning: a video frame-level similarity perspective. arXiv preprint arXiv:2103.17263 (2021)","DOI":"10.1109\/ICCV48922.2021.00992"},{"key":"6_CR76","unstructured":"Xu, N., et al.: Youtube-vos: a large-scale video object segmentation benchmark. arXiv preprint arXiv:1809.03327 (2018)"},{"key":"6_CR77","doi-asserted-by":"crossref","unstructured":"Yang, Y., Ramanan, D.: Articulated pose estimation with flexible mixtures-of-parts. In: CVPR 2011, pp. 1385\u20131392. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995741"},{"key":"6_CR78","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: Human gaze assisted artificial intelligence: a review. In: IJCAI: Proceedings of the Conference. vol. 2020, p. 4951. NIH Public Access (2020)","DOI":"10.24963\/ijcai.2020\/689"},{"key":"6_CR79","doi-asserted-by":"crossref","unstructured":"Zhou, Q., Liang, X., Gong, K., Lin, L.: Adaptive temporal encoding network for video instance-level human parsing. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 1527\u20131535 (2018)","DOI":"10.1145\/3240508.3240660"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2022"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-19821-2_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,6]],"date-time":"2024-10-06T10:22:37Z","timestamp":1728210157000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-19821-2_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031198205","9783031198212"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-19821-2_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"23 October 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tel Aviv","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Israel","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 October 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27 October 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2022.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5804","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1645","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.21","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.91","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}