{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T17:18:34Z","timestamp":1772558314008,"version":"3.50.1"},"reference-count":66,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,8,9]],"date-time":"2024-08-09T00:00:00Z","timestamp":1723161600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,8,9]],"date-time":"2024-08-09T00:00:00Z","timestamp":1723161600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Natural Science Foundation of China Youth Fund","award":["62202142"],"award-info":[{"award-number":["62202142"]}]},{"name":"Key Research and Promotion Projects of Henan Province","award":["222102210215"],"award-info":[{"award-number":["222102210215"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2025,3]]},"DOI":"10.1007\/s00371-024-03597-8","type":"journal-article","created":{"date-parts":[[2024,8,9]],"date-time":"2024-08-09T09:12:22Z","timestamp":1723194742000},"page":"3221-3238","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Motion perception-driven multimodal self-supervised video object segmentation"],"prefix":"10.1007","volume":"41","author":[{"given":"Jun","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Honghui","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chenhao","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ziqing","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yonghua","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,8,9]]},"reference":[{"key":"3597_CR1","doi-asserted-by":"publisher","unstructured":"Lian, L., Wu, Z., Yu, S.X.: Bootstrapping objectness from videos by relaxed common fate and visual grouping. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14582\u201314591 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.01401","DOI":"10.1109\/CVPR52729.2023.01401"},{"key":"3597_CR2","doi-asserted-by":"publisher","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., Joulin, A.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00951","DOI":"10.1109\/ICCV48922.2021.00951"},{"issue":"12","key":"3597_CR3","doi-asserted-by":"publisher","first-page":"15790","DOI":"10.1109\/TPAMI.2023.3305122","volume":"45","author":"Y Wang","year":"2023","unstructured":"Wang, Y., Shen, X., Yuan, Y., Du, Y., Li, M., Hu, S.X., Crowley, J.L., Vaufreydaz, D.: TokenCut: segmenting objects in images and videos with self-supervised transformer and normalized cut. IEEE Trans. Pattern Anal. Mach. Intell. 45(12), 15790\u201315801 (2023). https:\/\/doi.org\/10.1109\/TPAMI.2023.3305122","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3597_CR4","doi-asserted-by":"publisher","first-page":"4661","DOI":"10.1007\/s11042-019-7413-y","volume":"79","author":"C Li","year":"2020","unstructured":"Li, C., Chen, Z., Sheng, B., Li, P., He, G.: Video flickering removal using temporal reconstruction optimization. Multimed. Tools Appl. 79, 4661\u20134679 (2020). https:\/\/doi.org\/10.1007\/s11042-019-7413-y","journal-title":"Multimed. Tools Appl."},{"key":"3597_CR5","doi-asserted-by":"crossref","unstructured":"Lu, X., Wang, W., Shen, J., Tai, Y.W., Crandall, D.J., Hoi, S.C.: Learning video object segmentation from unlabeled videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8960\u20138970 (2020). arXiv:2003.05020","DOI":"10.1109\/CVPR42600.2020.00898"},{"key":"3597_CR6","unstructured":"Ding, S., Xie, W., Chen, Y., Qian, R., Zhang, X., Xiong, H., Tian, Q.: Motion-inductive self-supervised object discovery in videos. arXiv preprint arXiv:2210.00221 (2022). https:\/\/doi.org\/10.48550\/arXiv.2210.00221"},{"key":"3597_CR7","unstructured":"Xie, J., Xie, W., Zisserman, A.: Segmenting moving objects via an object-centric layered representation. In: Advances in Neural Information Processing Systems, vol. 35, pp. 28023\u201328036 (2022). arXiv:2207.02206"},{"key":"3597_CR8","doi-asserted-by":"publisher","unstructured":"Lai, Z., Lu, E., Xie, W.: MAST: a memory-augmented self-supervised tracker. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6479\u20136488 (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.00651","DOI":"10.1109\/CVPR42600.2020.00651"},{"issue":"1","key":"3597_CR9","doi-asserted-by":"publisher","first-page":"301","DOI":"10.1515\/gth-2017-0007","volume":"4","author":"W Max","year":"1923","unstructured":"Max, W.: Untersuchungen zur lehre von der gestalt ii. Psychol. Forsch. 4(1), 301\u201350 (1923). https:\/\/doi.org\/10.1515\/gth-2017-0007","journal-title":"Psychol. Forsch."},{"key":"3597_CR10","volume-title":"The Senses Considered as Perceptual Systems","author":"JJ Gibson","year":"1966","unstructured":"Gibson, J.J.: The Senses Considered as Perceptual Systems. Houghton Mifflin, Boston (1966)"},{"key":"3597_CR11","doi-asserted-by":"publisher","first-page":"201","DOI":"10.3758\/BF03212378","volume":"14","author":"G Johansson","year":"1973","unstructured":"Johansson, G.: Visual perception of biological motion and a model for its analysis. Percept. Psychophys. 14, 201\u2013211 (1973). https:\/\/doi.org\/10.3758\/BF03212378","journal-title":"Percept. Psychophys."},{"key":"3597_CR12","doi-asserted-by":"publisher","unstructured":"Yang, C., Lamdouar, H., Lu, E., Zisserman, A., Xie, W.: Self-supervised video object segmentation by motion grouping. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7177\u20137188 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00709","DOI":"10.1109\/ICCV48922.2021.00709"},{"key":"3597_CR13","unstructured":"Lamdouar, H., Xie, W., Zisserman, A.: Segmenting invisible moving objects. In: Proceedings of the British Machine Vision Conference. British Machine Vision Association (2021)"},{"key":"3597_CR14","doi-asserted-by":"publisher","first-page":"109399","DOI":"10.1016\/j.patcog.2023.109399","volume":"138","author":"J Sun","year":"2023","unstructured":"Sun, J., Mao, Y., Dai, Y., Zhong, Y., Wang, J.: MUNet: motion uncertainty-aware semi-supervised video object segmentation. Pattern Recogn. 138, 109399 (2023). https:\/\/doi.org\/10.1016\/j.patcog.2023.109399","journal-title":"Pattern Recogn."},{"key":"3597_CR15","doi-asserted-by":"publisher","unstructured":"Dosovitskiy, A., Fischer, P., Ilg, E., Hausser, P., Hazirbas, C., Golkov, V., Van Der\u00a0Smagt, P., Cremers, D., Brox, T.: FlowNet: learning optical flow with convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2758\u20132766 (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.316","DOI":"10.1109\/ICCV.2015.316"},{"key":"3597_CR16","doi-asserted-by":"publisher","unstructured":"Zhou, T., Wang, S., Zhou, Y., Yao, Y., Li, J., Shao, L.: Motion-attentive transition for zero-shot video object segmentation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 13066\u201313073 (2020). https:\/\/doi.org\/10.1609\/aaai.v34i07.7008","DOI":"10.1609\/aaai.v34i07.7008"},{"key":"3597_CR17","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3296629","author":"Y Tang","year":"2023","unstructured":"Tang, Y., Chen, T., Jiang, X., Yao, Y., Xie, G.S., Shen, H.T.: Holistic prototype attention network for few-shot video object segmentation. IEEE Trans. Circuits Syst. Video Technol. (2023). https:\/\/doi.org\/10.1109\/TCSVT.2023.3296629","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3597_CR18","doi-asserted-by":"publisher","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732 (2016). https:\/\/doi.org\/10.1109\/CVPR.2016.85","DOI":"10.1109\/CVPR.2016.85"},{"key":"3597_CR19","doi-asserted-by":"publisher","unstructured":"Li, F., Kim, T., Humayun, A., Tsai, D., Rehg, J.M.: Video segmentation by tracking many figure-ground segments. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2192\u20132199 (2013). https:\/\/doi.org\/10.1109\/ICCV.2013.273","DOI":"10.1109\/ICCV.2013.273"},{"key":"3597_CR20","doi-asserted-by":"crossref","unstructured":"Lamdouar, H., Yang, C., Xie, W., Zisserman, A.: Betrayed by motion: camouflaged object discovery via motion segmentation. In: Proceedings of the Asian Conference on Computer Vision (2020). arXiv:2011.11630","DOI":"10.1007\/978-3-030-69532-3_30"},{"issue":"6","key":"3597_CR21","doi-asserted-by":"publisher","first-page":"1187","DOI":"10.1109\/TPAMI.2013.242","volume":"36","author":"P Ochs","year":"2013","unstructured":"Ochs, P., Malik, J., Brox, T.: Segmentation of moving objects by long term video analysis. IEEE Trans. Pattern Anal. Mach. Intell. 36(6), 1187\u20131200 (2013). https:\/\/doi.org\/10.1109\/TPAMI.2013.242","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3597_CR22","doi-asserted-by":"crossref","unstructured":"Bertasius, G., Torresani, L.: Classifying, segmenting, and tracking object instances in video with mask propagation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9739\u20139748 (2020). arXiv:1912.04573","DOI":"10.1109\/CVPR42600.2020.00976"},{"issue":"5","key":"3597_CR23","doi-asserted-by":"publisher","first-page":"1410","DOI":"10.1109\/TCSVT.2019.2902937","volume":"30","author":"Z Chen","year":"2019","unstructured":"Chen, Z., Wang, J., Sheng, B., Li, P., Feng, D.D.: Illumination-invariant video cut-out using octagon sensitive optimization. IEEE Trans. Circuits Syst. Video Technol. 30(5), 1410\u20131422 (2019). https:\/\/doi.org\/10.1109\/TCSVT.2019.2902937","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3597_CR24","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Chen, X., Wang, J.: Object-contextual representations for semantic segmentation. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part VI 16, pp. 173\u2013190. Springer (2020). arXiv:1909.11065","DOI":"10.1007\/978-3-030-58539-6_11"},{"issue":"7","key":"3597_CR25","doi-asserted-by":"publisher","first-page":"4498","DOI":"10.1109\/TCSVT.2021.3127562","volume":"32","author":"F Lin","year":"2021","unstructured":"Lin, F., Xie, H., Liu, C., Zhang, Y.: Bilateral temporal re-aggregation for weakly-supervised video object segmentation. IEEE Trans. Circuits Syst. Video Technol. 32(7), 4498\u20134512 (2021). https:\/\/doi.org\/10.1109\/TCSVT.2021.3127562","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3597_CR26","doi-asserted-by":"publisher","unstructured":"Wang, W., Shen, J., Xie, J., Porikli, F.: Super-trajectory for video segmentation. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1671\u20131679 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.185","DOI":"10.1109\/ICCV.2017.185"},{"key":"3597_CR27","doi-asserted-by":"publisher","unstructured":"Grundmann, M., Kwatra, V., Han, M., Essa, I.: Efficient hierarchical graph-based video segmentation. In: 2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, pp. 2141\u20132148. IEEE (2010). https:\/\/doi.org\/10.1109\/CVPR.2010.5539893","DOI":"10.1109\/CVPR.2010.5539893"},{"key":"3597_CR28","doi-asserted-by":"publisher","unstructured":"Xu, C., Corso, J.J.: Evaluation of super-voxel methods for early video processing. In: 2012 IEEE Conference on Computer Vision and Pattern Recognition, pp. 1202\u20131209. IEEE (2012). https:\/\/doi.org\/10.1109\/CVPR.2012.6247802","DOI":"10.1109\/CVPR.2012.6247802"},{"key":"3597_CR29","doi-asserted-by":"publisher","unstructured":"Zhang, D., Javed, O., Shah, M.: Video object segmentation through spatially accurate and temporally dense extraction of primary object regions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 628\u2013635 (2013). https:\/\/doi.org\/10.1109\/CVPR.2013.87","DOI":"10.1109\/CVPR.2013.87"},{"key":"3597_CR30","doi-asserted-by":"publisher","unstructured":"Tsai, Y.H., Zhong, G., Yang, M.H.: Semantic co-segmentation in videos. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, pp. 760\u2013775. Springer (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_46","DOI":"10.1007\/978-3-319-46493-0_46"},{"key":"3597_CR31","doi-asserted-by":"publisher","first-page":"153869","DOI":"10.1109\/ACCESS.2019.2899348","volume":"7","author":"D Zeng","year":"2019","unstructured":"Zeng, D., Chen, X., Zhu, M., Goesele, M., Kuijper, A.: Background subtraction with real-time semantic segmentation. IEEE Access 7, 153869\u2013153884 (2019). https:\/\/doi.org\/10.1109\/ACCESS.2019.2899348","journal-title":"IEEE Access"},{"key":"3597_CR32","doi-asserted-by":"publisher","first-page":"325","DOI":"10.1016\/j.neucom.2021.04.090","volume":"455","author":"W Zhu","year":"2021","unstructured":"Zhu, W., Meng, J., Xu, L.: Self-supervised video object segmentation using integration-augmented attention. Neurocomputing 455, 325\u2013339 (2021). https:\/\/doi.org\/10.1016\/j.neucom.2021.04.090","journal-title":"Neurocomputing"},{"key":"3597_CR33","doi-asserted-by":"publisher","unstructured":"Lee, S., Cho, S., Lee, D., Lee, M., Lee, S.: Tsanet: temporal and scale alignment for unsupervised video object segmentation. arXiv preprint arXiv:2303.04376 (2023). https:\/\/doi.org\/10.1109\/ICIP49359.2023.10222236","DOI":"10.1109\/ICIP49359.2023.10222236"},{"key":"3597_CR34","unstructured":"Lian, L., Wu, Z., Yu, S.X.: Improving unsupervised video object segmentation with motion-appearance synergy. arXiv preprint arXiv:2212.08816 (2022)"},{"key":"3597_CR35","doi-asserted-by":"crossref","unstructured":"Lu, X., Wang, W., Ma, C., Shen, J., Shao, L., Porikli, F.: See more, know more: unsupervised video object segmentation with co-attention Siamese networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3623\u20133632 (2019). arXiv:2001.06810","DOI":"10.1109\/CVPR.2019.00374"},{"key":"3597_CR36","doi-asserted-by":"crossref","unstructured":"Dutt\u00a0Jain, S., Xiong, B., Grauman, K.: FusionSeg: learning to combine motion and appearance for fully automatic segmentation of generic objects in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3664\u20133673 (2017). arXiv:1701.05384","DOI":"10.1109\/CVPR.2017.228"},{"key":"3597_CR37","doi-asserted-by":"publisher","unstructured":"Brox, T., Bruhn, A., Papenberg, N., Weickert, J.: High accuracy optical flow estimation based on a theory for warping. In: Computer Vision-ECCV 2004: 8th European Conference on Computer Vision, Prague, Czech Republic, May 11\u201314, 2004. Proceedings, Part IV 8, pp. 25\u201336. Springer (2004). https:\/\/doi.org\/10.1007\/978-3-540-24673-2_3","DOI":"10.1007\/978-3-540-24673-2_3"},{"key":"3597_CR38","doi-asserted-by":"publisher","unstructured":"Horn, B.K., Schunck, B.G.: Determining optical flow. In: Artificial Intelligence, vol. 17(1\u20133), pp. 185\u2013203 (1981). https:\/\/doi.org\/10.1016\/0004-3702(81)90024-2","DOI":"10.1016\/0004-3702(81)90024-2"},{"key":"3597_CR39","doi-asserted-by":"crossref","unstructured":"Sun, D., Yang, X., Liu, M.Y., Kautz, J.: PWC-Net: CNNs for optical flow using pyramid, warping, and cost volume. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 8934\u20138943 (2018). arXiv:1709.02371","DOI":"10.1109\/CVPR.2018.00931"},{"key":"3597_CR40","doi-asserted-by":"publisher","unstructured":"Teed, Z., Deng, J.: RAFT: recurrent all-pairs field transforms for optical flow. In: Computer Vision\u2014ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part II 16, pp. 402\u2013419. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5","DOI":"10.1007\/978-3-030-58536-5"},{"key":"3597_CR41","doi-asserted-by":"publisher","unstructured":"Huang, Z., Shi, X., Zhang, C., Wang, Q., Cheung, K.C., Qin, H., Dai, J., Li, H.: FlowFormer: a transformer architecture for optical flow. In: European Conference on Computer Vision, pp. 668\u2013685. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_40","DOI":"10.1007\/978-3-031-19790-1_40"},{"key":"3597_CR42","doi-asserted-by":"publisher","unstructured":"Shi, X., Huang, Z., Li, D., Zhang, M., Cheung, K.C., See, S., Qin, H., Dai, J., Li, H.: FlowFormer++: masked cost volume autoencoding for pretraining optical flow estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1599\u20131610 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.00160","DOI":"10.1109\/CVPR52729.2023.00160"},{"key":"3597_CR43","doi-asserted-by":"publisher","first-page":"102536","DOI":"10.1016\/j.displa.2023.102536","volume":"80","author":"B Wei","year":"2023","unstructured":"Wei, B., Wen, Y., Liu, X., Qi, X., Sheng, B.: SOFNet: optical-flow based large-scale slice augmentation of brain MRI. Displays 80, 102536 (2023). https:\/\/doi.org\/10.1016\/j.displa.2023.102536","journal-title":"Displays"},{"issue":"8","key":"3597_CR44","doi-asserted-by":"publisher","first-page":"3105","DOI":"10.1109\/TCSVT.2020.3036467","volume":"31","author":"S You","year":"2020","unstructured":"You, S., Yao, H., Xu, C.: Multi-target multi-camera tracking with optical-based pose association. IEEE Trans. Circuits Syst. Video Technol. 31(8), 3105\u20133117 (2020). https:\/\/doi.org\/10.1109\/TCSVT.2020.3036467","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"12","key":"3597_CR45","doi-asserted-by":"publisher","first-page":"8116","DOI":"10.1109\/TCSVT.2021.3057872","volume":"32","author":"Y Zhou","year":"2021","unstructured":"Zhou, Y., Xu, X., Shen, F., Zhu, X., Shen, H.T.: Flow-edge guided unsupervised video object segmentation. IEEE Trans. Circuits Syst. Video Technol. 32(12), 8116\u20138127 (2021). https:\/\/doi.org\/10.1109\/TCSVT.2021.3057872","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3597_CR46","doi-asserted-by":"publisher","unstructured":"Zhang, X., Boularias, A.: Optical flow boosts unsupervised localization and segmentation. In: 2023 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 7635\u20137642. IEEE (2023). https:\/\/doi.org\/10.1109\/IROS55552.2023.10342195","DOI":"10.1109\/IROS55552.2023.10342195"},{"key":"3597_CR47","doi-asserted-by":"publisher","unstructured":"Oh, S.W., Lee, J.Y., Sunkavalli, K., Kim, S.J.: Fast video object segmentation by reference-guided mask propagation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7376\u20137385 (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00770","DOI":"10.1109\/CVPR.2018.00770"},{"key":"3597_CR48","doi-asserted-by":"publisher","unstructured":"Duarte, K., Rawat, Y.S., Shah, M.: CapsuleVOS: semi-supervised video object segmentation using capsule routing. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8480\u20138489 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00857","DOI":"10.1109\/ICCV.2019.00857"},{"key":"3597_CR49","unstructured":"Locatello, F., Weissenborn, D., Unterthiner, T., Mahendran, A., Heigold, G., Uszkoreit, J., Dosovitskiy, A., Kipf, T.: Object-centric learning with slot attention. In: Advances in Neural Information Processing Systems, vol. 33, pp. 11525\u201311538 (2020). arXiv:2006.15055"},{"key":"3597_CR50","doi-asserted-by":"publisher","first-page":"50","DOI":"10.1109\/TMM.2021.3120873","volume":"25","author":"X Lin","year":"2021","unstructured":"Lin, X., Sun, S., Huang, W., Sheng, B., Li, P., Feng, D.D.: EAPT: efficient attention pyramid transformer for image processing. IEEE Trans. Multimed. 25, 50\u201361 (2021). https:\/\/doi.org\/10.1109\/TMM.2021.3120873","journal-title":"IEEE Trans. Multimed."},{"key":"3597_CR51","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: European Conference on Computer Vision, pp. 213\u2013229. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-031-43148-7_20","DOI":"10.1007\/978-3-031-43148-7_20"},{"key":"3597_CR52","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3284165","author":"M Sun","year":"2023","unstructured":"Sun, M., Xiao, J., Lim, E.G., Zhao, C., Zhao, Y.: Unified multi-modality video object segmentation using reinforcement learning. IEEE Trans. Circuits Syst. Video Technol. (2023). https:\/\/doi.org\/10.1109\/TCSVT.2023.3284165","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"3597_CR53","doi-asserted-by":"publisher","first-page":"3409","DOI":"10.1109\/TCSVT.2022.3233369","volume":"33","author":"Y Tang","year":"2023","unstructured":"Tang, Y., Zhang, L., Yuan, Y., Chen, Z.: Describe fashion products via local sparse self-attention mechanism and attribute-based re-sampling strategy. IEEE Trans. Circuits Syst. Video Technol. 33(7), 3409\u20133424 (2023). https:\/\/doi.org\/10.1109\/TCSVT.2022.3233369","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"3597_CR54","doi-asserted-by":"publisher","unstructured":"Yang, Y., Loquercio, A., Scaramuzza, D., Soatto, S.: Unsupervised moving object detection via contextual information separation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 879\u2013888 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00097","DOI":"10.1109\/CVPR.2019.00097"},{"issue":"6","key":"3597_CR55","doi-asserted-by":"publisher","first-page":"7099","DOI":"10.1109\/TPAMI.2022.3225573","volume":"45","author":"T Zhou","year":"2022","unstructured":"Zhou, T., Porikli, F., Crandall, D.J., Van Gool, L., Wang, W.: A survey on deep learning technique for video segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 7099\u20137122 (2022). https:\/\/doi.org\/10.1109\/TPAMI.2022.3225573","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3597_CR56","unstructured":"Wright, L., Demeure, N.: Ranger21: a synergistic deep learning optimizer. arXiv preprint arXiv:2106.13731 (2021)"},{"key":"3597_CR57","unstructured":"Jabri, A., Owens, A., Efros, A.: Space-time correspondence as a contrastive random walk. In: Advances in Neural Information Processing Systems, vol. 33, pp. 19545\u201319560. arXiv:2006.14613 (2020)"},{"key":"3597_CR58","unstructured":"Kr\u00e4henb\u00fchl, P., Koltun, V.: Efficient inference in fully connected CRFS with gaussian edge potentials. In: Advances in Neural Information Processing Systems, vol. 24 (2011). arXiv:1210.5644"},{"issue":"4","key":"3597_CR59","doi-asserted-by":"publisher","first-page":"4462","DOI":"10.1109\/TPAMI.2022.3198480","volume":"45","author":"E Meunier","year":"2022","unstructured":"Meunier, E., Badoual, A., Bouthemy, P.: EM-driven unsupervised learning for efficient motion segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 45(4), 4462\u20134473 (2022). https:\/\/doi.org\/10.1109\/TPAMI.2022.3198480","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3597_CR60","unstructured":"Meunier, E., Bouthemy, P.: Unsupervised motion segmentation in one go: smooth long-term model over a video. arXiv preprint arXiv:2310.01040 (2023)"},{"key":"3597_CR61","unstructured":"Lao, D., Hu, Z., Locatello, F., Yang, Y., Soatto, S.: Divided attention: unsupervised multi-object discovery with contextually separated slots. arXiv preprint arXiv:2304.01430 (2023)"},{"key":"3597_CR62","doi-asserted-by":"publisher","unstructured":"Sestini, L., Rosa, B., De Momi, E., Ferrigno, G., Padoy, N.: FUN-SIS: a fully unsupervised approach for surgical instrument segmentation. Med. Image Anal. 85, 102751 (2023). https:\/\/doi.org\/10.1016\/j.media.2023.102751","DOI":"10.1016\/j.media.2023.102751"},{"key":"3597_CR63","doi-asserted-by":"publisher","unstructured":"Meunier, E., Bouthemy, P.: Unsupervised space-time network for temporally-consistent segmentation of multiple motions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22139\u201322148 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.02120","DOI":"10.1109\/CVPR52729.2023.02120"},{"issue":"2","key":"3597_CR64","doi-asserted-by":"publisher","first-page":"995","DOI":"10.1109\/TCSVT.2023.3288878","volume":"34","author":"L Xi","year":"2024","unstructured":"Xi, L., Chen, W., Wu, X., Liu, Z., Li, Z.: Online unsupervised video object segmentation via contrastive motion clustering. IEEE Trans. Circuits Syst. Video Technol. 34(2), 995\u20131006 (2024). https:\/\/doi.org\/10.1109\/TCSVT.2023.3288878","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"7","key":"3597_CR65","doi-asserted-by":"publisher","first-page":"6662","DOI":"10.1109\/TCYB.2021.3079311","volume":"52","author":"B Sheng","year":"2021","unstructured":"Sheng, B., Li, P., Ali, R., Chen, C.P.: Improving video temporal consistency via broad learning system. IEEE Trans. Cybern. 52(7), 6662\u20136675 (2021). https:\/\/doi.org\/10.1109\/TCYB.2021.3079311","journal-title":"IEEE Trans. Cybern."},{"key":"3597_CR66","doi-asserted-by":"publisher","unstructured":"Zhang, H., Ali, R., Sheng, B., Li, P., Kim, J., Wang, J.: Preserving temporal consistency in videos through adaptive SLIC. In: Advances in Computer Graphics: 37th Computer Graphics International Conference, CGI 2020, Geneva, Switzerland, October 20\u201323, 2020, Proceedings 37, pp. 405\u2013410. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-61864-3_34","DOI":"10.1007\/978-3-030-61864-3_34"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03597-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-024-03597-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-024-03597-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T09:10:23Z","timestamp":1741597823000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-024-03597-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,9]]},"references-count":66,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,3]]}},"alternative-id":["3597"],"URL":"https:\/\/doi.org\/10.1007\/s00371-024-03597-8","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"value":"0178-2789","type":"print"},{"value":"1432-2315","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,8,9]]},"assertion":[{"value":"27 July 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 August 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}