{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T11:55:29Z","timestamp":1775562929441,"version":"3.50.1"},"reference-count":62,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.imavis.2026.105945","type":"journal-article","created":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T07:57:24Z","timestamp":1772524644000},"page":"105945","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["STSim-Mamb: A spatiotemporal similarity learning framework for unsupervised video object segmentation"],"prefix":"10.1016","volume":"169","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6008-014X","authenticated-orcid":false,"given":"Maojin","family":"Sun","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minghui","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"issue":"1","key":"10.1016\/j.imavis.2026.105945_b1","doi-asserted-by":"crossref","first-page":"457","DOI":"10.1007\/s10462-022-10176-7","article-title":"Deep learning for video object segmentation: a review","volume":"56","author":"Gao","year":"2023","journal-title":"Artif. Intell. Rev."},{"key":"10.1016\/j.imavis.2026.105945_b2","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3151","article-title":"Putting the object back into video object segmentation","author":"Cheng","year":"2024"},{"issue":"11","key":"10.1016\/j.imavis.2026.105945_b3","doi-asserted-by":"crossref","first-page":"13023","DOI":"10.1109\/TITS.2022.3232153","article-title":"Edge intelligence empowered vehicle detection and image segmentation for autonomous vehicles","volume":"24","author":"Chen","year":"2023","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"issue":"7","key":"10.1016\/j.imavis.2026.105945_b4","doi-asserted-by":"crossref","first-page":"6780","DOI":"10.1109\/TITS.2023.3258683","article-title":"Object detection in traffic videos: A survey","volume":"24","author":"Ghahremannezhad","year":"2023","journal-title":"IEEE Trans. Intell. Transp. Syst."},{"key":"10.1016\/j.imavis.2026.105945_b5","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"20224","article-title":"MOSE: A new dataset for video object segmentation in complex scenes","author":"Ding","year":"2023"},{"issue":"3","key":"10.1016\/j.imavis.2026.105945_b6","doi-asserted-by":"crossref","first-page":"4419","DOI":"10.1007\/s11042-022-13413-x","article-title":"A closer look at referring expressions for video object segmentation","volume":"82","author":"Bellver","year":"2023","journal-title":"Multimedia Tools Appl."},{"key":"10.1016\/j.imavis.2026.105945_b7","series-title":"Medical SAM 2: Segment medical images as video via segment anything model 2","author":"Zhu","year":"2024"},{"issue":"11","key":"10.1016\/j.imavis.2026.105945_b8","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0313323","article-title":"Deep learning-based image classification of sea turtles using object detection and instance segmentation models","volume":"19","author":"Baek","year":"2024","journal-title":"PloS One"},{"issue":"3","key":"10.1016\/j.imavis.2026.105945_b9","first-page":"3072","article-title":"Siammask: A framework for fast online object tracking and segmentation","volume":"45","author":"Hu","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105945_b10","first-page":"1","article-title":"A multitask benchmark dataset for satellite video: Object detection, tracking, and segmentation","volume":"61","author":"Li","year":"2023","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"issue":"12","key":"10.1016\/j.imavis.2026.105945_b11","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0315621","article-title":"Multibranch semantic image segmentation model based on edge optimization and category perception","volume":"19","author":"Yang","year":"2024","journal-title":"PloS One"},{"issue":"4","key":"10.1016\/j.imavis.2026.105945_b12","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0319905","article-title":"Multi-scale prototype convolutional network for few-shot semantic segmentation","volume":"20","author":"Xu","year":"2025","journal-title":"PloS One"},{"key":"10.1016\/j.imavis.2026.105945_b13","first-page":"6449","article-title":"Referred by multi-modality: A unified temporal transformer for video object segmentation","volume":"vol. 38","author":"Yan","year":"2024"},{"issue":"1","key":"10.1016\/j.imavis.2026.105945_b14","doi-asserted-by":"crossref","DOI":"10.1016\/j.ipm.2023.103566","article-title":"Fully transformer-equipped architecture for end-to-end referring video object segmentation","volume":"61","author":"Li","year":"2024","journal-title":"Inf. Process. Manage."},{"key":"10.1016\/j.imavis.2026.105945_b15","series-title":"European Conference on Computer Vision","first-page":"596","article-title":"Hierarchical feature alignment network for unsupervised video object segmentation","author":"Pei","year":"2022"},{"issue":"2","key":"10.1016\/j.imavis.2026.105945_b16","doi-asserted-by":"crossref","first-page":"995","DOI":"10.1109\/TCSVT.2023.3288878","article-title":"Online unsupervised video object segmentation via contrastive motion clustering","volume":"34","author":"Xi","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105945_b17","doi-asserted-by":"crossref","unstructured":"W. Wang, X. Lu, J. Shen, D.J. Crandall, L. Shao, Zero-shot video object segmentation via attentive graph neural networks, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 9236\u20139245.","DOI":"10.1109\/ICCV.2019.00933"},{"key":"10.1016\/j.imavis.2026.105945_b18","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.110078","article-title":"Efficient long-short temporal attention network for unsupervised video object segmentation","volume":"146","author":"Li","year":"2024","journal-title":"Pattern Recognit."},{"issue":"5","key":"10.1016\/j.imavis.2026.105945_b19","doi-asserted-by":"crossref","first-page":"3221","DOI":"10.1007\/s00371-024-03597-8","article-title":"Motion perception-driven multimodal self-supervised video object segmentation","volume":"41","author":"Wang","year":"2025","journal-title":"Vis. Comput."},{"issue":"10","key":"10.1016\/j.imavis.2026.105945_b20","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0274522","article-title":"Data augmentation based on multiple oversampling fusion for medical image segmentation","volume":"17","author":"Wu","year":"2022","journal-title":"PloS One"},{"issue":"11","key":"10.1016\/j.imavis.2026.105945_b21","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0277578","article-title":"TC-Net: Dual coding network of transformer and CNN for skin lesion segmentation","volume":"17","author":"Dong","year":"2022","journal-title":"PloS One"},{"key":"10.1016\/j.imavis.2026.105945_b22","doi-asserted-by":"crossref","unstructured":"X. Lu, W. Wang, C. Ma, J. Shen, L. Shao, F. Porikli, See more, know more: Unsupervised video object segmentation with co-attention siamese networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 3623\u20133632.","DOI":"10.1109\/CVPR.2019.00374"},{"key":"10.1016\/j.imavis.2026.105945_b23","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"688","article-title":"Unsupervised video object segmentation with online adversarial self-tuning","author":"Su","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b24","series-title":"European Conference on Computer Vision","first-page":"452","article-title":"Exploring pre-trained text-to-video diffusion models for referring video object segmentation","author":"Zhu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b25","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"6866","article-title":"Aligndet: Aligning pre-training and fine-tuning in object detection","author":"Li","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b26","series-title":"Refsam: Efficiently adapting segmenting anything model for referring video object segmentation","author":"Li","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b27","first-page":"2946","article-title":"Reliable propagation-correction modulation for video object segmentation","volume":"vol. 36","author":"Xu","year":"2022"},{"key":"10.1016\/j.imavis.2026.105945_b28","series-title":"DAGM German Conference on Pattern Recognition","first-page":"230","article-title":"Improving unsupervised label propagation for pose tracking and video object segmentation","author":"Waldmann","year":"2022"},{"key":"10.1016\/j.imavis.2026.105945_b29","doi-asserted-by":"crossref","first-page":"148","DOI":"10.1016\/j.neucom.2023.01.044","article-title":"Quality-aware pattern diffusion for video object segmentation","volume":"528","author":"Zhou","year":"2023","journal-title":"Neurocomputing"},{"key":"10.1016\/j.imavis.2026.105945_b30","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2024.3445717","article-title":"Adaptive multi-scale iterative optimized video object segmentation based on correlation enhancement","author":"Li","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"5","key":"10.1016\/j.imavis.2026.105945_b31","doi-asserted-by":"crossref","first-page":"3221","DOI":"10.1109\/TPAMI.2025.3532306","article-title":"Learning high-quality dynamic memory for video object segmentation","volume":"47","author":"Liu","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"4","key":"10.1016\/j.imavis.2026.105945_b32","first-page":"2228","article-title":"Zero-shot video object segmentation with co-attention siamese networks","volume":"44","author":"Lu","year":"2020","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2","key":"10.1016\/j.imavis.2026.105945_b33","doi-asserted-by":"crossref","first-page":"2595","DOI":"10.1109\/TPAMI.2022.3163375","article-title":"Video object segmentation using kernelized memory network with multiple kernels","volume":"45","author":"Seong","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105945_b34","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.126244","article-title":"Treasure in the background: Improve saliency object detection by self-supervised contrast learning","volume":"267","author":"Dong","year":"2025","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.imavis.2026.105945_b35","doi-asserted-by":"crossref","unstructured":"K. Najafian, F. Maleki, L. Jin, I. Stavness, A Semi-Self-Supervised Approach for Dense-Pattern Video Object Segmentation, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 5412\u20135421.","DOI":"10.1109\/CVPRW67362.2025.00538"},{"key":"10.1016\/j.imavis.2026.105945_b36","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"22836","article-title":"Breaking the \u201dobject\u201d in video object segmentation","author":"Tokmakov","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b37","doi-asserted-by":"crossref","unstructured":"A. Baade, C. Chen, Self-Supervised Cross-View Correspondence with Predictive Cycle Consistency, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 16753\u201316763.","DOI":"10.1109\/CVPR52734.2025.01561"},{"key":"10.1016\/j.imavis.2026.105945_b38","series-title":"Sv4d 2.0: Enhancing spatio-temporal consistency in multi-view video diffusion for high-quality 4d generation","author":"Yao","year":"2025"},{"key":"10.1016\/j.imavis.2026.105945_b39","article-title":"Multi-scale spatio-temporal memory network for semi-supervised video object segmentation","author":"Wang","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.imavis.2026.105945_b40","series-title":"LSVOS 2025 challenge report: Recent advances in complex video object segmentation","author":"Liu","year":"2025"},{"key":"10.1016\/j.imavis.2026.105945_b41","series-title":"Video mamba suite: State space model as a versatile alternative for video understanding","author":"Chen","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b42","series-title":"State space model for new-generation network alternative to transformers: A survey","author":"Wang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b43","series-title":"European Conference on Computer Vision","first-page":"75","article-title":"Mamba-ND: Selective state space modeling for multi-dimensional data","author":"Li","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b44","series-title":"Spatial-mamba: Effective visual state space models via structure-aware state fusion","author":"Xiao","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b45","series-title":"GroupMamba: Parameter-efficient and accurate group visual state space model","author":"Shaker","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b46","series-title":"Vision mamba: A comprehensive survey and taxonomy","author":"Liu","year":"2024"},{"issue":"10","key":"10.1016\/j.imavis.2026.105945_b47","article-title":"Enhancing bridge damage detection with mamba-enhanced hrnet for semantic segmentation","volume":"19","author":"Liu","year":"2024","journal-title":"PloS One"},{"issue":"5","key":"10.1016\/j.imavis.2026.105945_b48","doi-asserted-by":"crossref","DOI":"10.1371\/journal.pone.0321559","article-title":"Segmentation-based deep 2D-3D multibranch learning approach for effective hyperspectral image classification","volume":"20","author":"Ahmed","year":"2025","journal-title":"PloS One"},{"key":"10.1016\/j.imavis.2026.105945_b49","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing","first-page":"1","article-title":"EPI-mamba: State space model for semantic segmentation from light fields","author":"Li","year":"2025"},{"key":"10.1016\/j.imavis.2026.105945_b50","series-title":"Visual mamba: A survey and new outlooks","author":"Xu","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b51","article-title":"GLVMamba: A global-local visual state space model for remote sensing image segmentation","author":"Li","year":"2025","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.imavis.2026.105945_b52","series-title":"Mamba in vision: A comprehensive survey of techniques and applications","author":"Rahman","year":"2024"},{"key":"10.1016\/j.imavis.2026.105945_b53","series-title":"The 2017 davis challenge on video object segmentation","author":"Pont-Tuset","year":"2017"},{"key":"10.1016\/j.imavis.2026.105945_b54","series-title":"Youtube-vos: A large-scale video object segmentation benchmark","author":"Xu","year":"2018"},{"key":"10.1016\/j.imavis.2026.105945_b55","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"22139","article-title":"Unsupervised space-time network for temporally-consistent segmentation of multiple motions","author":"Meunier","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b56","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2246","article-title":"Boosting video object segmentation via space-time correspondence learning","author":"Zhang","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b57","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9481","article-title":"FeelVOS: Fast end-to-end embedding learning for video object segmentation","author":"Voigtlaender","year":"2019"},{"key":"10.1016\/j.imavis.2026.105945_b58","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"18706","article-title":"Unified mask embedding and correspondence learning for self-supervised video segmentation","author":"Li","year":"2023"},{"key":"10.1016\/j.imavis.2026.105945_b59","doi-asserted-by":"crossref","unstructured":"X. Wang, I. Misra, Z. Zeng, R. Girdhar, T. Darrell, Videocutler: Surprisingly simple unsupervised video instance segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 22755\u201322764.","DOI":"10.1109\/CVPR52733.2024.02147"},{"key":"10.1016\/j.imavis.2026.105945_b60","article-title":"Dvis++: Improved decoupled framework for universal video segmentation","author":"Zhang","year":"2025","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.105945_b61","unstructured":"S. Ding, R. Qian, X. Dong, P. Zhang, Y. Zang, Y. Cao, Y. Guo, D. Lin, J. Wang, Sam2long: Enhancing sam 2 for long video segmentation with a training-free memory tree, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 13614\u201313624."},{"key":"10.1016\/j.imavis.2026.105945_b62","doi-asserted-by":"crossref","unstructured":"L. Lin, X. Yu, Z. Pang, Y.-X. Wang, Glus: Global-local reasoning unified into a single large language model for video segmentation, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 8658\u20138667.","DOI":"10.1109\/CVPR52734.2025.00809"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S026288562600051X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S026288562600051X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T11:09:13Z","timestamp":1775560153000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S026288562600051X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":62,"alternative-id":["S026288562600051X"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105945","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"STSim-Mamb: A spatiotemporal similarity learning framework for unsupervised video object segmentation","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105945","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"105945"}}