{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T18:56:50Z","timestamp":1777489010404,"version":"3.51.4"},"reference-count":97,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T00:00:00Z","timestamp":1741564800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T00:00:00Z","timestamp":1741564800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s13735-025-00361-z","type":"journal-article","created":{"date-parts":[[2025,3,10]],"date-time":"2025-03-10T10:59:25Z","timestamp":1741604365000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["DMFNet: geometric multi-scale pixel-level contrastive learning for video salient object detection"],"prefix":"10.1007","volume":"14","author":[{"given":"Hemraj","family":"Singh","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mridula","family":"Verma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ramalingaswamy","family":"Cheruku","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,3,10]]},"reference":[{"issue":"5","key":"361_CR1","doi-asserted-by":"publisher","first-page":"13","DOI":"10.5815\/ijigsp.2020.05.02","volume":"12","author":"K Acharya","year":"2020","unstructured":"Acharya K, Ghoshal D (2020) Contrast enhancement of images through skewness and mode based bi-histogram equalization. Int J Image Graphics Signal Process 12(5):13\u201327","journal-title":"Int J Image Graphics Signal Process"},{"key":"361_CR2","doi-asserted-by":"crossref","unstructured":"Ahn WJ, Yang GY, Choi HD, Lim MT (2024) Style blind domain generalized semantic segmentation via covariance alignment and semantic consistence contrastive learning. arXiv preprint arXiv:2403.06122","DOI":"10.1109\/CVPR52733.2024.00347"},{"key":"361_CR3","doi-asserted-by":"crossref","unstructured":"Alonso I, Sabater A, Ferstl D, Montesano L, Murillo AC (2021) Semi-supervised semantic segmentation with pixel-level contrastive learning from a class-wise memory bank. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 8219\u20138228","DOI":"10.1109\/ICCV48922.2021.00811"},{"key":"361_CR4","unstructured":"Bardes A, Ponce J, LeCun Y (2021) Vicreg: Variance-invariance-covariance regularization for self-supervised learning. arXiv preprint arXiv:2105.04906"},{"issue":"6","key":"361_CR5","doi-asserted-by":"publisher","first-page":"3450","DOI":"10.1007\/s10489-020-01961-4","volume":"51","author":"HB Bi","year":"2021","unstructured":"Bi HB, Lu D, Zhu HH, Yang LN, Guan HP (2021) STA-Net: spatial-temporal attention network for video salient object detection. Appl Intell 51(6):3450\u20133459","journal-title":"Appl Intell"},{"key":"361_CR6","doi-asserted-by":"publisher","first-page":"3995","DOI":"10.1109\/TIP.2021.3068644","volume":"30","author":"C Chen","year":"2021","unstructured":"Chen C, Wang G, Peng C, Fang Y, Zhang D, Qin H (2021) Exploring rich and efficient spatial temporal interactions for real-time video salient object detection. IEEE Transact Image Process 30:3995\u20134007","journal-title":"IEEE Transact Image Process"},{"key":"361_CR7","first-page":"1","volume":"72","author":"H Chen","year":"2023","unstructured":"Chen H, Du Y, Fu Y, Zhu J, Zeng H (2023) DCAM-Net: a rapid detection network for strip steel surface defects based on deformable convolution and attention mechanism. IEEE Transact Instrum Meas 72:1\u201312","journal-title":"IEEE Transact Instrum Meas"},{"key":"361_CR8","doi-asserted-by":"crossref","unstructured":"Chen YW, Jin X, Shen X, Yang MH (2022) Video salient object detection via contrastive features and attention modules. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 1320\u20131329","DOI":"10.1109\/WACV51458.2022.00061"},{"issue":"11","key":"361_CR9","doi-asserted-by":"publisher","first-page":"8006","DOI":"10.1109\/TPAMI.2021.3107956","volume":"44","author":"MM Cheng","year":"2021","unstructured":"Cheng MM, Gao SH, Borji A, Tan YQ, Lin Z, Wang M (2021) A highly efficient model to study the semantics of salient object detection. IEEE Transact Pattern Anal Mach Intell 44(11):8006\u20138021","journal-title":"IEEE Transact Pattern Anal Mach Intell"},{"key":"361_CR10","doi-asserted-by":"crossref","unstructured":"Cho S, Lee M, Lee S, Park C, Kim D, Lee S (2023) Treating motion as option to reduce motion dependency in unsupervised video object segmentation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 5140\u20135149","DOI":"10.1109\/WACV56688.2023.00511"},{"key":"361_CR11","doi-asserted-by":"crossref","unstructured":"Cong R, Song W, Lei J, Yue G, Zhao Y, Kwong S (2022) Psnet: Parallel symmetric network for video salient object detection. IEEE Transact Emerg Top Comput Intell, vol 5","DOI":"10.1109\/TETCI.2022.3220250"},{"key":"361_CR12","doi-asserted-by":"crossref","unstructured":"Dai J, Qi H, Xiong Y, Li Y, Zhang G, Hu H, Wei Y (2017) Deformable convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 764\u2013773","DOI":"10.1109\/ICCV.2017.89"},{"issue":"2","key":"361_CR13","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0263729","volume":"17","author":"Y Dai","year":"2022","unstructured":"Dai Y, Xue C, Zhou L (2022) Visual saliency guided perceptual adaptive quantization based on HEVC intra-coding for planetary images. Plos one 17(2):e0263729","journal-title":"Plos one"},{"key":"361_CR14","doi-asserted-by":"crossref","unstructured":"Deng J, Dong S, Chen L, Hu J, Zhuo C (2024) Stdf: Spatio-temporal deformable fusion for video quality enhancement on embedded platforms. ACM Transactions on Embedded Computing Systems","DOI":"10.1145\/3645113"},{"key":"361_CR15","doi-asserted-by":"crossref","unstructured":"Fan DP, Wang W, Cheng MM, Shen J (2019) Shifting more attention to video salient object detection. In: IEEE CVPR, IEEE, Long Beach, CA, vol\u00a032","DOI":"10.1109\/CVPR.2019.00875"},{"key":"361_CR16","doi-asserted-by":"crossref","unstructured":"Fan DP, Wang W, Cheng MM, Shen J (2019) Shifting more attention to video salient object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, IEEE, Hawaii, pp 8554\u20138564","DOI":"10.1109\/CVPR.2019.00875"},{"key":"361_CR17","doi-asserted-by":"crossref","unstructured":"Fan J, Su T, Zhang K, Liu Q (2022) Bidirectionally learning dense spatio-temporal feature propagation network for unsupervised video object segmentation. In: Proceedings of the 30th ACM international conference on multimedia, pp 3646\u20133655","DOI":"10.1145\/3503161.3548039"},{"issue":"7","key":"361_CR18","doi-asserted-by":"publisher","first-page":"1531","DOI":"10.1109\/TMM.2017.2679898","volume":"19","author":"K Fu","year":"2017","unstructured":"Fu K, Gu IYH, Yang J (2017) Saliency detection by fully learning a continuous conditional random field. IEEE Transact Multimed 19(7):1531\u20131544","journal-title":"IEEE Transact Multimed"},{"key":"361_CR19","first-page":"5617712","volume":"60","author":"Z GongyangLi","year":"2022","unstructured":"GongyangLi Z, Bai Z, Lin W, Ling H (2022) Lightweight salient object detection in optical remote sensing images via feature correlation. IEEE Trans Geosci Remote Sens 60:5617712","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"361_CR20","first-page":"21271","volume":"33","author":"JB Grill","year":"2020","unstructured":"Grill JB, Strub F, Altch\u00e9 F, Tallec C, Richemond P, Buchatskaya E, Doersch C, Avila Pires B, Guo Z, Gheshlaghi Azar M et al (2020) Bootstrap your own latent-a new approach to self-supervised learning. Adv Neural Inf Process Syst 33:21271\u201321284","journal-title":"Adv Neural Inf Process Syst"},{"key":"361_CR21","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"361_CR22","doi-asserted-by":"crossref","unstructured":"Heo Y, Jun\u00a0Koh Y, Kim CS (2020) Interactive video object segmentation using global and local transfer modules. In: Computer vision\u2013ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVII 16, Springer, vol\u00a016, pp 297\u2013313","DOI":"10.1007\/978-3-030-58520-4_18"},{"key":"361_CR23","doi-asserted-by":"crossref","unstructured":"Hu C, Zhu L (2022) Efficient unsupervised video object segmentation network based on motion guidance. arXiv preprint arXiv:2211.05364 10","DOI":"10.1109\/ISCTech58360.2022.00051"},{"key":"361_CR24","doi-asserted-by":"crossref","unstructured":"Hu F, Palazzo S, Salanitri FP, Bellitto G, Moradi M, Spampinato C, McGuinness K (2023) Tinyhd: Efficient video saliency prediction with heterogeneous decoders using hierarchical maps distillation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 2051\u20132060","DOI":"10.1109\/WACV56688.2023.00209"},{"key":"361_CR25","doi-asserted-by":"crossref","unstructured":"Huang K, Xu Z (2023) Lightweight video salient object detection via channel-shuffle enhanced multi-modal fusion network. Multimedia Tools and Applications pp 1\u201315","DOI":"10.1007\/s11042-023-15251-x"},{"issue":"1","key":"361_CR26","doi-asserted-by":"publisher","first-page":"1025","DOI":"10.1007\/s11042-023-15251-x","volume":"83","author":"K Huang","year":"2024","unstructured":"Huang K, Xu Z (2024) Lightweight video salient object detection via channel-shuffle enhanced multi-modal fusion network. Multimed Tools Appl 83(1):1025\u20131039","journal-title":"Multimed Tools Appl"},{"key":"361_CR27","unstructured":"Huang T, Huang L, You S, Wang F, Qian C, Xu C (2022) Lightvit: Towards light-weight convolution-free vision transformers. arXiv preprint arXiv:2207.05557"},{"key":"361_CR28","unstructured":"Huang Z, Wang N (2017) Like what you like: Knowledge distill via neuron selectivity transfer. arXiv preprint arXiv:1707.01219"},{"issue":"4","key":"361_CR29","doi-asserted-by":"publisher","first-page":"2592","DOI":"10.1109\/TII.2019.2937905","volume":"16","author":"T Hussain","year":"2019","unstructured":"Hussain T, Muhammad K, Del Ser J, Baik SW, de Albuquerque VHC (2019) Intelligent embedded vision for summarization of multiview videos in IIoT. IEEE Transact Ind Inf 16(4):2592\u20132602","journal-title":"IEEE Transact Ind Inf"},{"key":"361_CR30","unstructured":"Iandola FN, Han S, Moskewicz MW, Ashraf K, Dally WJ, Keutzer K (2016) Squeezenet: Alexnet-level accuracy with 50x fewer parameters and$$<$$ 0.5 mb model size. arXiv preprint arXiv:1602.07360 34"},{"key":"361_CR31","doi-asserted-by":"crossref","unstructured":"Ji GP, Fu K, Wu Z, Fan DP, Shen J, Shao L (2021) Full-duplex strategy for video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, IEEE, ICCV, China 30:4922\u20134933","DOI":"10.1109\/ICCV48922.2021.00488"},{"issue":"6","key":"361_CR32","doi-asserted-by":"publisher","first-page":"2676","DOI":"10.1109\/TNNLS.2020.3007534","volume":"32","author":"Y Ji","year":"2020","unstructured":"Ji Y, Zhang H, Jie Z, Ma L, Wu QJ (2020) Casnet: a cross-attention siamese network for video salient object detection. IEEE Transact Neural Netw Learn Syst 32(6):2676\u20132690","journal-title":"IEEE Transact Neural Netw Learn Syst"},{"key":"361_CR33","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104143","volume":"109","author":"F Jia","year":"2021","unstructured":"Jia F, Wang X, Guan J, Li H, Qiu C, Qi S (2021) Wrgpruner: a new model pruning solution for tiny salient object detection. Image Vis Comput 109:104143","journal-title":"Image Vis Comput"},{"key":"361_CR34","doi-asserted-by":"crossref","unstructured":"Khan A, Kuribayashi M, Wong K, Monn\u00a0Baskaran V (2023) Hdr image watermarking using saliency detection and quantization index modulation. arXiv e-prints pp arXiv\u20132302","DOI":"10.1109\/APSIPAASC58517.2023.10317464"},{"issue":"4","key":"361_CR35","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/s13735-024-00346-4","volume":"13","author":"SC Kumain","year":"2024","unstructured":"Kumain SC, Singh M, Awasthi LK (2024) Dbtsf-vsod: a decision-based two-stage framework for video salient object detection. Int J Multimed Inf Retr 13(4):38","journal-title":"Int J Multimed Inf Retr"},{"key":"361_CR36","doi-asserted-by":"crossref","unstructured":"Lee M, Cho S, Lee S, Park C, Lee S (2023) Unsupervised video object segmentation via prototype memory network. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 5924\u20135934","DOI":"10.1109\/WACV56688.2023.00587"},{"key":"361_CR37","doi-asserted-by":"crossref","unstructured":"Li F, Kim T, Humayun A, Tsai D, Rehg JM (2013) Video segmentation by tracking many figure-ground segments. In: Proceedings of the IEEE International Conference on Computer Vision, IEEE, Portland, OR, USA 26:2192\u20132199","DOI":"10.1109\/ICCV.2013.273"},{"key":"361_CR38","doi-asserted-by":"crossref","unstructured":"Li F, Wu B, Yi K, Zhao Z (2016) Wander join: Online aggregation via random walks. In: Proceedings of the 2016 international conference on management of data, ACM, pp 615\u2013629","DOI":"10.1145\/2882903.2915235"},{"key":"361_CR39","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.110078","volume":"146","author":"P Li","year":"2024","unstructured":"Li P, Zhang Y, Yuan L, Xiao H, Lin B, Xu X (2024) Efficient long-short temporal attention network for unsupervised video object segmentation. Pattern Recognit 146:110078","journal-title":"Pattern Recognit"},{"key":"361_CR40","doi-asserted-by":"crossref","unstructured":"Lin L, Zheng Y, Chen W, Lan C, Zhao T (2023) Saliency-aware spatio-temporal artifact detection for compressed video quality assessment. arXiv preprint arXiv:2301.01069","DOI":"10.1109\/LSP.2023.3283541"},{"key":"361_CR41","doi-asserted-by":"crossref","unstructured":"Liu L, Prost J, Zhu L, Papadakis N, Li\u00f2 P, Sch\u00f6nlieb CB, Aviles-Rivero AI (2023) Scotch and soda: A transformer video shadow detection framework. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 10449\u201310458","DOI":"10.1109\/CVPR52729.2023.01007"},{"key":"361_CR42","doi-asserted-by":"crossref","unstructured":"Liu N, Nan K, Zhao W, Yao X, Han J (2023) Learning complementary spatial\u2013temporal transformer for video salient object detection. IEEE Transactions on Neural Networks and Learning Systems","DOI":"10.1109\/TNNLS.2023.3243246"},{"key":"361_CR43","doi-asserted-by":"crossref","unstructured":"Liu S, Huang D, et\u00a0al. (2018) Receptive field block net for accurate and fast object detection. In: Proceedings of the European conference on computer vision (ECCV), pp 385\u2013400","DOI":"10.1007\/978-3-030-01252-6_24"},{"key":"361_CR44","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2024.106144","volume":"173","author":"X Liu","year":"2024","unstructured":"Liu X, Wang L (2024) Msrmnet: multi-scale skip residual and multi-mixed features network for salient object detection. Neural Netw 173:106144","journal-title":"Neural Netw"},{"issue":"9","key":"361_CR45","doi-asserted-by":"publisher","first-page":"4439","DOI":"10.1109\/TCYB.2020.3035613","volume":"51","author":"Y Liu","year":"2020","unstructured":"Liu Y, Gu YC, Zhang XY, Wang W, Cheng MM (2020) Lightweight salient object detection via hierarchical visual perception learning. IEEE Transact Cybern 51(9):4439\u20134449","journal-title":"IEEE Transact Cybern"},{"key":"361_CR46","doi-asserted-by":"crossref","unstructured":"Mannor S, Peleg D, Rubinstein R (2005) The cross entropy method for classification. In: Proceedings of the 22nd international conference on Machine learning, pp 561\u2013568","DOI":"10.1145\/1102351.1102422"},{"key":"361_CR47","doi-asserted-by":"crossref","unstructured":"Miech A, Alayrac JB, Smaira L, Laptev I, Sivic J, Zisserman A (2020) End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9879\u20139889","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"361_CR48","doi-asserted-by":"crossref","unstructured":"Pang Z, Nakashima Y, Otani M, Nagahara H (2024) Revisiting pixel-level contrastive pre-training on scene images. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 1784\u20131793","DOI":"10.1109\/WACV57701.2024.00180"},{"key":"361_CR49","first-page":"8026","volume":"32","author":"A Paszke","year":"2019","unstructured":"Paszke A, Gross S, Massa F, Lerer A, Bradbury J, Chanan G, Killeen T, Lin Z, Gimelshein N, Antiga L et al (2019) Pytorch: an imperative style, high-performance deep learning library. Adv Neural Inf Process Syst 32:8026\u20138037","journal-title":"Adv Neural Inf Process Syst"},{"key":"361_CR50","doi-asserted-by":"crossref","unstructured":"Pei G, Yao Y, Xie GS, Shen F, Tang Z, Tang J (2022) Hierarchical feature alignment network for unsupervised video object segmentation. arXiv preprint arXiv:2207.08485 12(3-4):223\u2013240","DOI":"10.1007\/978-3-031-19830-4_34"},{"key":"361_CR51","doi-asserted-by":"crossref","unstructured":"Pei G, Yao Y, Shen F, Huang D, Huang X, Shen HT (2023) Hierarchical co-attention propagation network for zero-shot video object segmentation. IEEE Transactions on Image Processing","DOI":"10.1109\/TIP.2023.3267244"},{"key":"361_CR52","doi-asserted-by":"publisher","first-page":"410","DOI":"10.1016\/j.neunet.2023.12.031","volume":"171","author":"D Peng","year":"2024","unstructured":"Peng D, Zhou W, Pan J, Wang D (2024) Msednet: multi-scale fusion and edge-supervised network for rgb-t salient object detection. Neural Netw 171:410\u2013422","journal-title":"Neural Netw"},{"key":"361_CR53","first-page":"5614","volume":"35","author":"Y Piao","year":"2022","unstructured":"Piao Y, Lu C, Zhang M, Lu H (2022) Semi-supervised video salient object detection based on uncertainty-guided pseudo labels. Adv Neural Inf Process Syst 35:5614\u20135627","journal-title":"Adv Neural Inf Process Syst"},{"key":"361_CR54","doi-asserted-by":"crossref","unstructured":"Ponimatkin G, Samet N, Xiao Y, Du Y, Marlet R, Lepetit V (2023) A simple and powerful global optimization for unsupervised video object segmentation. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 5892\u20135903","DOI":"10.1109\/WACV56688.2023.00584"},{"issue":"5","key":"361_CR55","doi-asserted-by":"publisher","first-page":"1192","DOI":"10.1109\/JAS.2023.123456","volume":"10","author":"Z Qin","year":"2023","unstructured":"Qin Z, Lu X, Nie X, Liu D, Yin Y, Wang W (2023) Coarse-to-fine video instance segmentation with factorized conditional appearance flows. IEEE\/CAA J Automatica Sin 10(5):1192\u20131208","journal-title":"IEEE\/CAA J Automatica Sin"},{"key":"361_CR56","doi-asserted-by":"crossref","unstructured":"Ren S, Han C, Yang X, Han G, He S (2020) Tenet: Triple excitation network for video salient object detection. European conference on computer vision, Springer, China 16:212\u2013228","DOI":"10.1007\/978-3-030-58558-7_13"},{"key":"361_CR57","doi-asserted-by":"crossref","unstructured":"Sandler M, Howard A, Zhu M, Zhmoginov A, Chen LC (2018) Mobilenetv 2: Inverted residuals and linear bottlenecks. Proceedings of the IEEE conference on computer vision and pattern recognition, IEEE, CVPR, Salt Lake City 31:4510\u20134520","DOI":"10.1109\/CVPR.2018.00474"},{"key":"361_CR58","unstructured":"Simonyan K, Zisserman A (2014) Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556"},{"key":"361_CR59","doi-asserted-by":"crossref","unstructured":"Singh H, Verma M, Cheruku R (2022) Vs-net: Multiscale spatiotemporal features for lightweight video salient document detection. In: 2022 IEEE 34th International conference on tools with artificial intelligence (ICTAI), IEEE, pp 1307\u20131311","DOI":"10.1109\/ICTAI56018.2022.00198"},{"key":"361_CR60","first-page":"1286","volume":"2023","author":"H Singh","year":"2023","unstructured":"Singh H, Verma M, Cheruku R (2023) Dsnet: efficient lightweight model for video salient object detection for iot and wot applications. Companion Proc ACM Web Conf 2023:1286\u20131295","journal-title":"Companion Proc ACM Web Conf"},{"key":"361_CR61","doi-asserted-by":"crossref","unstructured":"Singh H, Verma M, Cheruku R (2023) Novel dilated separable convolution networks for efficient video salient object detection in the wild. IEEE Transactions on Instrumentation and Measurement.","DOI":"10.1109\/TIM.2023.3302911"},{"key":"361_CR62","doi-asserted-by":"crossref","unstructured":"Singh H, Verma M, Cheruku R (2024) Dsfnet: Video salient object detection using a novel lightweight deformable separable fusion network. IEEE Transactions on Instrumentation and Measurement","DOI":"10.1109\/TIM.2024.3470045"},{"key":"361_CR63","unstructured":"Su Y, Deng J, Sun R, Lin G, Wu Q (2022) A unified transformer framework for group-based segmentation: Co-segmentation, co-saliency detection and video salient object detection. arXiv preprint arXiv:2203.04708 14"},{"key":"361_CR64","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109399","volume":"138","author":"J Sun","year":"2023","unstructured":"Sun J, Mao Y, Dai Y, Zhong Y, Wang J (2023) Munet: motion uncertainty-aware semi-supervised video object segmentation. Pattern Recognit 138:109399","journal-title":"Pattern Recognit"},{"key":"361_CR65","doi-asserted-by":"crossref","unstructured":"Tang Y, Zou W, Jin Z, Li X (2018) Multi-scale spatiotemporal conv-lstm network for video saliency detection. In: Proceedings of the 2018 ACM on international conference on multimedia retrieval, pp 362\u2013369","DOI":"10.1145\/3206025.3206052"},{"key":"361_CR66","doi-asserted-by":"crossref","unstructured":"Tang Y, Li Y, Zou W (2020) Fast video salient object detection via spatiotemporal knowledge distillation. arXiv preprint arXiv:2010.10027","DOI":"10.1016\/j.neucom.2019.09.064"},{"key":"361_CR67","doi-asserted-by":"crossref","unstructured":"Tokmakov P, Alahari K, Schmid C (2017) Learning video object segmentation with visual memory. Proceedings of the IEEE International Conference on Computer Vision, IEEE, Hawaii 30:4481\u20134490","DOI":"10.1109\/ICCV.2017.480"},{"key":"361_CR68","doi-asserted-by":"crossref","unstructured":"Tran VN, Liu SH, Huang CE, Aslam MS, Yang KL, Li YH, Wang JC (2024) Hapiclr: heuristic attention pixel-level contrastive loss representation learning for self-supervised pretraining. The Visual Computer pp 1\u201316","DOI":"10.1007\/s00371-023-03217-x"},{"key":"361_CR69","doi-asserted-by":"crossref","unstructured":"Tsiami A, Koutras P, Maragos P (2020) Stavis: Spatio-temporal audiovisual saliency network. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 4766\u20134776","DOI":"10.1109\/CVPR42600.2020.00482"},{"key":"361_CR70","doi-asserted-by":"crossref","unstructured":"Tu Y, Li L, Su L, Zha ZJ, Yan C, Huang Q (2023) Self-supervised cross-view representation reconstruction for change captioning. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 2805\u20132815","DOI":"10.1109\/ICCV51070.2023.00263"},{"key":"361_CR71","doi-asserted-by":"publisher","first-page":"4926","DOI":"10.1109\/TPAMI.2024.3365104","volume":"46","author":"Y Tu","year":"2024","unstructured":"Tu Y, Li L, Su L, Zha ZJ, Huang Q (2024) Smart: syntax-calibrated multi-aspect relation transformer for change captioning. IEEE Transact Pattern Anal Mach Intell 46:4926","journal-title":"IEEE Transact Pattern Anal Mach Intell"},{"key":"361_CR72","doi-asserted-by":"crossref","unstructured":"Tu Y, Li L, Su L, Zha ZJ, Yan C, Huang Q (2024) Context-aware difference distilling for multi-change captioning. arXiv preprint arXiv:2405.20810","DOI":"10.18653\/v1\/2024.acl-long.430"},{"key":"361_CR73","doi-asserted-by":"crossref","unstructured":"Tu Y, Li L, Su L, Yan C, Huang Q (2025) Distractors-immune representation learning with cross-modal contrastive regularization for change captioning. In: European conference on computer vision, Springer, pp 311\u2013328","DOI":"10.1007\/978-3-031-72775-7_18"},{"key":"361_CR74","doi-asserted-by":"crossref","unstructured":"Van\u00a0Gansbeke W, Vandenhende S, Georgoulis S, Van\u00a0Gool L (2021) Unsupervised semantic segmentation by contrasting object mask proposals. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10052\u201310062","DOI":"10.1109\/ICCV48922.2021.00990"},{"key":"361_CR75","doi-asserted-by":"crossref","unstructured":"Wang J, Chen D, Wu Z, Luo C, Tang C, Dai X, Zhao Y, Xie Y, Yuan L, Jiang YG (2023) Look before you match: Instance understanding matters in video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2268\u20132278","DOI":"10.1109\/CVPR52729.2023.00225"},{"issue":"17","key":"361_CR76","doi-asserted-by":"publisher","first-page":"53139","DOI":"10.1007\/s11042-023-17614-w","volume":"83","author":"J Wang","year":"2024","unstructured":"Wang J, Huang Z, Huang Z, Zhang M, Ren X (2024) Dsfnet: dynamic selection-fusion networks for video salient object detection. Multimed Tools Appl 83(17):53139\u201353164","journal-title":"Multimed Tools Appl"},{"key":"361_CR77","unstructured":"Wang RJ, Li X, Ling CX (2018) Pelee: A real-time object detection system on mobile devices. Advances in neural information processing systems 31"},{"key":"361_CR78","doi-asserted-by":"crossref","unstructured":"Wang W, Zhou T, Yu F, Dai J, Konukoglu E, Van\u00a0Gool L (2021) Exploring cross-image pixel contrast for semantic segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 7303\u20137313","DOI":"10.1109\/ICCV48922.2021.00721"},{"key":"361_CR79","doi-asserted-by":"crossref","unstructured":"Wang W, Dai J, Chen Z, Huang Z, Li Z, Zhu X, Hu X, Lu T, Lu L, Li H, et\u00a0al. (2023) Internimage: Exploring large-scale vision foundation models with deformable convolutions. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 14408\u201314419","DOI":"10.1109\/CVPR52729.2023.01385"},{"key":"361_CR80","doi-asserted-by":"crossref","unstructured":"Wang X, Zhang R, Shen C, Kong T, Li L (2021) Dense contrastive learning for self-supervised visual pre-training. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3024\u20133033","DOI":"10.1109\/CVPR46437.2021.00304"},{"key":"361_CR81","doi-asserted-by":"crossref","unstructured":"Wang Z, Zhong Y, Miao Y, Ma L, Specia L (2022) Contrastive video-language learning with fine-grained frame sampling. arXiv preprint arXiv:2210.05039 2","DOI":"10.18653\/v1\/2022.aacl-main.53"},{"key":"361_CR82","doi-asserted-by":"publisher","first-page":"1011","DOI":"10.1109\/TMM.2023.3275308","volume":"26","author":"J Wu","year":"2024","unstructured":"Wu J, Hao F, Liang W, Xu J (2024) Transformer fusion and pixel-level contrastive learning for rgb-d salient object detection. IEEE Transact Multimed 26:1011\u20131026. https:\/\/doi.org\/10.1109\/TMM.2023.3275308","journal-title":"IEEE Transact Multimed"},{"key":"361_CR83","doi-asserted-by":"crossref","unstructured":"Xie S, Girshick R, Doll\u00e1r P, Tu Z, He K (2017) Aggregated residual transformations for deep neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1492\u20131500","DOI":"10.1109\/CVPR.2017.634"},{"key":"361_CR84","unstructured":"Xu B, Liang H, Ni W, Gong W, Liang R, Chen P (2022) Learning video salient object detection progressively from unlabeled videos. arXiv preprint arXiv:2204.02008"},{"key":"361_CR85","doi-asserted-by":"crossref","unstructured":"Xu B, Jiang Q, Zhao X, Lu C, Liang H, Liang R (2024) Multidimensional exploration of segment anything model for weakly supervised video salient object detection. IEEE Transactions on circuits and systems for video technology","DOI":"10.1109\/TCSVT.2024.3368053"},{"key":"361_CR86","doi-asserted-by":"crossref","unstructured":"Xu M, Fu P, Liu B, Li J (2021) Multi-stream attention-aware graph convolution network for video salient object detection. IEEE Transact Image Process 30:4183\u20134197","DOI":"10.1109\/TIP.2021.3070200"},{"key":"361_CR87","doi-asserted-by":"crossref","unstructured":"Xu Y, Song D, Hoogs A (2014) An efficient online hierarchical supervoxel segmentation algorithm for time-critical applications. In: BMVC, Citeseer, San Francisco, CA, vol\u00a023","DOI":"10.5244\/C.28.130"},{"key":"361_CR88","doi-asserted-by":"crossref","unstructured":"Yu J, Jiang Y, Wang Z, Cao Z, Huang T (2016) Unitbox: An advanced object detection network. In: Proceedings of the 24th ACM international conference on multimedia, pp 516\u2013520","DOI":"10.1145\/2964284.2967274"},{"key":"361_CR89","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3660346","volume":"20","author":"S Yue","year":"2024","unstructured":"Yue S, Tu Y, Li L, Gao S, Yu Z (2024) Multi-grained representation aggregating transformer with gating cycle for change captioning. ACM Transact Multimed Comput, Commun Appl 20:1","journal-title":"ACM Transact Multimed Comput, Commun Appl"},{"key":"361_CR90","unstructured":"Zhang J, Liang Q, Shi Y (2022) Kd-scfnet: Towards more accurate and efficient salient object detection via knowledge distillation. arXiv preprint arXiv:2208.02178"},{"key":"361_CR91","doi-asserted-by":"crossref","unstructured":"Zhang X, Zhou X, Lin M, Sun J (2018) Shufflenet: An extremely efficient convolutional neural network for mobile devices. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6848\u20136856","DOI":"10.1109\/CVPR.2018.00716"},{"key":"361_CR92","doi-asserted-by":"crossref","unstructured":"Zhao JX, Cao Y, Fan DP, Cheng MM, Li XY, Zhang L (2019) Contrast prior and fluid pyramid integration for rgbd salient object detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 3927\u20133936","DOI":"10.1109\/CVPR.2019.00405"},{"key":"361_CR93","doi-asserted-by":"crossref","unstructured":"Zhao X, Pang Y, Yang J, Zhang L, Lu H (2021) Multi-source fusion and automatic predictor selection for zero-shot video object segmentation. In: Proceedings of the 29th ACM International Conference on Multimedia, ACM MM, China, vol\u00a029, pp 2645\u20132653","DOI":"10.1145\/3474085.3475192"},{"key":"361_CR94","doi-asserted-by":"crossref","unstructured":"Zhao X, Liang H, Li P, Sun G, Zhao D, Liang R, He X (2024) Motion-aware memory network for fast video salient object detection. IEEE Transactions on Image Processing","DOI":"10.1109\/TIP.2023.3348659"},{"key":"361_CR95","doi-asserted-by":"crossref","unstructured":"Zhong Y, Yuan B, Wu H, Yuan Z, Peng J, Wang YX (2021) Pixel contrastive-consistent semi-supervised semantic segmentation. In: Proceedings of the IEEE\/CVF International conference on computer vision, pp 7273\u20137282","DOI":"10.1109\/ICCV48922.2021.00718"},{"key":"361_CR96","doi-asserted-by":"crossref","unstructured":"Zhou W, Sun F, Jiang Q, Cong R, Hwang JN (2023) Wavenet: Wavelet network with knowledge distillation for rgb-t salient object detection. IEEE Transactions on Image Processing","DOI":"10.1109\/TIP.2023.3275538"},{"key":"361_CR97","doi-asserted-by":"crossref","unstructured":"Zhu X, Hu H, Lin S, Dai J (2019) Deformable convnets v2: More deformable, better results. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9308\u20139316","DOI":"10.1109\/CVPR.2019.00953"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00361-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-025-00361-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-025-00361-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T13:40:31Z","timestamp":1749562831000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-025-00361-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,10]]},"references-count":97,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["361"],"URL":"https:\/\/doi.org\/10.1007\/s13735-025-00361-z","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,3,10]]},"assertion":[{"value":"23 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"12 February 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 February 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 March 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors state that they do not have any competing financial interests or personal relationships that could have influenced the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"12"}}