{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,25]],"date-time":"2025-04-25T22:45:41Z","timestamp":1745621141358},"reference-count":56,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1007\/s10489-024-05486-y","type":"journal-article","created":{"date-parts":[[2024,5,7]],"date-time":"2024-05-07T05:01:49Z","timestamp":1715058109000},"page":"6138-6153","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Learning spatiotemporal relationships with a unified framework for video object segmentation"],"prefix":"10.1007","volume":"54","author":[{"given":"Jianbiao","family":"Mei","sequence":"first","affiliation":[]},{"given":"Mengmeng","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Zizhang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,5,7]]},"reference":[{"key":"5486_CR1","doi-asserted-by":"crossref","unstructured":"Caelles S, Maninis KK, Pont-Tuset J, Leal-Taix\u00e9 L, Cremers D, Van\u00a0Gool L (2017) One-shot video object segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 221\u2013230","DOI":"10.1109\/CVPR.2017.565"},{"key":"5486_CR2","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: European Conference on Computer Vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"5486_CR3","doi-asserted-by":"crossref","unstructured":"Chen X, Li Z, Yuan Y, Yu G, Shen J, Qi D (2020) State-aware tracker for real-time video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, 9384\u20139393","DOI":"10.1109\/CVPR42600.2020.00940"},{"key":"5486_CR4","unstructured":"Cheng B, Schwing A, Kirillov A (2021) Per-pixel classification is not all you need for semantic segmentation. Adv Neural Inf Process Syst 34"},{"key":"5486_CR5","first-page":"11781","volume":"34","author":"HK Cheng","year":"2021","unstructured":"Cheng HK, Tai YW, Tang CK (2021) Rethinking space-time networks with improved memory coverage for efficient video object segmentation. Adv Neural Inf Process Syst 34:11781\u201311794","journal-title":"Adv Neural Inf Process Syst"},{"issue":"3","key":"5486_CR6","doi-asserted-by":"publisher","first-page":"569","DOI":"10.1109\/TPAMI.2014.2345401","volume":"37","author":"MM Cheng","year":"2014","unstructured":"Cheng MM, Mitra NJ, Huang X, Torr PH, Hu SM (2014) Global contrast based salient region detection. IEEE Trans Pattern Anal Mach Intell 37(3):569\u2013582","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"5486_CR7","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S et\u00a0al (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"5486_CR8","doi-asserted-by":"crossref","unstructured":"Duke B, Ahmed A, Wolf C, Aarabi P, Taylor GW (2021) Sstvos: Sparse spatiotemporal transformers for video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5912\u20135921","DOI":"10.1109\/CVPR46437.2021.00585"},{"issue":"2","key":"5486_CR9","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham M, Van Gool L, Williams CK, Winn J, Zisserman A (2010) The pascal visual object classes (voc) challenge. Int J Comput Vision 88(2):303\u2013338","journal-title":"Int J Comput Vision"},{"key":"5486_CR10","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"5486_CR11","doi-asserted-by":"crossref","unstructured":"Hu YT, Huang JB, Schwing AG (2018) Videomatch: Matching based video object segmentation. In: Proceedings of the European conference on computer vision (ECCV), pp 54\u201370","DOI":"10.1007\/978-3-030-01237-3_4"},{"issue":"6","key":"5486_CR12","doi-asserted-by":"publisher","first-page":"1908","DOI":"10.1007\/s10489-019-01605-2","volume":"50","author":"W Huang","year":"2020","unstructured":"Huang W, Gu J, Ma X, Li Y (2020) End-to-end multitask siamese network with residual hierarchical attention for real-time object tracking. Appl Intell 50(6):1908\u20131921","journal-title":"Appl Intell"},{"key":"5486_CR13","doi-asserted-by":"crossref","unstructured":"Huang X, Xu J, Tai YW, Tang CK (2020) Fast video object segmentation with temporal aggregation network and dynamic template matching. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8879\u20138889","DOI":"10.1109\/CVPR42600.2020.00890"},{"key":"5486_CR14","doi-asserted-by":"crossref","unstructured":"Johnander J, Danelljan M, Brissman E, Khan FS, Felsberg M (2019) A generative appearance model for end-to-end video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 8953\u20138962","DOI":"10.1109\/CVPR.2019.00916"},{"key":"5486_CR15","doi-asserted-by":"crossref","unstructured":"Lai Z, Lu E, Xie W (2020) Mast: A memory-augmented self-supervised tracker. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 6479\u20136488","DOI":"10.1109\/CVPR42600.2020.00651"},{"key":"5486_CR16","doi-asserted-by":"crossref","unstructured":"Lan M, Zhang J, He F, Zhang L (2022) Siamese network with interactive transformer for video object segmentation. In: Proceedings of the AAAI Conference on artificial intelligence, 36:1228\u20131236","DOI":"10.1609\/aaai.v36i2.20009"},{"key":"5486_CR17","doi-asserted-by":"crossref","unstructured":"Li B, Yan J, Wu W, Zhu Z, Hu X (2018) High performance visual tracking with siamese region proposal network. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8971\u20138980","DOI":"10.1109\/CVPR.2018.00935"},{"key":"5486_CR18","doi-asserted-by":"crossref","unstructured":"Li Y, Hou X, Koch C, Rehg JM, Yuille AL (2014) The secrets of salient object segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 280\u2013287","DOI":"10.1109\/CVPR.2014.43"},{"key":"5486_CR19","doi-asserted-by":"crossref","unstructured":"Li Y, Shen Z, Shan Y (2020) Fast video object segmentation using the global context module. In: European Conference on Computer Vision, Springer, pp 735\u2013750","DOI":"10.1007\/978-3-030-58607-2_43"},{"key":"5486_CR20","first-page":"1218","volume":"33","author":"Y Li","year":"2020","unstructured":"Li Y, Xu N, Peng J, See J, Lin W (2020) Delving into the cyclic mechanism in semi-supervised video object segmentation. Adv Neural Inf Process Syst 33:1218\u20131228","journal-title":"Adv Neural Inf Process Syst"},{"key":"5486_CR21","unstructured":"Liang Y, Ge C, Tong Z, Song Y, Wang J, Xie P (2022) Not all patches are what you need: Expediting vision transformers via token reorganizations. arXiv preprint arXiv:2202.07800"},{"key":"5486_CR22","first-page":"3430","volume":"33","author":"Y Liang","year":"2020","unstructured":"Liang Y, Li X, Jafari N, Chen J (2020) Video object segmentation with adaptive feature bank and uncertain-region refinement. Adv Neural Inf Process Syst 33:3430\u20133441","journal-title":"Adv Neural Inf Process Syst"},{"key":"5486_CR23","doi-asserted-by":"crossref","unstructured":"Lin H, Qi X, Jia J (2019) Agss-vos: Attention guided single-shot video object segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 3949\u20133957","DOI":"10.1109\/ICCV.2019.00405"},{"key":"5486_CR24","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, Hays J, Perona P, Ramanan D, Doll\u00e1r P, Zitnick CL (2014) Microsoft coco: Common objects in context. In: European conference on computer vision, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"5486_CR25","doi-asserted-by":"crossref","unstructured":"Liu Y, Zhang D, Zhang Q, Han J (2021) Part-object relational visual saliency. IEEE Trans Pattern Anal Mach Intell","DOI":"10.1109\/TPAMI.2021.3053577"},{"key":"5486_CR26","doi-asserted-by":"crossref","unstructured":"Liu Z, Lin Y, Cao Y, Hu H, Wei Y, Zhang Z, Lin S, Guo B (2021) Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10012\u201310022","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"5486_CR27","doi-asserted-by":"crossref","unstructured":"Lu X, Wang W, Danelljan M, Zhou T, Shen J, Van\u00a0Gool L (2020) Video object segmentation with episodic graph memory networks. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part III 16, Springer, pp 661\u2013679","DOI":"10.1007\/978-3-030-58580-8_39"},{"key":"5486_CR28","doi-asserted-by":"crossref","unstructured":"Luiten J, Voigtlaender P, Leibe B (2018) Premvos: Proposal-generation, refinement and merging for video object segmentation. In: Asian conference on computer vision, Springer, pp 565\u2013580","DOI":"10.1007\/978-3-030-20870-7_35"},{"key":"5486_CR29","doi-asserted-by":"crossref","unstructured":"Mao Y, Wang N, Zhou W, Li H (2021) Joint inductive and transductive learning for video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 9670\u20139679","DOI":"10.1109\/ICCV48922.2021.00953"},{"issue":"3","key":"5486_CR30","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3585076","volume":"14","author":"J Mei","year":"2023","unstructured":"Mei J, Wang M, Yang Y, Li Y, Liu Y (2023) Fast real-time video object segmentation with a tangled memory network. ACM Trans Intell Syst Technol 14(3):1\u201321","journal-title":"ACM Trans Intell Syst Technol"},{"key":"5486_CR31","doi-asserted-by":"crossref","unstructured":"Oh SW, Lee JY, Sunkavalli K, Kim SJ (2018) Fast video object segmentation by reference-guided mask propagation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7376\u20137385","DOI":"10.1109\/CVPR.2018.00770"},{"key":"5486_CR32","doi-asserted-by":"crossref","unstructured":"Oh SW, Lee JY, Xu N, Kim SJ (2019) Video object segmentation using space-time memory networks. In: Proceedings of the IEEE\/CVF International conference on computer vision, pp 9226\u20139235","DOI":"10.1109\/ICCV.2019.00932"},{"key":"5486_CR33","doi-asserted-by":"crossref","unstructured":"Perazzi F, Khoreva A, Benenson R, Schiele B, Sorkine-Hornung A (2017) Learning video object segmentation from static images. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2663\u20132672","DOI":"10.1109\/CVPR.2017.372"},{"key":"5486_CR34","doi-asserted-by":"crossref","unstructured":"Perazzi F, Pont-Tuset J, McWilliams B, Van\u00a0Gool L, Gross M, Sorkine-Hornung A (2016) A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 724\u2013732","DOI":"10.1109\/CVPR.2016.85"},{"key":"5486_CR35","unstructured":"Pont-Tuset J, Perazzi F, Caelles S, Arbel\u00e1ez P, Sorkine-Hornung A, Van\u00a0Gool L (2017) The 2017 davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675"},{"key":"5486_CR36","doi-asserted-by":"crossref","unstructured":"Robinson A, Lawin FJ, Danelljan M, Khan FS, Felsberg M (2020) Learning fast and robust target models for video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 7406\u20137415","DOI":"10.1109\/CVPR42600.2020.00743"},{"key":"5486_CR37","doi-asserted-by":"crossref","unstructured":"Seong H, Hyun J, Kim E (2020) Kernelized memory network for video object segmentation. In: European Conference on Computer Vision, Springer, pp 629\u2013645","DOI":"10.1007\/978-3-030-58542-6_38"},{"key":"5486_CR38","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inf Process Syst 30"},{"key":"5486_CR39","doi-asserted-by":"crossref","unstructured":"Voigtlaender P, Chai Y, Schroff F, Adam H, Leibe B, Chen LC (2019) Feelvos: Fast end-to-end embedding learning for video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 9481\u20139490","DOI":"10.1109\/CVPR.2019.00971"},{"key":"5486_CR40","doi-asserted-by":"crossref","unstructured":"Voigtlaender P, Leibe B (2017) Online adaptation of convolutional neural networks for the 2017 davis challenge on video object segmentation. In: The 2017 DAVIS challenge on video object segmentation-CVPR Workshops, vol.\u00a05","DOI":"10.5244\/C.31.116"},{"key":"5486_CR41","doi-asserted-by":"crossref","unstructured":"Voigtlaender P, Luiten J, Torr PH, Leibe B (2020) Siam r-cnn: Visual tracking by re-detection. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6578\u20136588","DOI":"10.1109\/CVPR42600.2020.00661"},{"key":"5486_CR42","doi-asserted-by":"crossref","unstructured":"Wang H, Jiang X, Ren H, Hu Y, Bai S (2021) Swiftnet: Real-time video object segmentation. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 1296\u20131305","DOI":"10.1109\/CVPR46437.2021.00135"},{"issue":"2","key":"5486_CR43","doi-asserted-by":"publisher","first-page":"2290","DOI":"10.1007\/s10489-021-02547-4","volume":"52","author":"H Wang","year":"2022","unstructured":"Wang H, Liu W, Xing W (2022) A temporal attention based appearance model for video object segmentation. Appl Intell 52(2):2290\u20132300","journal-title":"Appl Intell"},{"key":"5486_CR44","doi-asserted-by":"crossref","unstructured":"Wang H, Zhu Y, Adam H, Yuille A, Chen LC (2021) Max-deeplab: End-to-end panoptic segmentation with mask transformers. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 5463\u20135474","DOI":"10.1109\/CVPR46437.2021.00542"},{"key":"5486_CR45","doi-asserted-by":"publisher","first-page":"6255","DOI":"10.1109\/TIP.2022.3208409","volume":"31","author":"M Wang","year":"2022","unstructured":"Wang M, Mei J, Liu L, Tian G, Liu Y, Pan Z (2022) Delving deeper into mask utilization in video object segmentation. IEEE Trans Image Process 31:6255\u20136266","journal-title":"IEEE Trans Image Process"},{"key":"5486_CR46","doi-asserted-by":"crossref","unstructured":"Wang Q, Zhang L, Bertinetto L, Hu W, Torr PH (2019) Fast online object tracking and segmentation: A unifying approach. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 1328\u20131338","DOI":"10.1109\/CVPR.2019.00142"},{"key":"5486_CR47","doi-asserted-by":"crossref","unstructured":"Wang Z, Xu J, Liu L, Zhu F, Shao L (2019) Ranet: Ranking attention network for fast video object segmentation. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 3978\u20133987","DOI":"10.1109\/ICCV.2019.00408"},{"key":"5486_CR48","doi-asserted-by":"crossref","unstructured":"Xie H, Yao H, Zhou S, Zhang S, Sun W (2021) Efficient regional memory network for video object segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1286\u20131295","DOI":"10.1109\/CVPR46437.2021.00134"},{"key":"5486_CR49","doi-asserted-by":"crossref","unstructured":"Xu N, Yang L, Fan Y, Yang J, Yue D, Liang Y, Price B, Cohen S, Huang T (2018) Youtube-vos: Sequence-to-sequence video object segmentation. In: Proceedings of the European conference on computer vision (ECCV), pp 585\u2013601","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"5486_CR50","doi-asserted-by":"crossref","unstructured":"Xu Y, Wang Z, Li Z, Yuan Y, Yu G (2020) Siamfc++: Towards robust and accurate visual tracking with target estimation guidelines. In: Proceedings of the AAAI conference on artificial intelligence, pp 12549\u201312556","DOI":"10.1609\/aaai.v34i07.6944"},{"key":"5486_CR51","doi-asserted-by":"crossref","unstructured":"Yan B, Peng H, Fu J, Wang D, Lu H (2021) Learning spatio-temporal transformer for visual tracking. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 10448\u201310457","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"5486_CR52","doi-asserted-by":"crossref","unstructured":"Yang J, Ge H, Su S, Liu G (2022) Transformer-based two-source motion model for multi-object tracking. Appl Intell pp 1\u201313","DOI":"10.1007\/s10489-021-03012-y"},{"key":"5486_CR53","doi-asserted-by":"crossref","unstructured":"Yang Z, Wei Y, Yang Y (2020) Collaborative video object segmentation by foreground-background integration. In: European Conference on Computer Vision, Springer, pp 332\u2013348","DOI":"10.1007\/978-3-030-58558-7_20"},{"key":"5486_CR54","unstructured":"Yang Z, Wei Y, Yang Y (2021) Associating objects with transformers for video object segmentation. Adv Neural Inf Process Syst 34"},{"issue":"4","key":"5486_CR55","doi-asserted-by":"publisher","first-page":"2589","DOI":"10.1007\/s10489-020-01905-y","volume":"51","author":"XY Zhang","year":"2021","unstructured":"Zhang XY, Huang YP, Mi Y, Pei YT, Zou Q, Wang S (2021) Video sketch: A middle-level representation for action recognition. Appl Intell 51(4):2589\u20132608","journal-title":"Appl Intell"},{"key":"5486_CR56","doi-asserted-by":"crossref","unstructured":"Zhu W, Li J, Lu J, Zhou J (2021) Separable structure modeling for semi-supervised video object segmentation. IEEE Trans Circuits Syst Video Technol","DOI":"10.1109\/TCSVT.2021.3060015"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-05486-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-024-05486-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-024-05486-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T12:11:34Z","timestamp":1718453494000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-024-05486-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4]]},"references-count":56,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2024,4]]}},"alternative-id":["5486"],"URL":"https:\/\/doi.org\/10.1007\/s10489-024-05486-y","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,4]]},"assertion":[{"value":"24 April 2024","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 May 2024","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}