{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T16:16:14Z","timestamp":1774455374090,"version":"3.50.1"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726668","type":"print"},{"value":"9783031726675","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72667-5_6","type":"book-chapter","created":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:11:48Z","timestamp":1727554308000},"page":"93-109","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["VISAGE: Video Instance Segmentation with\u00a0Appearance-Guided Enhancement"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0128-1306","authenticated-orcid":false,"given":"Hanjung","family":"Kim","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5922-1770","authenticated-orcid":false,"given":"Jaehyun","family":"Kang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4130-8346","authenticated-orcid":false,"given":"Miran","family":"Heo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1904-3889","authenticated-orcid":false,"given":"Sukjun","family":"Hwang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8498-0864","authenticated-orcid":false,"given":"Seoung Wug","family":"Oh","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8512-216X","authenticated-orcid":false,"given":"Seon Joo","family":"Kim","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,29]]},"reference":[{"key":"6_CR1","doi-asserted-by":"publisher","first-page":"158","DOI":"10.1007\/978-3-030-58621-8_10","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI","author":"A Athar","year":"2020","unstructured":"Athar, A., Mahadevan, S., Os\u0306ep, A., Leal-Taix\u00e9, L., Leibe, B.: STEm-seg: spatio-temporal embeddings for instance segmentation in videos. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI, pp. 158\u2013177. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_10"},{"key":"6_CR2","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I, pp. 213\u2013229. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"6_CR3","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: ICML (2020)"},{"key":"6_CR4","unstructured":"Cheng, B., Choudhuri, A., Misra, I., Kirillov, A., Girdhar, R., Schwing, A.G.: Mask2former for video instance segmentation. arXiv preprint arXiv:2112.10764 (2021)"},{"key":"6_CR5","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Choudhuri, A., Chowdhary, G., Schwing, A.G.: Context-aware relative object queries to unify video instance and panoptic segmentation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00617"},{"key":"6_CR7","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"6_CR8","doi-asserted-by":"crossref","unstructured":"Fischer, T., et al.: QDTrack: quasi-dense similarity learning for appearance-only multiple object tracking. TPAMI (2023)","DOI":"10.1109\/TPAMI.2023.3301975"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., et al.: Simple copy-paste is a strong data augmentation method for instance segmentation. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"6_CR10","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"6_CR11","doi-asserted-by":"crossref","unstructured":"Han, S.H., et al.: VISOLO: grid-based space-time aggregation for efficient online video instance segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00291"},{"key":"6_CR12","unstructured":"He, F., et al.: InsPro: propagating instance query and proposal for online video instance segmentation. NeurIPS 35, 19370\u201319383 (2022)"},{"key":"6_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Dollar, P., Girshick, R.: Mask R-CNN. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"6_CR15","doi-asserted-by":"crossref","unstructured":"Heo, M., et al.: A generalized framework for video instance segmentation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01405"},{"key":"6_CR16","unstructured":"Heo, M., Hwang, S., Oh, S.W., Lee, J.Y., Kim, S.J.: Vita: Video instance segmentation via object token association. NeurIPS (2022)"},{"key":"6_CR17","unstructured":"Huang, D.A., Yu, Z., Anandkumar, A.: MinVIS: a minimal video instance segmentation framework without video-based training. NeurIPS 35, 31265\u201331277 (2022)"},{"key":"6_CR18","unstructured":"Hwang, S., Heo, M., Oh, S.W., Kim, S.J.: Video instance segmentation using inter-frame communication transformers. NeurIPS 34, 13352\u201313363 (2021)"},{"key":"6_CR19","unstructured":"Ke, L., Li, X., Danelljan, M., Tai, Y.W., Tang, C.K., Yu, F.: Prototypical cross-attention networks for multiple object tracking and segmentation. NeurIPS 34, 1192\u20131203 (2021)"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Kim, D., Woo, S., Lee, J.Y., Kweon, I.S.: Video panoptic segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00988"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Kuhn, H.W.: The hungarian method for the assignment problem. NRL 2(1-2), 83\u201397 (1955)","DOI":"10.1002\/nav.3800020109"},{"issue":"2","key":"6_CR22","doi-asserted-by":"publisher","first-page":"246","DOI":"10.1007\/s11263-021-01541-0","volume":"130","author":"J Li","year":"2022","unstructured":"Li, J., Zhang, J., Maybank, S.J., Tao, D.: Bridging composite and real: towards end-to-end deep image matting. Int. J. Comput. Vision 130(2), 246\u2013266 (2022). https:\/\/doi.org\/10.1007\/s11263-021-01541-0","journal-title":"Int. J. Comput. Vision"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Li, J., Yu, B., Rao, Y., Zhou, J., Lu, J.: TCOVIS: temporally consistent online video instance segmentation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00107"},{"key":"6_CR24","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) Computer Vision \u2013 ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V, pp. 740\u2013755. Springer International Publishing, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"6_CR25","unstructured":"Qi, J., et al.: Occluded video instance segmentation. arXiv preprint arXiv:2102.01558 (2021)"},{"key":"6_CR26","doi-asserted-by":"publisher","unstructured":"Qi, J., et al.: Occluded video instance segmentation: a benchmark. IJCV (2022). https:\/\/doi.org\/10.1007\/s11263-022-01629-1","DOI":"10.1007\/s11263-022-01629-1"},{"key":"6_CR27","unstructured":"Sohn, K.: Improved deep metric learning with multi-class n-pair loss objective. NeurIPS 29 (2016)"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: End-to-end video instance segmentation with transformers. In: CVPR (2020)","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"6_CR29","doi-asserted-by":"crossref","unstructured":"Wu, J., et al.: Efficient video instance segmentation via tracklet query and proposal. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00103"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Wu, J., Jiang, Y., Zhang, W., Bai, X., Bai, S.: SeqFormer: a frustratingly simple model for video instance segmentation. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19815-1_32"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Wu, J., Liu, Q., Jiang, Y., Bai, S., Yuille, A., Bai, X.: In defense of online models for video instance segmentation. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19815-1_34"},{"key":"6_CR32","doi-asserted-by":"crossref","unstructured":"Yang, L., Fan, Y., Xu, N.: Video instance segmentation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00529"},{"key":"6_CR33","unstructured":"Yang, L., Fan, Y., Xu, N.: The 3rd large-scale video object segmentation challenge - video instance segmentation track (2021)"},{"key":"6_CR34","unstructured":"Yang, L., Fan, Y., Xu, N.: The 4th large-scale video object segmentation challenge - video instance segmentation track (2022)"},{"key":"6_CR35","doi-asserted-by":"crossref","unstructured":"Yang, S., et al.: Crossover learning for fast online video instance segmentation. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00794"},{"key":"6_CR36","doi-asserted-by":"crossref","unstructured":"Yang, S., et al.: Temporally efficient vision transformer for video instance segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00290"},{"key":"6_CR37","doi-asserted-by":"crossref","unstructured":"Ying, K., et al.: CTVIS: consistent training for online video instance segmentation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00089"},{"key":"6_CR38","doi-asserted-by":"crossref","unstructured":"Zhang, T., et al.: DVIS: decoupled video instance segmentation framework. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00124"},{"key":"6_CR39","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: ICLR (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72667-5_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:12:48Z","timestamp":1727554368000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72667-5_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,29]]},"ISBN":["9783031726668","9783031726675"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72667-5_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,29]]},"assertion":[{"value":"29 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}