{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T09:03:46Z","timestamp":1742979826263,"version":"3.40.3"},"publisher-location":"Cham","reference-count":81,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726835"},{"type":"electronic","value":"9783031726842"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72684-2_13","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:04:47Z","timestamp":1730574287000},"page":"216-235","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ActionVOS: Actions as\u00a0Prompts for\u00a0Video Object Segmentation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-0733-2858","authenticated-orcid":false,"given":"Liangyang","family":"Ouyang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8460-8763","authenticated-orcid":false,"given":"Ruicong","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8067-6227","authenticated-orcid":false,"given":"Yifei","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1441-889X","authenticated-orcid":false,"given":"Ryosuke","family":"Furuta","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0097-4537","authenticated-orcid":false,"given":"Yoichi","family":"Sato","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"13_CR1","doi-asserted-by":"crossref","unstructured":"Bertasius, G., Park, H.S., Stella, X.Y., Shi, J.: First-person action-object detection with egonet. In: Robotics: Science and Systems (2017)","DOI":"10.15607\/RSS.2017.XIII.012"},{"key":"13_CR2","doi-asserted-by":"crossref","unstructured":"Botach, A., Zheltonozhskii, E., Baskin, C.: End-to-end referring video object segmentation with multimodal transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4985\u20134995 (2022)","DOI":"10.1109\/CVPR52688.2022.00493"},{"key":"13_CR3","unstructured":"Cai, M., Kitani, K.M., Sato, Y.: Understanding hand-object manipulation with grasp types and object attributes. In: Robotics: Science and Systems, vol.\u00a03 (2016)"},{"key":"13_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"13_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Z., Ma, L., Luo, W., Wong, K.Y.K.: Weakly-supervised spatio-temporally grounding natural sentence in video. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 1884\u20131894 (2019)","DOI":"10.18653\/v1\/P19-1183"},{"key":"13_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, H.K., Oh, S.W., Price, B., Schwing, A., Lee, J.Y.: Tracking anything with decoupled video segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1316\u20131326 (2023)","DOI":"10.1109\/ICCV51070.2023.00127"},{"key":"13_CR7","doi-asserted-by":"crossref","unstructured":"Damen, D., et\u00a0al.: Scaling egocentric vision: the epic-kitchens dataset. In: Proceedings of the European Conference on Computer Vision, pp. 720\u2013736 (2018)","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"13_CR8","doi-asserted-by":"crossref","unstructured":"Damen, D., et\u00a0al.: Rescaling egocentric vision: collection, pipeline and challenges for epic-kitchens-100. Int. J. Comput. Vision 1\u201323 (2022)","DOI":"10.1007\/s11263-021-01531-2"},{"key":"13_CR9","first-page":"13745","volume":"35","author":"A Darkhalil","year":"2022","unstructured":"Darkhalil, A., et al.: Epic-kitchens visor benchmark: video segmentations and object relations. Adv. Neural. Inf. Process. Syst. 35, 13745\u201313758 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"13_CR10","doi-asserted-by":"crossref","unstructured":"De\u00a0Vries, H., Strub, F., Chandar, S., Pietquin, O., Larochelle, H., Courville, A.: GuessWhat?! Visual object discovery through multi-modal dialogue. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5503\u20135512 (2017)","DOI":"10.1109\/CVPR.2017.475"},{"key":"13_CR11","doi-asserted-by":"crossref","unstructured":"Deruyttere, T., Vandenhende, S., Grujicic, D., Van\u00a0Gool, L., Moens, M.F.: Talk2Car: taking control of your self-driving car. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp. 2088\u20132098 (2019)","DOI":"10.18653\/v1\/D19-1215"},{"key":"13_CR12","doi-asserted-by":"crossref","unstructured":"Ding, H., Liu, C., He, S., Jiang, X., Loy, C.C.: MeViS: a large-scale benchmark for video segmentation with motion expressions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2694\u20132703 (2023)","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"13_CR13","doi-asserted-by":"crossref","unstructured":"Ding, H., Liu, C., Wang, S., Jiang, X.: Vlt: Vision-language transformer and query generation for referring segmentation. IEEE Trans. Pattern Anal. Mach. Intell. (2022)","DOI":"10.1109\/TPAMI.2022.3217852"},{"key":"13_CR14","doi-asserted-by":"crossref","unstructured":"Fu, Q., Liu, X., Kitani, K.: Sequential voting with relational box fields for active object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2374\u20132383 (2022)","DOI":"10.1109\/CVPR52688.2022.00241"},{"key":"13_CR15","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Ghodrati, A., Li, Z., Snoek, C.G.: Actor and action video segmentation from a sentence. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5958\u20135966 (2018)","DOI":"10.1109\/CVPR.2018.00624"},{"key":"13_CR16","unstructured":"Grauman, K., et\u00a0al.: EGO4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"13_CR17","doi-asserted-by":"crossref","unstructured":"Gupta, A., Davis, L.S.: Objects in action: an approach for combining action understanding and object perception. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, pp.\u00a01\u20138 (2007)","DOI":"10.1109\/CVPR.2007.383331"},{"key":"13_CR18","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"13_CR19","unstructured":"He, S., Ding, H., Liu, C., Jiang, X.: GREC: generalized referring expression comprehension. arXiv preprint arXiv:2308.16182 (2023)"},{"key":"13_CR20","doi-asserted-by":"crossref","unstructured":"Higgins, R.E.L., Fouhey, D.F.: MOVES: manipulated objects in video enable segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 6334\u20136343 (2023)","DOI":"10.1109\/CVPR52729.2023.00613"},{"key":"13_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"108","DOI":"10.1007\/978-3-319-46448-0_7","volume-title":"Computer Vision \u2013 ECCV 2016","author":"R Hu","year":"2016","unstructured":"Hu, R., Rohrbach, M., Darrell, T.: Segmentation from natural language expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 108\u2013124. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_7"},{"key":"13_CR22","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: MDETR-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"13_CR23","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferItGame: referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"13_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1007\/978-3-030-20870-7_8","volume-title":"Computer Vision \u2013 ACCV 2018","author":"A Khoreva","year":"2019","unstructured":"Khoreva, A., Rohrbach, A., Schiele, B.: Video object segmentation with language referring expressions. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11364, pp. 123\u2013141. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20870-7_8"},{"issue":"1","key":"13_CR25","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1016\/j.cviu.2010.08.002","volume":"115","author":"H Kjellstr\u00f6m","year":"2011","unstructured":"Kjellstr\u00f6m, H., Romero, J., Kragi\u0107, D.: Visual object-action recognition: inferring object affordances from human demonstration. Comput. Vis. Image Underst. 115(1), 81\u201390 (2011)","journal-title":"Comput. Vis. Image Underst."},{"issue":"10","key":"13_CR26","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1016\/j.robot.2011.05.009","volume":"59","author":"N Kr\u00fcger","year":"2011","unstructured":"Kr\u00fcger, N., et al.: Object-action complexes: grounded abstractions of sensory-motor processes. Robot. Auton. Syst. 59(10), 740\u2013757 (2011)","journal-title":"Robot. Auton. Syst."},{"key":"13_CR27","doi-asserted-by":"crossref","unstructured":"Kurita, S., Katsura, N., Onami, E.: RefEgo: referring expression comprehension dataset from first-person perception of ego4d. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15214\u201315224 (2023)","DOI":"10.1109\/ICCV51070.2023.01396"},{"key":"13_CR28","doi-asserted-by":"crossref","unstructured":"Lee, C., Kumar, M.G., Tan, C.: DetermiNet: a large-scale diagnostic dataset for complex visually-grounded referencing using determiners. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20019\u201320028 (2023)","DOI":"10.1109\/ICCV51070.2023.01832"},{"key":"13_CR29","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"13_CR30","doi-asserted-by":"crossref","unstructured":"Li, X., Wang, J., Xu, X., Li, X., Raj, B., Lu, Y.: Robust referring video object segmentation with cyclic structural consensus. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 22236\u201322245 (2023)","DOI":"10.1109\/ICCV51070.2023.02032"},{"key":"13_CR31","doi-asserted-by":"crossref","unstructured":"Li, Z., Tao, R., Gavves, E., Snoek, C.G., Smeulders, A.W.: Tracking by natural language specification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6495\u20136503 (2017)","DOI":"10.1109\/CVPR.2017.777"},{"key":"13_CR32","first-page":"7575","volume":"35","author":"KQ Lin","year":"2022","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. Adv. Neural. Inf. Process. Syst. 35, 7575\u20137586 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"13_CR33","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"13_CR34","doi-asserted-by":"crossref","unstructured":"Liu, C., Ding, H., Jiang, X.: GRES: generalized referring expression segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23592\u201323601 (2023)","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"13_CR35","doi-asserted-by":"crossref","unstructured":"Liu, J., et al.: PolyFormer: referring image segmentation as sequential polygon generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18653\u201318663 (2023)","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"13_CR36","doi-asserted-by":"crossref","unstructured":"Liu, R., Ohkawa, T., Zhang, M., Sato, Y.: Single-to-dual-view adaptation for egocentric 3D hand pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 677\u2013686 (2024)","DOI":"10.1109\/CVPR52733.2024.00071"},{"key":"13_CR37","doi-asserted-by":"crossref","unstructured":"Liu, R., Liu, C., Bai, Y., Yuille, A.L.: CLEVR-Ref+: diagnosing visual reasoning with referring expressions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4185\u20134194 (2019)","DOI":"10.1109\/CVPR.2019.00431"},{"key":"13_CR38","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"13_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"13_CR40","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Video swin transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133211 (2022)","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"13_CR41","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"13_CR42","doi-asserted-by":"crossref","unstructured":"L\u00fcddecke, T., Ecker, A.: Image segmentation using text and image prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7086\u20137096 (2022)","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"13_CR43","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"13_CR44","doi-asserted-by":"crossref","unstructured":"Mei, J., Piergiovanni, A., Hwang, J.N., Li, W.: SLVP: self-supervised language-video pre-training for referring video object segmentation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 507\u2013517 (2024)","DOI":"10.1109\/WACVW60836.2024.00061"},{"key":"13_CR45","unstructured":"Miao, Z., Zhao, K., Tsuruoka, Y.: Improving arithmetic reasoning ability of large language models through relation tuples, verification and dynamic feedback. arXiv preprint arXiv:2406.17873 (2024)"},{"key":"13_CR46","doi-asserted-by":"crossref","unstructured":"Milletari, F., Navab, N., Ahmadi, S.A.: V-net: fully convolutional neural networks for volumetric medical image segmentation. In: 2016 fourth International Conference on 3D Vision, pp. 565\u2013571 (2016)","DOI":"10.1109\/3DV.2016.79"},{"key":"13_CR47","unstructured":"Paszke, A., et\u00a0al.: PyTorch: an imperative style, high-performance deep learning library. In: Proceedings of the 33rd International Conference on Neural Information Processing Systems, pp. 8026\u20138037 (2019)"},{"key":"13_CR48","doi-asserted-by":"crossref","unstructured":"Perazzi, F., Pont-Tuset, J., McWilliams, B., Van\u00a0Gool, L., Gross, M., Sorkine-Hornung, A.: A benchmark dataset and evaluation methodology for video object segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 724\u2013732 (2016)","DOI":"10.1109\/CVPR.2016.85"},{"key":"13_CR49","doi-asserted-by":"crossref","unstructured":"Qi, Y., et al.: REVERIE: remote embodied visual referring expression in real indoor environments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9982\u20139991 (2020)","DOI":"10.1109\/CVPR42600.2020.01000"},{"key":"13_CR50","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"13_CR51","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"13_CR52","doi-asserted-by":"crossref","unstructured":"Rodin, I., Furnari, A., Min, K., Tripathi, S., Farinella, G.M.: Action scene graphs for long-form understanding of egocentric videos. arXiv preprint arXiv:2312.03391 (2023)","DOI":"10.1109\/CVPR52733.2024.01762"},{"key":"13_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/978-3-030-58555-6_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Seo","year":"2020","unstructured":"Seo, S., Lee, J.-Y., Han, B.: URVOS: unified referring video object segmentation network with a large-scale benchmark. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 208\u2013223. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_13"},{"key":"13_CR54","doi-asserted-by":"crossref","unstructured":"Shan, D., Geng, J., Shu, M., Fouhey, D.F.: Understanding human hands in contact at internet scale. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9869\u20139878 (2020)","DOI":"10.1109\/CVPR42600.2020.00989"},{"key":"13_CR55","doi-asserted-by":"crossref","unstructured":"Shin, T., Razeghi, Y., Logan\u00a0IV, R.L., Wallace, E., Singh, S.: AutoPrompt: eliciting knowledge from language models with automatically generated prompts. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, pp. 4222\u20134235 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"13_CR56","unstructured":"Tateno, M., Yagi, T., Furuta, R., Sato, Y.: Learning object states from actions via large language models. arXiv preprint arXiv:2405.01090 (2024)"},{"key":"13_CR57","doi-asserted-by":"crossref","unstructured":"Tokmakov, P., Li, J., Gaidon, A.: Breaking the \u201cobject\u201d in video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22836\u201322845 (2023)","DOI":"10.1109\/CVPR52729.2023.02187"},{"key":"13_CR58","unstructured":"Wang, P., et al.: One-peace: exploring one general representation model toward unlimited modalities. arXiv preprint arXiv:2305.11172 (2023)"},{"key":"13_CR59","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"13_CR60","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Beyond literal descriptions: understanding and locating open-world objects aligned with human intentions. arXiv preprint arXiv:2402.11265 (2024)","DOI":"10.18653\/v1\/2024.findings-acl.43"},{"key":"13_CR61","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Towards more flexible and accurate object tracking with natural language: algorithms and benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13763\u201313773 (2021)","DOI":"10.1109\/CVPR46437.2021.01355"},{"key":"13_CR62","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, X., Cao, Y., Wang, W., Shen, C., Huang, T.: SegGPT: towards segmenting everything in context. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1130\u20131140 (2023)","DOI":"10.1109\/ICCV51070.2023.00110"},{"key":"13_CR63","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: End-to-end video instance segmentation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8741\u20138750 (2021)","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"13_CR64","doi-asserted-by":"crossref","unstructured":"Wu, C., Lin, Z., Cohen, S., Bui, T., Maji, S.: PhraseCut: language-based image segmentation in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10216\u201310225 (2020)","DOI":"10.1109\/CVPR42600.2020.01023"},{"key":"13_CR65","doi-asserted-by":"crossref","unstructured":"Wu, D., Han, W., Wang, T., Dong, X., Zhang, X., Shen, J.: Referring multi-object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14633\u201314642 (2023)","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"13_CR66","doi-asserted-by":"crossref","unstructured":"Wu, J., Jiang, Y., Sun, P., Yuan, Z., Luo, P.: Language as queries for referring video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4974\u20134984 (2022)","DOI":"10.1109\/CVPR52688.2022.00492"},{"key":"13_CR67","doi-asserted-by":"crossref","unstructured":"Wu, J., Jiang, Y., Yan, B., Lu, H., Yuan, Z., Luo, P.: Segment every reference object in spatial and temporal spaces. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2538\u20132550 (2023)","DOI":"10.1109\/ICCV51070.2023.00240"},{"key":"13_CR68","doi-asserted-by":"crossref","unstructured":"Wu, T.L., Zhou, Y., Peng, N.: Localizing active objects from egocentric vision with symbolic world knowledge. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 4991\u20135006 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.304"},{"key":"13_CR69","doi-asserted-by":"crossref","unstructured":"Xu, N., et al.: YouTube-VOS: sequence-to-sequence video object segmentation. In: Proceedings of the European Conference on Computer Vision, pp. 585\u2013601 (2018)","DOI":"10.1007\/978-3-030-01228-1_36"},{"key":"13_CR70","doi-asserted-by":"crossref","unstructured":"Yamaguchi, M., Saito, K., Ushiku, Y., Harada, T.: Spatio-temporal person retrieval via natural language queries. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1453\u20131462 (2017)","DOI":"10.1109\/ICCV.2017.162"},{"key":"13_CR71","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15325\u201315336 (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"13_CR72","doi-asserted-by":"crossref","unstructured":"Yu, J., Li, X., Zhao, X., Zhang, H., Wang, Y.X.: Video state-changing object segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20439\u201320448 (2023)","DOI":"10.1109\/ICCV51070.2023.01869"},{"key":"13_CR73","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: MattNet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"13_CR74","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"13_CR75","doi-asserted-by":"crossref","unstructured":"Zhang, C., Gupta, A., Zisserman, A.: Helping hands: an object-aware ego-centric video recognition model. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13901\u201313912 (2023)","DOI":"10.1109\/ICCV51070.2023.01278"},{"key":"13_CR76","first-page":"36067","volume":"35","author":"H Zhang","year":"2022","unstructured":"Zhang, H., et al.: GLIPv2: unifying localization and vision-language understanding. Adv. Neural. Inf. Process. Syst. 35, 36067\u201336080 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"13_CR77","doi-asserted-by":"crossref","unstructured":"Zhang, L., Zhou, S., Stent, S., Shi, J.: Fine-grained egocentric hand-object segmentation: Dataset, model, and applications. In: Proceedings of the European Conference on Computer Vision, pp. 127\u2013145 (2022)","DOI":"10.1007\/978-3-031-19818-2_8"},{"key":"13_CR78","doi-asserted-by":"crossref","unstructured":"Zhang, M., Huang, Y., Liu, R., Sato, Y.: Masked video and body-worn IMU autoencoder for egocentric action recognition. arXiv preprint arXiv:2407.06628 (2024)","DOI":"10.1007\/978-3-031-72649-1_18"},{"issue":"9","key":"13_CR79","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"13_CR80","doi-asserted-by":"crossref","unstructured":"Zhu, C., et al.: EgoObjects: a large-scale egocentric dataset for fine-grained object understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.01840"},{"key":"13_CR81","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (2020)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72684-2_13","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:07:56Z","timestamp":1730574476000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72684-2_13"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726835","9783031726842"],"references-count":81,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72684-2_13","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}