{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T15:59:14Z","timestamp":1758124754683,"version":"3.40.3"},"publisher-location":"Cham","reference-count":96,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031727740"},{"type":"electronic","value":"9783031727757"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72775-7_19","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"329-348","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Rethinking Image-to-Video Adaptation: An Object-Centric Perspective"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0378-6438","authenticated-orcid":false,"given":"Rui","family":"Qian","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7033-774X","authenticated-orcid":false,"given":"Shuangrui","family":"Ding","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8865-7896","authenticated-orcid":false,"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"19_CR1","doi-asserted-by":"publisher","first-page":"5114","DOI":"10.1109\/TPAMI.2024.3362288","volume":"46","author":"JB Alayrac","year":"2024","unstructured":"Alayrac, J.B., Miech, A., Laptev, I., Sivic, J., et al.: Multi-task learning of object states and state-modifying actions from web videos. IEEE Trans. Pattern Anal. Mach. Intell. 46, 5114\u20135130 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lu\u010di\u0107, M., Schmid, C.: VIVIT: a video vision transformer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6836\u20136846 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"19_CR3","unstructured":"Aydemir, G., Xie, W., Guney, F.: Self-supervised object-centric learning for videos. In: Thirty-Seventh Conference on Neural Information Processing Systems (2023). https:\/\/openreview.net\/forum?id=919tWtJPXe"},{"key":"19_CR4","unstructured":"Bahng, H., Jahanian, A., Sankaranarayanan, S., Isola, P.: Visual prompting: modifying pixel space to adapt pre-trained models. arXiv preprint arXiv:2203.17274 (2022)"},{"key":"19_CR5","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: ICML, vol.\u00a02, p.\u00a04 (2021)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Besbinar, B., Frossard, P.: Self-supervision by prediction for object discovery in videos. In: 2021 IEEE International Conference on Image Processing (ICIP), pp. 1509\u20131513. 
IEEE (2021)","DOI":"10.1109\/ICIP42928.2021.9506062"},{"key":"19_CR7","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR8","unstructured":"Caelles, S., Pont-Tuset, J., Perazzi, F., Montes, A., Maninis, K.K., Van\u00a0Gool, L.: The 2019 DAVIS challenge on VOS: unsupervised multi-object segmentation. arXiv preprint arXiv:1905.00737 (2019)"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo Vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"19_CR11","first-page":"16664","volume":"35","author":"S Chen","year":"2022","unstructured":"Chen, S., et al.: AdaptFormer: adapting vision transformers for scalable visual recognition. Adv. Neural. Inf. Process. Syst. 35, 16664\u201316678 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Crawford, E., Pineau, J.: Exploiting spatial invariance for scalable unsupervised object tracking. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 3684\u20133692 (2020)","DOI":"10.1609\/aaai.v34i04.5777"},{"key":"19_CR13","unstructured":"Damen, D., et\u00a0al.: Rescaling egocentric vision. arXiv preprint arXiv:2006.13256 (2020)"},{"key":"19_CR14","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"19_CR15","doi-asserted-by":"crossref","unstructured":"Ding, S., et al.: Motion-aware contrastive video representation learning via foreground-background merging. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9716\u20139726 (2022)","DOI":"10.1109\/CVPR52688.2022.00949"},{"key":"19_CR16","doi-asserted-by":"crossref","unstructured":"Ding, S., Qian, R., Xiong, H.: Dual contrastive learning for spatio-temporal representation. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 5649\u20135658 (2022)","DOI":"10.1145\/3503161.3547783"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Ding, S., Qian, R., Xu, H., Lin, D., Xiong, H.: Betrayed by attention: a simple yet effective approach for self-supervised video object segmentation. arXiv preprint arXiv:2311.17893 (2023)","DOI":"10.1007\/978-3-031-72995-9_13"},{"key":"19_CR18","unstructured":"Ding, S., et al.: Motion-inductive self-supervised object discovery in videos. arXiv preprint arXiv:2210.00221 (2022)"},{"key":"19_CR19","doi-asserted-by":"crossref","unstructured":"Ding, S., Zhao, P., Zhang, X., Qian, R., Xiong, H., Tian, Q.: Prune spatio-temporal tokens by semantic-aware temporal accumulation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
16945\u201316956 (2023)","DOI":"10.1109\/ICCV51070.2023.01554"},{"key":"19_CR20","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"19_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"701","DOI":"10.1007\/978-3-031-20077-9_41","volume-title":"ECCV 2022, Part IX","author":"C Feng","year":"2022","unstructured":"Feng, C., et al.: PromptDet: towards open-vocabulary detection using uncurated images. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part IX. LNCS, vol. 13669, pp. 701\u2013717. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_41"},{"key":"19_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"540","DOI":"10.1007\/978-3-031-20059-5_31","volume-title":"ECCV 2022, Part XXXVI","author":"G Ghiasi","year":"2022","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.Y.: Scaling open-vocabulary image segmentation with image-level labels. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXVI. LNCS, vol. 13696, pp. 540\u2013557. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20059-5_31"},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Carreira, J., Doersch, C., Zisserman, A.: Video action transformer network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 244\u2013253 (2019)","DOI":"10.1109\/CVPR.2019.00033"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Goyal, R., et\u00a0al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5842\u20135850 (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"19_CR26","doi-asserted-by":"crossref","unstructured":"Gu, C., et\u00a0al.: AVA: a video dataset of spatio-temporally localized atomic visual actions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6047\u20136056 (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"19_CR27","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"752","DOI":"10.1007\/978-3-030-58580-8_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"T Gupta","year":"2020","unstructured":"Gupta, T., Vahdat, A., Chechik, G., Yang, X., Kautz, J., Hoiem, D.: Contrastive learning for weakly supervised phrase grounding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020, Part III. LNCS, vol. 12348, pp. 752\u2013768. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_44"},{"key":"19_CR28","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"19_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"19_CR30","doi-asserted-by":"crossref","unstructured":"Herzig, R., et al.: Object-region video transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3148\u20133159 (2022)","DOI":"10.1109\/CVPR52688.2022.00315"},{"key":"19_CR31","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: International Conference on Machine Learning, pp. 2790\u20132799. PMLR (2019)"},{"key":"19_CR32","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"19_CR33","unstructured":"Jia, B., Liu, Y., Huang, S.: Unsupervised object-centric learning with bi-level optimized query slot attention. arXiv preprint arXiv:2210.08990 (2022)"},{"key":"19_CR34","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"19_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"709","DOI":"10.1007\/978-3-031-19827-4_41","volume-title":"ECCV 2022, Part XXXIII","author":"M Jia","year":"2022","unstructured":"Jia, M., et al.: Visual prompt tuning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXIII. LNCS, vol. 13693, pp. 709\u2013727. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_41"},{"key":"19_CR36","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/978-3-031-19833-5_7","volume-title":"ECCV 2022, Part XXXV","author":"C Ju","year":"2022","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part XXXV. LNCS, vol. 13695, pp. 105\u2013124. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_7"},{"key":"19_CR37","first-page":"20146","volume":"34","author":"R Kabra","year":"2021","unstructured":"Kabra, R., et al.: SIMONe: view-invariant, temporally-abstracted object representations via unsupervised video decomposition. Adv. Neural. Inf. Process. Syst. 34, 20146\u201320159 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR38","unstructured":"Kay, W., et\u00a0al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"19_CR39","unstructured":"Kipf, T., et al.: Conditional object-centric learning from video. In: International Conference on Learning Representations (2022). https:\/\/openreview.net\/forum?id=aD7uesX1GF_"},{"key":"19_CR40","unstructured":"Lafferty, J., McCallum, A., Pereira, F.C.: Conditional random fields: probabilistic models for segmenting and labeling sequence data (2001)"},{"key":"19_CR41","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. 
arXiv preprint arXiv:2104.08691 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"19_CR42","doi-asserted-by":"crossref","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: optimizing continuous prompts for generation. arXiv preprint arXiv:2101.00190 (2021)","DOI":"10.18653\/v1\/2021.acl-long.353"},{"key":"19_CR43","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: MViTv2: improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4804\u20134814 (2022)","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"19_CR44","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. arXiv preprint arXiv:2210.04150 (2022)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"19_CR45","unstructured":"Lin, Z., et al.: Space: unsupervised object-oriented scene representation via spatial attention and decomposition. arXiv preprint arXiv:2001.02407 (2020)"},{"key":"19_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"388","DOI":"10.1007\/978-3-031-19833-5_23","volume-title":"ECCV 2022","author":"Z Lin","year":"2022","unstructured":"Lin, Z., et al.: Frozen clip models are efficient video learners. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 388\u2013404. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_23"},{"key":"19_CR47","doi-asserted-by":"crossref","unstructured":"Liu, R., Huang, J., Li, G., Feng, J., Wu, X., Li, T.H.: Revisiting temporal modeling for clip-based image-to-video knowledge transferring. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6555\u20136564, June 2023","DOI":"10.1109\/CVPR52729.2023.00634"},{"key":"19_CR48","doi-asserted-by":"crossref","unstructured":"Liu, X., et al.: P-tuning V2: prompt tuning can be comparable to fine-tuning universally across scales and tasks. arXiv preprint arXiv:2110.07602 (2021)","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"19_CR49","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"19_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Video swin transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3202\u20133211 (2022)","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"19_CR51","first-page":"11525","volume":"33","author":"F Locatello","year":"2020","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. Adv. Neural. Inf. Process. Syst. 33, 11525\u201311538 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR52","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: International Conference on Learning Representations (2018)"},{"key":"19_CR53","doi-asserted-by":"crossref","unstructured":"Materzynska, J., Xiao, T., Herzig, R., Xu, H., Wang, X., Darrell, T.: Something-else: compositional action recognition with spatial-temporal interaction networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
1049\u20131059 (2020)","DOI":"10.1109\/CVPR42600.2020.00113"},{"key":"19_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1007\/978-3-030-01246-5_11","volume-title":"Computer Vision \u2013 ECCV 2018","author":"T Nagarajan","year":"2018","unstructured":"Nagarajan, T., Grauman, K.: Attributes as operators: factorizing unseen attribute-object compositions. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11205, pp. 172\u2013190. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01246-5_11"},{"key":"19_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1007\/978-3-031-19772-7_1","volume-title":"ECCV 2022, Part IV","author":"B Ni","year":"2022","unstructured":"Ni, B., et al.: Expanding language-image pretrained models for general video recognition. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022, Part IV. LNCS, vol. 13664, pp. 1\u201318. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19772-7_1"},{"key":"19_CR56","unstructured":"Oquab, M., et\u00a0al.: DINOv2: learning robust visual features without supervision. arXiv preprint arXiv:2304.07193 (2023)"},{"key":"19_CR57","unstructured":"Pan, J., Lin, Z., Zhu, X., Shao, J., Li, H.: ST-Adapter: parameter-efficient image-to-video transfer learning for action recognition. arXiv preprint arXiv:2206.13559 (2022)"},{"key":"19_CR58","doi-asserted-by":"crossref","unstructured":"Park, J., Lee, J., Sohn, K.: Dual-path adaptation from image to video transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2203\u20132213, June 2023","DOI":"10.1109\/CVPR52729.2023.00219"},{"key":"19_CR59","unstructured":"Peh, E., Parmar, P., Fernando, B.: Learning to visually connect actions and their effects. arXiv preprint arXiv:2401.10805 (2024)"},{"key":"19_CR60","doi-asserted-by":"crossref","unstructured":"Pfeiffer, J., Kamath, A., R\u00fcckl\u00e9, A., Cho, K., Gurevych, I.: AdapterFusion: non-destructive task composition for transfer learning. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pp. 487\u2013503 (2021)","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"19_CR61","doi-asserted-by":"crossref","unstructured":"Pfeiffer, J., et al.: AdapterHub: a framework for adapting transformers. arXiv preprint arXiv:2007.07779 (2020)","DOI":"10.18653\/v1\/2020.emnlp-demos.7"},{"key":"19_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1007\/978-3-031-19809-0_9","volume-title":"ECCV 2022","author":"R Qian","year":"2022","unstructured":"Qian, R., Ding, S., Liu, X., Lin, D.: Static and dynamic concepts for self-supervised video representation learning. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13686, pp. 145\u2013164. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19809-0_9"},{"key":"19_CR63","doi-asserted-by":"crossref","unstructured":"Qian, R., Ding, S., Liu, X., Lin, D.: Semantics meets temporal correspondence: self-supervised object-centric learning in videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
16675\u201316687 (2023)","DOI":"10.1109\/ICCV51070.2023.01529"},{"key":"19_CR64","unstructured":"Qian, R., et al.: Streaming long video understanding with large language models. arXiv preprint arXiv:2405.16009 (2024)"},{"key":"19_CR65","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Enhancing self-supervised video representation learning via multi-level feature optimization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7990\u20138001 (2021)","DOI":"10.1109\/ICCV48922.2021.00789"},{"key":"19_CR66","doi-asserted-by":"crossref","unstructured":"Qian, R., et al.: Spatiotemporal contrastive video representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6964\u20136974 (2021)","DOI":"10.1109\/CVPR46437.2021.00689"},{"key":"19_CR67","doi-asserted-by":"crossref","unstructured":"Qin, G., Eisner, J.: Learning how to ask: querying LMS with mixtures of soft prompts. arXiv preprint arXiv:2104.06599 (2021)","DOI":"10.18653\/v1\/2021.naacl-main.410"},{"key":"19_CR68","doi-asserted-by":"crossref","unstructured":"Qing, Z., et al.: Disentangling spatial and temporal learning for efficient image-to-video transfer learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13934\u201313944, October 2023","DOI":"10.1109\/ICCV51070.2023.01281"},{"key":"19_CR69","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"19_CR70","doi-asserted-by":"crossref","unstructured":"Shin, T., Razeghi, Y., Logan\u00a0IV, R.L., Wallace, E., Singh, S.: AutoPrompt: eliciting knowledge from language models with automatically generated prompts. arXiv preprint arXiv:2010.15980 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.346"},{"key":"19_CR71","doi-asserted-by":"crossref","unstructured":"Sou\u010dek, T., Alayrac, J.B., Miech, A., Laptev, I., Sivic, J.: Look for the change: learning object states and state-modifying actions from untrimmed web videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13956\u201313966 (2022)","DOI":"10.1109\/CVPR52688.2022.01357"},{"key":"19_CR72","doi-asserted-by":"crossref","unstructured":"Sung, Y.L., Cho, J., Bansal, M.: VL-adapter: parameter-efficient transfer learning for vision-and-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5227\u20135237 (2022)","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"19_CR73","first-page":"10078","volume":"35","author":"Z Tong","year":"2022","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural. Inf. Process. Syst. 35, 10078\u201310093 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"19_CR74","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"19_CR75","doi-asserted-by":"crossref","unstructured":"Tran, D., Wang, H., Torresani, L., Ray, J., LeCun, Y., Paluri, M.: A closer look at spatiotemporal convolutions for action recognition. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6450\u20136459 (2018)","DOI":"10.1109\/CVPR.2018.00675"},{"key":"19_CR76","doi-asserted-by":"crossref","unstructured":"Ventura, C., Bellver, M., Girbau, A., Salvador, A., Marques, F., Giro-i Nieto, X.: RVOS: end-to-end recurrent network for video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5277\u20135286 (2019)","DOI":"10.1109\/CVPR.2019.00542"},{"key":"19_CR77","unstructured":"Wang, M., Xing, J., Liu, Y.: ActionCLIP: a new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)"},{"key":"19_CR78","doi-asserted-by":"crossref","unstructured":"Wang, W., et\u00a0al.: Image as a foreign language: Beit pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19175\u201319186 (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"19_CR79","doi-asserted-by":"crossref","unstructured":"Wang, X., Farhadi, A., Gupta, A.: Actions$$\\tilde{\\,}$$ transformations. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2658\u20132667 (2016)","DOI":"10.1109\/CVPR.2016.291"},{"key":"19_CR80","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"413","DOI":"10.1007\/978-3-030-01228-1_25","volume-title":"Computer Vision \u2013 ECCV 2018","author":"X Wang","year":"2018","unstructured":"Wang, X., Gupta, A.: Videos as space-time region graphs. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11209, pp. 413\u2013431. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01228-1_25"},{"key":"19_CR81","doi-asserted-by":"crossref","unstructured":"Wang, X., Misra, I., Zeng, Z., Girdhar, R., Darrell, T.: VideoCutLER: surprisingly simple unsupervised video instance segmentation. arXiv preprint arXiv:2308.14710 (2023)","DOI":"10.1109\/CVPR52733.2024.02147"},{"key":"19_CR82","unstructured":"Xie, J., Xie, W., Zisserman, A.: Segmenting moving objects via an object-centric layered representation. In: Advances in Neural Information Processing Systems (2022)"},{"key":"19_CR83","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: GroupViT: semantic segmentation emerges from text supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144 (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"19_CR84","doi-asserted-by":"crossref","unstructured":"Yan, S., et al.: Multiview transformers for video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3333\u20133343 (2022)","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"19_CR85","doi-asserted-by":"crossref","unstructured":"Yang, C., Lamdouar, H., Lu, E., Zisserman, A., Xie, W.: Self-supervised video object segmentation by motion grouping. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 7177\u20137188, October 2021","DOI":"10.1109\/ICCV48922.2021.00709"},{"key":"19_CR86","unstructured":"Yang, T., Zhu, Y., Xie, Y., Zhang, A., Chen, C., Li, M.: AIM: adapting image models for efficient video action recognition. In: The Eleventh International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=CIoSZ_HKHS7"},{"key":"19_CR87","unstructured":"Yuan, L., et\u00a0al.: Florence: a new foundation model for computer vision. 
arXiv preprint arXiv:2111.11432 (2021)"},{"key":"19_CR88","unstructured":"Zadaianchuk, A., Kleindessner, M., Zhu, Y., Locatello, F., Brox, T.: Unsupervised semantic segmentation with self-supervised object-centric representations. arXiv preprint arXiv:2207.05027 (2022)"},{"key":"19_CR89","unstructured":"Zaken, E.B., Ravfogel, S., Goldberg, Y.: BitFit: simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv preprint arXiv:2106.10199 (2021)"},{"key":"19_CR90","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K.D., Hu, D.H., Chang, S.F.: Open-vocabulary object detection using captions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402 (2021)","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"19_CR91","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Object-centric video representation for long-term action anticipation. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 6751\u20136761 (2024)","DOI":"10.1109\/WACV57701.2024.00661"},{"key":"19_CR92","unstructured":"Zhang, C., Gupta, A., Zisserman, A.: Is an object-centric video representation beneficial for transfer? In: Proceedings of the Asian Conference on Computer Vision, pp. 1976\u20131994 (2022)"},{"key":"19_CR93","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tokmakov, P., Hebert, M., Schmid, C.: A structured model for action detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9975\u20139984 (2019)","DOI":"10.1109\/CVPR.2019.01021"},{"key":"19_CR94","unstructured":"Zhou, J., et al.: iBOT: image BERT pre-training with online tokenizer. arXiv preprint arXiv:2111.07832 (2021)"},{"key":"19_CR95","doi-asserted-by":"crossref","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16816\u201316825 (2022)","DOI":"10.1109\/CVPR52688.2022.01631"},{"issue":"9","key":"19_CR96","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou, K., Yang, J., Loy, C.C., Liu, Z.: Learning to prompt for vision-language models. Int. J. Comput. Vision 130(9), 2337\u20132348 (2022)","journal-title":"Int. J. Comput. 
Vision"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72775-7_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:22:11Z","timestamp":1732828931000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72775-7_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031727740","9783031727757"],"references-count":96,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72775-7_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}