{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T07:02:54Z","timestamp":1773212574093,"version":"3.50.1"},"publisher-location":"Cham","reference-count":98,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726545","type":"print"},{"value":"9783031726552","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72655-2_10","type":"book-chapter","created":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T10:11:20Z","timestamp":1733393480000},"page":"166-185","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Referring Atomic Video Action Recognition"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5419-9292","authenticated-orcid":false,"given":"Kunyu","family":"Peng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3798-8603","authenticated-orcid":false,"given":"Jia","family":"Fu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1090-667X","authenticated-orcid":false,"given":"Kailun","family":"Yang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1693-7912","authenticated-orcid":false,"given":"Di","family":"Wen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3670-4567","authenticated-orcid":false,"given":"Yufan","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5245-2277","authenticated-orcid":false,"given":"Ruiping","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-4390-3044","authenticated-orcid":false,"given":"Junwei","family":"Zheng","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3471-328X","authenticated-orcid":false,"given":"Jiaming","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"M. Saquib","family":"Sarfraz","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8046-4945","authenticated-orcid":false,"given":"Rainer","family":"Stiefelhagen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4724-9164","authenticated-orcid":false,"given":"Alina","family":"Roitberg","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,6]]},"reference":[{"key":"10_CR1","doi-asserted-by":"crossref","unstructured":"Bagad, P., Tapaswi, M., Snoek, C.G.M.: Test of time: instilling video-language models with a sense of time. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00247"},{"key":"10_CR2","first-page":"7208","volume":"25","author":"Y Bu","year":"2022","unstructured":"Bu, Y., et al.: Scene-text oriented referring expression comprehension. 
TMM 25, 7208\u20137221 (2022)","journal-title":"TMM"},{"key":"10_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"10_CR5","unstructured":"Castro, S., Deng, N., Huang, P., Burzo, M., Mihalcea, R.: In-the-wild video question answering. In: COLING (2022)"},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Chai, W., Guo, X., Wang, G., Lu, Y.: StableVideo: text-driven consistency-aware diffusion video editing. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"10_CR7","unstructured":"Chen, J., Zhu, D., Haydarov, K., Li, X., Elhoseiny, M.: Video ChatCaptioner: towards enriched spatiotemporal descriptions. arXiv preprint arXiv:2304.04227 (2023)"},{"key":"10_CR8","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, J., Lin, L., Qi, Z., Ma, J., Shan, Y.: Tagging before alignment: integrating multi-modal tags for video-text retrieval. arXiv preprint arXiv:2301.12644 (2023)","DOI":"10.1609\/aaai.v37i1.25113"},{"key":"10_CR10","doi-asserted-by":"crossref","unstructured":"Chen, Z., Ma, L., Luo, W., Wong, K.Y.K.: Weakly-supervised spatio-temporally grounding natural sentence in video. arXiv preprint arXiv:1906.02549 (2019)","DOI":"10.18653\/v1\/P19-1183"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Chung, J., Wuu, C.H., Yang, H.R., Tai, Y.W., Tang, C.K.: HAA500: human-centric atomic action dataset with curated videos. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01321"},{"key":"10_CR12","unstructured":"Dang, R., et al.: InstructDET: diversifying referring object detection with generalized instructions. arXiv preprint arXiv:2310.05136 (2023)"},{"key":"10_CR13","doi-asserted-by":"crossref","unstructured":"Deruyttere, T., Vandenhende, S., Grujicic, D., Van\u00a0Gool, L., Moens, M.F.: Talk2Car: taking control of your self-driving car. In: EMNLP (2019)","DOI":"10.18653\/v1\/D19-1215"},{"key":"10_CR14","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: ACL (2019)"},{"key":"10_CR15","unstructured":"Dosovitskiy, A., et al.: An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3D: expanding architectures for efficient video recognition. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. 
In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Gandhi, M., Gul, M.O., Prakash, E., Grunde-McLaughlin, M., Krishna, R., Agrawala, M.: Measuring compositional consistency for video question answering. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00499"},{"key":"10_CR19","doi-asserted-by":"crossref","unstructured":"Gao, D., Zhou, L., Ji, L., Zhu, L., Yang, Y., Shou, M.Z.: MIST: multi-modal iterative spatial-temporal transformer for long-form video question answering. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01419"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Garcia, N., Otani, M., Chu, C., Nakashima, Y.: KnowIT VQA: answering knowledge-based questions about videos. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6713"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Ghodrati, A., Li, Z., Snoek, C.G.: Actor and action video segmentation from a sentence. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00624"},{"key":"10_CR22","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Gritsenko, A., et al.: End-to-end spatio-temporal action localisation with video transformers. arXiv preprint arXiv:2304.12160 (2023)","DOI":"10.1109\/CVPR52733.2024.01739"},{"key":"10_CR24","doi-asserted-by":"crossref","unstructured":"Gu, C., et al.: AVA: a video dataset of spatio-temporally localized atomic visual actions. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00633"},{"key":"10_CR25","first-page":"6730","volume":"30","author":"W Guo","year":"2021","unstructured":"Guo, W., Zhang, Y., Yang, J., Yuan, X.: End-to-end object detection with transformers. TIP 30, 6730\u20136743 (2021)","journal-title":"TIP"},{"key":"10_CR26","doi-asserted-by":"crossref","unstructured":"Han, D., Ye, T., Han, Y., Xia, Z., Song, S., Huang, G.: Agent attention: on the integration of softmax and linear attention. arXiv preprint arXiv:2312.08874 (2023)","DOI":"10.1007\/978-3-031-72973-7_8"},{"key":"10_CR27","first-page":"2742","volume":"29","author":"Y Ji","year":"2020","unstructured":"Ji, Y., Zhan, Y., Yang, Y., Xu, X., Shen, F., Shen, H.T.: A context knowledge map guided coarse-to-fine action recognition. TIP 29, 2742\u20132752 (2020)","journal-title":"TIP"},{"key":"10_CR28","doi-asserted-by":"crossref","unstructured":"Jiang, J., Chen, Z., Lin, H., Zhao, X., Gao, Y.: Divide and conquer: question-guided spatio-temporal contextual attention for video question answering. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6766"},{"key":"10_CR29","doi-asserted-by":"crossref","unstructured":"Jin, L., et al.: RefCLIP: a universal teacher for weakly supervised referring expression comprehension. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00263"},{"key":"10_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"123","DOI":"10.1007\/978-3-030-20870-7_8","volume-title":"Computer Vision \u2013 ACCV 2018","author":"A Khoreva","year":"2019","unstructured":"Khoreva, A., Rohrbach, A., Schiele, B.: Video object segmentation with language referring expressions. In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) ACCV 2018. LNCS, vol. 11364, pp. 123\u2013141. Springer, Cham (2019). 
https:\/\/doi.org\/10.1007\/978-3-030-20870-7_8"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Kim, M., Spinola, F., Benz, P., Kim, T.H.: A*: atrous spatial temporal action recognition for real time applications. In: WACV (2024)","DOI":"10.1109\/WACV57701.2024.00686"},{"key":"10_CR32","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"10_CR33","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"10_CR34","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: HMDB: a large video database for human motion recognition. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"10_CR35","doi-asserted-by":"crossref","unstructured":"Laput, G., Harrison, C.: Sensing fine-grained hand activity with smartwatches. In: CHI (2019)","DOI":"10.1145\/3290605.3300568"},{"key":"10_CR36","doi-asserted-by":"crossref","unstructured":"Le, T.M., Le, V., Venkatesh, S., Tran, T.: Hierarchical conditional relation networks for video question answering. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"10_CR37","doi-asserted-by":"crossref","unstructured":"Lea, C., Vidal, R., Hager, G.D.: Learning convolutional action primitives for fine-grained action recognition. In: ICRA (2016)","DOI":"10.1109\/ICRA.2016.7487305"},{"key":"10_CR38","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Revealing single frame bias for video-and-language learning. arXiv preprint arXiv:2206.03428 (2022)"},{"key":"10_CR39","doi-asserted-by":"crossref","unstructured":"Lei, J., et al.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"10_CR40","doi-asserted-by":"crossref","unstructured":"Li, G., Wei, Y., Tian, Y., Xu, C., Wen, J.R., Hu, D.: Learning to answer questions in dynamic audio-visual scenarios. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01852"},{"key":"10_CR41","doi-asserted-by":"crossref","unstructured":"Li, J., Niu, L., Zhang, L.: From representation to reasoning: towards both evidence and commonsense reasoning for video question-answering. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.02059"},{"key":"10_CR42","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: ICML (2023)"},{"key":"10_CR43","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: ICML (2022)"},{"key":"10_CR44","unstructured":"Li, K., et al.: VideoChat: chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"10_CR45","doi-asserted-by":"crossref","unstructured":"Li, L., et al.: LAVENDER: unifying video-language understanding as masked language modeling. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"10_CR46","doi-asserted-by":"crossref","unstructured":"Li, R., et al.: Referring image segmentation via recurrent refinement networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00602"},{"key":"10_CR47","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: MViTv2: improved multiscale vision transformers for classification and detection. 
In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"10_CR48","doi-asserted-by":"crossref","unstructured":"Lin, J., et al.: EchoTrack: auditory referring multi-object tracking for autonomous driving. arXiv preprint arXiv:2402.18302 (2024)","DOI":"10.1109\/TITS.2024.3437645"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Lin, X., et al.: Towards fast adaptation of pretrained contrastive models for multi-channel video-language retrieval. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01426"},{"key":"10_CR50","doi-asserted-by":"crossref","unstructured":"Liu, J., Wang, L., Yang, M.H.: Referring expression generation and comprehension via attributes. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.520"},{"key":"10_CR51","doi-asserted-by":"crossref","unstructured":"Liu, R., et al.: Open scene understanding: grounded situation recognition meets segment anything for helping people with visual impairments. In: ICCVW (2023)","DOI":"10.1109\/ICCVW60793.2023.00200"},{"key":"10_CR52","doi-asserted-by":"crossref","unstructured":"Liu, R., Liu, C., Bai, Y., Yuille, A.L.: CLEVR-Ref+: diagnosing visual reasoning with referring expressions. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00431"},{"issue":"9","key":"10_CR53","first-page":"4761","volume":"44","author":"S Liu","year":"2021","unstructured":"Liu, S., Hui, T., Huang, S., Wei, Y., Li, B., Li, G.: Cross-modal progressive comprehension for referring segmentation. TPAMI 44(9), 4761\u20134775 (2021)","journal-title":"TPAMI"},{"issue":"10","key":"10_CR54","doi-asserted-by":"publisher","first-page":"11624","DOI":"10.1109\/TPAMI.2023.3284038","volume":"45","author":"Y Liu","year":"2023","unstructured":"Liu, Y., Li, G., Lin, L.: Cross-modal causal relational reasoning for event-level visual question answering. TPAMI 45(10), 11624\u201311641 (2023)","journal-title":"TPAMI"},{"key":"10_CR55","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: CLIP4Clip: an empirical study of CLIP for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., Ji, R.: X-CLIP: end-to-end multi-grained contrastive learning for video-text retrieval. In: MM (2022)","DOI":"10.1145\/3503161.3547910"},{"key":"10_CR57","doi-asserted-by":"publisher","first-page":"669","DOI":"10.1007\/978-3-031-28244-7_42","volume-title":"ECIR","author":"A Madasu","year":"2023","unstructured":"Madasu, A., Aflalo, E., Ben Melech Stan, G., Tseng, S.Y., Bertasius, G., Lal, V.: Improving video retrieval using multilingual knowledge transfer. In: Kamps, J., et al. (eds.) ECIR 2023. LNCS, vol. 13980, pp. 669\u2013684. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-28244-7_42"},{"key":"10_CR58","doi-asserted-by":"crossref","unstructured":"McIntosh, B., Duarte, K., Rawat, Y.S., Shah, M.: Visual-textual capsule routing for text-based video segmentation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00996"},{"key":"10_CR59","unstructured":"Nagrani, A., Yang, S., Arnab, A., Jansen, A., Schmid, C., Sun, C.: Attention bottlenecks for multimodal fusion. In: NeuIPS (2021)"},{"key":"10_CR60","unstructured":"OpenAI: ChatGPT: optimizing language models for dialogue (2022). 
https:\/\/openai.com\/"},{"key":"10_CR61","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2Text: describing images using 1 million captioned photographs. In: NeurIPS (2011)"},{"key":"10_CR62","doi-asserted-by":"publisher","first-page":"160","DOI":"10.1007\/978-3-031-08648-9_19","volume-title":"ICCHP 2022","author":"W Ou","year":"2022","unstructured":"Ou, W., et al.: Indoor navigation assistance for visually impaired people via dynamic SLAM and panoptic segmentation with an RGB-D sensor. In: Miesenberger, K., Kouroupetroglou, G., Mavrou, K., Manduchi, R., Covarrubias Rodriguez, M., Pen\u00e1z, P. (eds.) ICCHP 2022. LNCS, vol. 13341, pp. 160\u2013168. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-08648-9_19"},{"key":"10_CR63","doi-asserted-by":"crossref","unstructured":"Peng, K., Roitberg, A., Yang, K., Zhang, J., Stiefelhagen, R.: TransDARC: transformer-based driver activity recognition with latent space feature calibration. In: IROS (2022)","DOI":"10.1109\/IROS47612.2022.9981445"},{"key":"10_CR64","doi-asserted-by":"crossref","unstructured":"Pramanick, P., Sarkar, C., Paul, S., dev Roychoudhury, R., Bhowmick, B.: DoRO: Disambiguation of referred object for embodied agents. RA-L 7(4), 10826\u201310833 (2022)","DOI":"10.1109\/LRA.2022.3195198"},{"key":"10_CR65","first-page":"625","volume":"24","author":"RRA Pramono","year":"2021","unstructured":"Pramono, R.R.A., Chen, Y.T., Fang, W.H.: Spatial-temporal action localization with hierarchical self-attention. TMM 24, 625\u2013639 (2021)","journal-title":"TMM"},{"key":"10_CR66","doi-asserted-by":"crossref","unstructured":"Qiu, H., et al.: Language-aware fine-grained object representation for referring expression comprehension. In: MM (2020)","DOI":"10.1145\/3394171.3413850"},{"key":"10_CR67","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"10_CR68","doi-asserted-by":"crossref","unstructured":"Rajasegaran, J., Pavlakos, G., Kanazawa, A., Feichtenhofer, C., Malik, J.: On the benefits of 3D pose and tracking for human action recognition. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00069"},{"key":"10_CR69","unstructured":"Ryali, C., et al.: Hiera: a hierarchical vision transformer without the bells-and-whistles. In: ICML (2023)"},{"key":"10_CR70","doi-asserted-by":"crossref","unstructured":"Saha, J., Chowdhury, C., Chowdury, I.R., Roy, P.: Fine grained activity recognition using smart handheld. In: ICDCN (2018)","DOI":"10.1145\/3170521.3170540"},{"key":"10_CR71","doi-asserted-by":"crossref","unstructured":"Seibold, C.M., Rei\u00df, S., Kleesiek, J., Stiefelhagen, R.: Reference-guided pseudo-label generation for medical semantic segmentation. In: AAAI (2022)","DOI":"10.1609\/aaai.v36i2.20114"},{"key":"10_CR72","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"208","DOI":"10.1007\/978-3-030-58555-6_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Seo","year":"2020","unstructured":"Seo, S., Lee, J.-Y., Han, B.: URVOS: unified referring video object segmentation network with a large-scale benchmark. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12360, pp. 208\u2013223. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58555-6_13"},{"key":"10_CR73","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., Lin, D.: FineGym: a hierarchical video dataset for fine-grained action understanding. 
In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"10_CR74","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"10_CR75","doi-asserted-by":"crossref","unstructured":"Shi, H., Pan, W., Zhao, Z., Zhang, M., Wu, F.: Unsupervised domain adaptation for referring semantic segmentation. In: MM (2023)","DOI":"10.1145\/3581783.3611879"},{"key":"10_CR76","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"38","DOI":"10.1007\/978-3-030-01231-1_3","volume-title":"Computer Vision \u2013 ECCV 2018","author":"H Shi","year":"2018","unstructured":"Shi, H., Li, H., Meng, F., Wu, Q.: Key-word-aware network for referring expression image segmentation. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 38\u201354. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_3"},{"issue":"2","key":"10_CR77","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3546828","volume":"19","author":"Y Shi","year":"2023","unstructured":"Shi, Y., Xu, H., Yuan, C., Li, B., Hu, W., Zha, Z.J.: Learning video-text aligned representations for video captioning. TOMM 19(2), 1\u201321 (2023)","journal-title":"TOMM"},{"key":"10_CR78","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)"},{"key":"10_CR79","first-page":"1403","volume":"32","author":"Y Su","year":"2023","unstructured":"Su, Y., Wang, W., Liu, J., Ma, S., Yang, X.: Sequence as a whole: a unified framework for video action localization with long-range text query. TIP 32, 1403\u20131418 (2023)","journal-title":"TIP"},{"key":"10_CR80","doi-asserted-by":"crossref","unstructured":"Vasudevan, A.B., Dai, D., Van\u00a0Gool, L.: Object referring in videos with language and human gaze. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00434"},{"key":"10_CR81","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: VideoMAE V2: scaling video masked autoencoders with dual masking. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"10_CR82","doi-asserted-by":"crossref","unstructured":"Wang, M., Xing, J., Mei, J., Liu, Y., Jiang, Y.: ActionCLIP: adapting language-image pretrained models for video action recognition. TNNLS (2023)","DOI":"10.1109\/TNNLS.2023.3331841"},{"key":"10_CR83","doi-asserted-by":"crossref","unstructured":"Wang, R., et al.: Masked video distillation: rethinking masked feature modeling for self-supervised video representation learning. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00611"},{"issue":"12","key":"10_CR84","first-page":"7645","volume":"33","author":"S Wang","year":"2023","unstructured":"Wang, S., Yan, R., Huang, P., Dai, G., Song, Y., Shu, X.: Com-STAL: compositional spatio-temporal action localization. TCSVT 33(12), 7645\u20137657 (2023)","journal-title":"TCSVT"},{"key":"10_CR85","unstructured":"Wang, Y., et al.: InternVideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"10_CR86","doi-asserted-by":"crossref","unstructured":"Wu, D., Han, W., Wang, T., Dong, X., Zhang, X., Shen, J.: Referring multi-object tracking. 
In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01406"},{"key":"10_CR87","doi-asserted-by":"crossref","unstructured":"Wu, W., Luo, H., Fang, B., Wang, J., Ouyang, W.: Cap4Video: what can auxiliary captions do for text-video retrieval? In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"10_CR88","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.S.: NExT-QA: next phase of question-answering to explaining temporal actions. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"10_CR89","doi-asserted-by":"crossref","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Just ask: learning to answer questions from millions of narrated videos. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"10_CR90","doi-asserted-by":"crossref","unstructured":"Yang, P., et al.: AVQA: a dataset for audio-visual question answering on videos. In: MM (2022)","DOI":"10.1145\/3503161.3548291"},{"key":"10_CR91","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"10_CR92","doi-asserted-by":"crossref","unstructured":"Yu, Z., et al.: ActivityNet-QA: a dataset for understanding complex web videos via question answering. In: AAAI (2019)","DOI":"10.1609\/aaai.v33i01.33019127"},{"key":"10_CR93","doi-asserted-by":"crossref","unstructured":"Yuan, Z., et al.: InstanceRefer: cooperative holistic understanding for visual grounding on point clouds through instance multi-level contextual referring. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00181"},{"key":"10_CR94","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1007\/978-3-031-19812-0_38","volume-title":"ECCV 2022","author":"F Zeng","year":"2022","unstructured":"Zeng, F., Dong, B., Zhang, Y., Wang, T., Zhang, X., Wei, Y.: MOTR: end-to-end multiple-object tracking with transformer. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13687, pp. 659\u2013675. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19812-0_38"},{"key":"10_CR95","doi-asserted-by":"crossref","unstructured":"Zhang, G., Ren, J., Gu, J., Tresp, V.: Multi-event video-text retrieval. In: CVPR (2023)","DOI":"10.1109\/ICCV51070.2023.02021"},{"key":"10_CR96","doi-asserted-by":"crossref","unstructured":"Zhang, H., Li, X., Bing, L.: Video-LLaMA: an instruction-tuned audio-visual language model for video understanding. In: EMNLP (2023)","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"10_CR97","doi-asserted-by":"crossref","unstructured":"Zheng, J., Zhang, J., Yang, K., Peng, K., Stiefelhagen, R.: MateRobot: material recognition in wearable robotics for people with visual impairments. In: ICRA (2024)","DOI":"10.1109\/ICRA57147.2024.10610333"},{"key":"10_CR98","doi-asserted-by":"crossref","unstructured":"Zong, D., Sun, S.: McOmet: multimodal fusion transformer for physical audiovisual commonsense reasoning. 
In: AAAI (2023)","DOI":"10.1609\/aaai.v37i13.27728"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72655-2_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T11:29:40Z","timestamp":1733398180000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72655-2_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"ISBN":["9783031726545","9783031726552"],"references-count":98,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72655-2_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"6 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
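
The record above is a Crossref REST API "work" response for DOI 10.1007/978-3-031-72655-2_10. A minimal sketch of retrieving and reading it follows, assuming network access to the public api.crossref.org works endpoint; the User-Agent contact address and script name are placeholders, and the field accesses mirror the structure shown above:

```python
# Minimal sketch: fetch this Crossref work record by DOI and print a few
# of the fields visible in the JSON above. Assumes the public Crossref
# REST API (api.crossref.org); the mailto address below is a placeholder.
import json
import urllib.request

DOI = "10.1007/978-3-031-72655-2_10"
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "metadata-check/0.1 (mailto:you@example.org)"},
)
with urllib.request.urlopen(req) as resp:
    record = json.load(resp)          # same shape as the JSON above

work = record["message"]              # the work-level metadata object
print(work["title"][0])               # Referring Atomic Video Action Recognition
print(work["type"], work["DOI"])      # book-chapter 10.1007/978-3-031-72655-2_10
print(work["references-count"])       # 98
print(", ".join(f'{a["given"]} {a["family"]}' for a in work["author"]))
```

Any JSON-capable HTTP client works the same way: the chapter-level metadata lives under "message", the 98 bibliography entries under "message" > "reference", and the conference details under "message" > "assertion".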