{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T19:09:52Z","timestamp":1757617792077,"version":"3.44.0"},"reference-count":70,"publisher":"Springer Science and Business Media LLC","issue":"8","license":[{"start":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T00:00:00Z","timestamp":1746230400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T00:00:00Z","timestamp":1746230400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s11263-025-02429-z","type":"journal-article","created":{"date-parts":[[2025,5,3]],"date-time":"2025-05-03T01:11:16Z","timestamp":1746234676000},"page":"5302-5325","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Effectively Leveraging CLIP for Generating Situational Summaries of Images and Videos"],"prefix":"10.1007","volume":"133","author":[{"given":"Dhruv","family":"Verma","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8779-1241","authenticated-orcid":false,"given":"Debaditya","family":"Roy","sequence":"additional","affiliation":[]},{"given":"Basura","family":"Fernando","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,3]]},"reference":[{"key":"2429_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J. B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., & Simonyan, K. (2022). Flamingo: A visual language model for few-shot learning. Advances in Neural Information Processing Systems, 35, 23716\u201323736.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2429_CR2","unstructured":"Ba, J. L., Kiros, J. R., & Hinton, G. E. (2016). Layer normalization. arXiv preprint arXiv:1607.06450"},{"key":"2429_CR3","doi-asserted-by":"crossref","unstructured":"Chiba, Y., & Higashinaka, R. (2021). Dialogue situation recognition for everyday conversation using multimodal information. In Interspeech (pp. 241\u2013245).","DOI":"10.21437\/Interspeech.2021-171"},{"key":"2429_CR4","doi-asserted-by":"crossref","unstructured":"Cho, J., Yoon, Y., & Kwak, S. (2022). Collaborative transformers for grounded situation recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 19659\u201319668).","DOI":"10.1109\/CVPR52688.2022.01904"},{"key":"2429_CR5","doi-asserted-by":"crossref","unstructured":"Cho, J., Yoon, Y., Lee, H., & Kwak, S. (2021). Grounded situation recognition with transformers. In British machine vision conference (BMVC).","DOI":"10.5244\/C.35.215"},{"key":"2429_CR6","doi-asserted-by":"crossref","unstructured":"Cong, Y., Liao, W., Ackermann, H., Rosenhahn, B., & Yang, M. Y. (2021). Spatial-temporal transformer for dynamic scene graph generation. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 16372\u201316382).","DOI":"10.1109\/ICCV48922.2021.01606"},{"key":"2429_CR7","doi-asserted-by":"crossref","unstructured":"Cooray, T., Cheung, N. M., & Lu, W. (2020). Attention-based context aware reasoning for situation recognition. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 4736\u20134745).","DOI":"10.1109\/CVPR42600.2020.00479"},{"key":"2429_CR8","doi-asserted-by":"crossref","unstructured":"Doveh, S., Arbelle, A., Harary, S., Schwartz, E., Herzig, R., Giryes, R., & Karlinsky, L. (2023). Teaching structured vision & language concepts to vision & language models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 2657\u20132668).","DOI":"10.1109\/CVPR52729.2023.00261"},{"key":"2429_CR9","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., & He, K. (2019). Slowfast networks for video recognition. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 6202\u20136211).","DOI":"10.1109\/ICCV.2019.00630"},{"key":"2429_CR10","doi-asserted-by":"crossref","unstructured":"Gu, X., Chen, G., Wang, Y., Zhang, L., Luo, T., & Wen, L. (2023). Text with knowledge graph augmented transformer for video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 18941\u201318951).","DOI":"10.1109\/CVPR52729.2023.01816"},{"key":"2429_CR11","doi-asserted-by":"crossref","unstructured":"Gu, J., Zhao, H., Lin, Z., Li, S., Cai, J., & Ling, M. (2019). Scene graph generation with external knowledge and image reconstruction. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 1969\u20131978).","DOI":"10.1109\/CVPR.2019.00207"},{"key":"2429_CR12","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y. T., Parekh, Z., Pham, H., & Duerig, T. (2021). Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning, PMLR (pp. 4904\u20134916)."},{"key":"2429_CR13","doi-asserted-by":"publisher","first-page":"120698","DOI":"10.1016\/j.eswa.2023.120698","volume":"231","author":"J Jia","year":"2023","unstructured":"Jia, J., Ding, X., Pang, S., Gao, X., Xin, X., Hu, R., & Nie, J. (2023). Image captioning based on scene graphs: A survey. Expert Systems with Applications, 231, 120698.","journal-title":"Expert Systems with Applications"},{"key":"2429_CR14","first-page":"7277","volume":"2023","author":"T Jiang","year":"2023","unstructured":"Jiang, T., & Riloff, E. (2023). Exploiting commonsense knowledge about objects for visual activity recognition. Findings of the Association for Computational Linguistics: ACL, 2023, 7277\u20137285.","journal-title":"Findings of the Association for Computational Linguistics: ACL"},{"key":"2429_CR15","unstructured":"Kenton, J. D. M. W. C., & Toutanova, L. K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In Proceedings of NAACL-HLT (pp. 4171\u20134186)."},{"key":"2429_CR16","first-page":"8199","volume":"35","author":"Z Khan","year":"2022","unstructured":"Khan, Z., Jawahar, C., & Tapaswi, M. (2022). Grounded video situation recognition. Advances in Neural Information Processing Systems, 35, 8199\u20138210.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2429_CR17","doi-asserted-by":"crossref","unstructured":"Kim, J., Park, J., Park, J., Kim, J., Kim, S., & Kim, H. J. (2024). Groupwise query specialization and quality-aware multi-assignment for transformer-based visual relationship detection. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 28160\u201328169).","DOI":"10.1109\/CVPR52733.2024.02660"},{"key":"2429_CR18","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/BF03037383","volume":"4","author":"R Kowalski","year":"1986","unstructured":"Kowalski, R., & Sergot, M. (1986). A logic-based calculus of events. New Generation Computing, 4, 67\u201395.","journal-title":"New Generation Computing"},{"key":"2429_CR19","doi-asserted-by":"crossref","unstructured":"Li, R., Tapaswi, M., Liao, R., Jia, J., Urtasun, R., & Fidler, S. (2017). Situation recognition with graph neural networks. In Proceedings of the IEEE international conference on computer vision (pp. 4173\u20134182).","DOI":"10.1109\/ICCV.2017.448"},{"key":"2429_CR20","doi-asserted-by":"crossref","unstructured":"Li, M., Xu, R., Wang, S., Zhou, L., Lin, X., Zhu, C., & Chang, S. F. (2022). Clip-event: Connecting text and images with event structures. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 16420\u201316429).","DOI":"10.1109\/CVPR52688.2022.01593"},{"key":"2429_CR21","unstructured":"Lin, C. Y. (2004). Rouge: A package for automatic evaluation of summaries. In Text summarization branches out (pp. 74\u201381)."},{"key":"2429_CR22","doi-asserted-by":"crossref","unstructured":"Lin, Z., Geng, S., Zhang, R. (2022). Frozen clip models are efficient video learners. In X. X. X. V. Part (Ed.), Computer Vision-ECCV 2022: 17th European conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings (pp. 388\u2013404). Springer.","DOI":"10.1007\/978-3-031-19833-5_23"},{"key":"2429_CR23","doi-asserted-by":"crossref","unstructured":"Lin, J., Yin, H., Ping, W., Molchanov, P., Shoeybi, M., & Han, S. (2024). Vila: On pre-training for visual language models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (CVPR) (pp. 26689\u201326699).","DOI":"10.1109\/CVPR52733.2024.02520"},{"issue":"3","key":"2429_CR24","doi-asserted-by":"publisher","first-page":"869","DOI":"10.1109\/TCDS.2021.3075862","volume":"14","author":"X Liu","year":"2021","unstructured":"Liu, X., Cao, Z., Yu, Y., Ren, G., Yu, J., & Tan, M. (2021). Robot navigation based on situational awareness. IEEE Transactions on Cognitive and Developmental Systems, 14(3), 869\u2013881.","journal-title":"IEEE Transactions on Cognitive and Developmental Systems"},{"key":"2429_CR25","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2023). Visual instruction tuning. Advances in Neural Information Processing Systems, 36, 34892\u201334916.","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"2","key":"2429_CR26","doi-asserted-by":"publisher","first-page":"1049","DOI":"10.1109\/TPAMI.2023.3327677","volume":"46","author":"G Li","year":"2023","unstructured":"Li, G., Ye, H., Qi, Y., Wang, S., Qing, L., Huang, Q., & Yang, M. H. (2023). Learning hierarchical modular networks for video captioning. IEEE Transactions on Pattern Analysis and Machine Intelligence, 46(2), 1049\u20131064.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2429_CR27","doi-asserted-by":"publisher","first-page":"127052","DOI":"10.1016\/j.neucom.2023.127052","volume":"566","author":"H Li","year":"2024","unstructured":"Li, H., Zhu, G., Zhang, L., Jiang, Y., Dang, Y., Hou, H., & Bennamoun, M. (2024). Scene graph generation: A comprehensive survey. Neurocomputing, 566, 127052.","journal-title":"Neurocomputing"},{"key":"2429_CR28","doi-asserted-by":"crossref","unstructured":"Lu, C., Krishna, R., Bernstein, M., & Fei-Fei, L. (2016). Visual relationship detection with language priors. In Computer vision\u2014ECCV 2016: 14th European conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14 (pp. 852\u2013869). Springer.","DOI":"10.1007\/978-3-319-46448-0_51"},{"key":"2429_CR29","doi-asserted-by":"crossref","unstructured":"Lu, Y., Zhang, Z., Yuan, C., Li, P., Wang, Y., Li, B., & Hu, W. (2024). Set prediction guided by semantic concepts for diverse video captioning. In Proceedings of the AAAI conference on artificial intelligence (pp. 3909\u20133917).","DOI":"10.1609\/aaai.v38i4.28183"},{"key":"2429_CR30","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., & Ji, R. (2022). X-clip: End-to-end multi-grained contrastive learning for video-text retrieval. In Proceedings of the 30th ACM international conference on multimedia (pp. 638\u2013647).","DOI":"10.1145\/3503161.3547910"},{"key":"2429_CR31","doi-asserted-by":"crossref","unstructured":"Malla, S., Dariush, B., & Choi, C. (2020). Titan: Future forecast using action priors. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 11186\u201311196).","DOI":"10.1109\/CVPR42600.2020.01120"},{"key":"2429_CR32","doi-asserted-by":"crossref","unstructured":"Mallya, A., & Lazebnik, S. (2017). Recurrent models for situation recognition. In Proceedings of the IEEE international conference on computer vision (pp. 455\u2013463).","DOI":"10.1109\/ICCV.2017.57"},{"key":"2429_CR33","doi-asserted-by":"crossref","unstructured":"McCarthy, J.(1963). Situations, actions, and causal laws. Comtex Scientific.","DOI":"10.21236\/AD0785031"},{"key":"2429_CR34","doi-asserted-by":"crossref","unstructured":"McCarthy, J., & Hayes, P. J. (1981). Some philosophical problems from the standpoint of artificial intelligence. In Readings in artificial intelligence (pp. 431\u2013450). Elsevier.","DOI":"10.1016\/B978-0-934613-03-3.50033-7"},{"key":"2429_CR35","doi-asserted-by":"crossref","unstructured":"Moosavi, N. S., & Strube, M. (2016). Which coreference evaluation metric do you trust? A proposal for a link-based entity aware metric. In Proceedings of the 54th annual meeting of the association for computational linguistics (pp. 632\u2013642). Association for Computational Linguistics.","DOI":"10.18653\/v1\/P16-1060"},{"key":"2429_CR36","unstructured":"Nair, V., & Hinton, G. E. (2010). Rectified linear units improve restricted Boltzmann machines. In Proceedings of the 27th international conference on machine learning (ICML-10) (pp. 807\u2013814)."},{"key":"2429_CR37","doi-asserted-by":"crossref","unstructured":"Nguyen, K., Tripathi, S., & Du, B. (2021). In defense of scene graphs for image captioning. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 1407\u20131416).","DOI":"10.1109\/ICCV48922.2021.00144"},{"key":"2429_CR38","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., & Manning, C. D. (2014). Glove: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP) (pp. 1532\u20131543).","DOI":"10.3115\/v1\/D14-1162"},{"key":"2429_CR39","doi-asserted-by":"crossref","unstructured":"Pratt, S., Yatskar, M., & Weihs, L. (2020). Grounded situation recognition. In Computer vision\u2014ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, proceedings, Part IV 16 (pp. 314\u2013332). Springer.","DOI":"10.1007\/978-3-030-58548-8_19"},{"key":"2429_CR40","unstructured":"Puig, X., Shu, T., & Li, S. (2020). Watch-and-help: A challenge for social perception and human\u2013Ai collaboration. In International conference on learning representations."},{"key":"2429_CR41","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, PMLR (pp. 8748\u20138763)."},{"key":"2429_CR42","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/4074.001.0001","volume-title":"Knowledge in action: Logical foundations for specifying and implementing dynamical systems","author":"R Reiter","year":"2001","unstructured":"Reiter, R. (2001). Knowledge in action: Logical foundations for specifying and implementing dynamical systems. MIT Press."},{"key":"2429_CR43","unstructured":"Roy, D., Verma, D., & Fernando, B. (2024). Clipsitu: Effectively leveraging clip for conditional predictions in situation recognition. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision (pp. 444\u2013453)."},{"key":"2429_CR44","doi-asserted-by":"crossref","unstructured":"Sadhu, A., Gupta, T., Yatskar, M., Nevatia, R., & Kembhavi, A. (2021). Visual semantic role labeling for video understanding. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 5589\u20135600).","DOI":"10.1109\/CVPR46437.2021.00554"},{"key":"2429_CR45","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., & Soricut, R. (2018). Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In Proceedings of the 56th annual meeting of the association for computational linguistics (Volume 1: Long Papers, pp. 2556\u20132565).","DOI":"10.18653\/v1\/P18-1238"},{"key":"2429_CR46","doi-asserted-by":"crossref","unstructured":"Suhail, M., & Sigal, L. (2019). Mixture-kernel graph attention network for situation recognition. In Proceedings of the IEEE\/CVF international conference on computer vision (pp. 10363\u201310372).","DOI":"10.1109\/ICCV.2019.01046"},{"key":"2429_CR47","doi-asserted-by":"crossref","unstructured":"Sung, Y. L., Cho, J., & Bansal, M. (2022). Vl-adapter: Parameter-efficient transfer learning for vision-and-language tasks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 5227\u20135237).","DOI":"10.1109\/CVPR52688.2022.00516"},{"key":"2429_CR48","first-page":"38","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., & Polosukhin, I. (2017). Attention is all you need. Advances in Neural Information Processing Systems, 30, 38.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2429_CR49","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., & Parikh, D. (2015). Cider: Consensus-based image description evaluation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4566\u20134575).","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"2429_CR50","doi-asserted-by":"publisher","first-page":"983","DOI":"10.1007\/s00371-012-0752-6","volume":"29","author":"S Vishwakarma","year":"2013","unstructured":"Vishwakarma, S., & Agrawal, A. (2013). A survey on activity recognition and behavior understanding in video surveillance. The Visual Computer, 29, 983\u20131009.","journal-title":"The Visual Computer"},{"key":"2429_CR51","unstructured":"Wang, P., Bai, S., & Tan, S., (2024). Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191"},{"key":"2429_CR52","doi-asserted-by":"publisher","unstructured":"Wang, D., Beck, D., & Cohn, T. (2019). On the role of scene graphs in image captioning. In: Mogadala, A., Klakow, D., Pezzelle, S. (eds.) Proceedings of the beyond vision and LANguage: inTEgrating Real-world kNowledge (LANTERN) (pp. 29\u201334). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/v1\/D19-6405","DOI":"10.18653\/v1\/D19-6405"},{"key":"2429_CR53","doi-asserted-by":"publisher","unstructured":"Wang, T., Chen, W., & Tian, Y. (2023). Improving image captioning via predicting structured concepts. In: Bouamor. H., Pino, J., Bali, K. (Eds.) Proceedings of the 2023 conference on empirical methods in natural language processing (pp. 360\u2013370). Association for Computational Linguistics. https:\/\/doi.org\/10.18653\/v1\/2023.emnlp-main.25","DOI":"10.18653\/v1\/2023.emnlp-main.25"},{"key":"2429_CR54","first-page":"12014","volume":"35","author":"Y Wan","year":"2022","unstructured":"Wan, Y., Mao, J., & Tenenbaum, J. (2022). Handmethat: Human\u2013robot communication in physical and social environments. Advances in Neural Information Processing Systems, 35, 12014\u201312026.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2429_CR55","doi-asserted-by":"crossref","unstructured":"Wei, M., Chen, L., Ji, W., Yue, X., & Chua, T. S. (2022). Rethinking the two-stage framework for grounded situation recognition. In Proceedings of the AAAI conference on artificial intelligence (pp. 2651\u20132658).","DOI":"10.1609\/aaai.v36i3.20167"},{"key":"2429_CR56","doi-asserted-by":"crossref","unstructured":"Wortsman, M., Ilharco, G., Kim, J. W., Li, M., Kornblith, S., Roelofs, R., & Schmidt, L. (2022). Robust fine-tuning of zero-shot models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 7959\u20137971).","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"2429_CR57","doi-asserted-by":"crossref","unstructured":"Xiao, F., Kundu, K., Tighe, J., & Modolo, D. (2022). Hierarchical self-supervised representation learning for movie understanding. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 9727\u20139736).","DOI":"10.1109\/CVPR52688.2022.00950"},{"key":"2429_CR58","doi-asserted-by":"crossref","unstructured":"Yang, G., Li, M., Zhang, J., Lin, X., Ji, H., & Chang, S. F. (2023a). Video event extraction via tracking visual states of arguments. In Proceedings of the AAAI conference on artificial intelligence (pp. 3136\u20133144).","DOI":"10.1609\/aaai.v37i3.25418"},{"key":"2429_CR59","doi-asserted-by":"crossref","unstructured":"Yang, X., Peng, J., Wang, Z., Xu, H., Ye, Q., Li, C., & Zhang, Y. (2023b). Transforming visual scene graphs to image captions. In Proceedings of the 61st annual meeting of the association for computational linguistics (Vol. 1: Long Papers, pp. 12427\u201312440).","DOI":"10.18653\/v1\/2023.acl-long.694"},{"key":"2429_CR60","unstructured":"Yang, T., Zhu, Y., Xie, Y., Zhang, A., Chen, C., & Li, M. (2022). Aim: Adapting image models for efficient video action recognition. In The eleventh international conference on learning representations."},{"key":"2429_CR61","unstructured":"Yao, Y., Yu, T., Zhang, A., Wang, C., Cui, J., Zhu, H., & Sun, M. (2024). Minicpm-v: A gpt-4v level mllm on your phone. CoRR."},{"key":"2429_CR62","doi-asserted-by":"crossref","unstructured":"Yatskar, M., Ordonez, V., Zettlemoyer, L., & Farhadi, A. (2017). Commonly uncommon: Semantic sparsity in situation recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 7196\u20137205).","DOI":"10.1109\/CVPR.2017.671"},{"key":"2429_CR63","doi-asserted-by":"crossref","unstructured":"Yatskar, M., Zettlemoyer, L., & Farhadi, A. (2016). Situation recognition: Visual semantic role labeling for image understanding. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5534\u20135542).","DOI":"10.1109\/CVPR.2016.597"},{"key":"2429_CR64","doi-asserted-by":"crossref","unstructured":"Ye, H., Li, G., Qi, Y., Wang, S., Huang, Q., & Yang, M. H. (2022). Hierarchical modular network for video captioning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 17939\u201317948).","DOI":"10.1109\/CVPR52688.2022.01741"},{"key":"2429_CR65","doi-asserted-by":"crossref","unstructured":"Yu, L., Lin, Z., Shen, X., Yang, J., Lu, X., Bansal, M., & Berg, T. L. (2018). Mattnet: Modular attention network for referring expression comprehension. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1307\u20131315).","DOI":"10.1109\/CVPR.2018.00142"},{"key":"2429_CR66","doi-asserted-by":"crossref","unstructured":"Yuan, T., Zhang, X., Liu, K., Liu, B., Chen, C., Jin, J., & Jiao, Z. (2024). Towards surveillance video-and-language understanding: New dataset baselines and challenges. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition (pp. 22052\u201322061).","DOI":"10.1109\/CVPR52733.2024.02082"},{"key":"2429_CR67","unstructured":"Zhang, Y., Wu, J., Li, W., Li, B., Ma, Z., Liu, Z., & Li, C. (2024). Video instruction tuning with synthetic data. arXiv preprint arXiv:2410.02713"},{"key":"2429_CR68","unstructured":"Zhang, R., Zhou, Y., Chen, J., Gu, J., Chen, C., & Sun, T. (2024). Llava-next: Improved reasoning, ocr, and world knowledge. https:\/\/llava-vl.github.io\/blog\/2024-01-30-llava-next\/"},{"key":"2429_CR69","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Fei, H., Cao, Y., Li, B., Zhang, M., Wei, J., & Chua, T. S. (2023). Constructing holistic spatio-temporal scene graph for video semantic role labeling. In Proceedings of the 31st ACM international conference on multimedia (pp. 5281\u20135291).","DOI":"10.1145\/3581783.3612096"},{"key":"2429_CR70","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Gordon, D., Kolve, E., Fox, D., Fei-Fei, L., Gupta, A., & Farhadi, A. (2017). Visual semantic planning using deep successor representations. In Proceedings of the IEEE international conference on computer vision (pp. 483\u2013492).","DOI":"10.1109\/ICCV.2017.60"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02429-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02429-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02429-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T12:45:35Z","timestamp":1757162735000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02429-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,3]]},"references-count":70,"journal-issue":{"issue":"8","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["2429"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02429-z","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,5,3]]},"assertion":[{"value":"19 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 March 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 May 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}