{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,17]],"date-time":"2026-02-17T16:06:54Z","timestamp":1771344414299,"version":"3.50.1"},"reference-count":82,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T00:00:00Z","timestamp":1768003200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T00:00:00Z","timestamp":1768003200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372014"],"award-info":[{"award-number":["62372014"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61925201"],"award-info":[{"award-number":["61925201"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62132001"],"award-info":[{"award-number":["62132001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62432001"],"award-info":[{"award-number":["62432001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004826","name":"Natural Science Foundation of Beijing Municipality","doi-asserted-by":"publisher","award":["4252040"],"award-info":[{"award-number":["4252040"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004826","name":"Natural Science Foundation of Beijing Municipality","doi-asserted-by":"publisher","award":["L247006"],"award-info":[{"award-number":["L247006"]}],"id":[{"id":"10.13039\/501100004826","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s11263-025-02599-w","type":"journal-article","created":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T18:27:58Z","timestamp":1768069678000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Large-Scale Pre-Trained Models Empowering Phrase Generalization in Temporal Sentence Localization"],"prefix":"10.1007","volume":"134","author":[{"given":"Yang","family":"Liu","sequence":"first","affiliation":[]},{"given":"Minghang","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Qingchao","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Shaogang","family":"Gong","sequence":"additional","affiliation":[]},{"given":"Yuxin","family":"Peng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,10]]},"reference":[{"key":"2599_CR1","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wang, L., Wu, T., Li, T., & Wu, G. (2022). 
Negative sample matters: A renaissance of metric learning for temporal grounding. In: Thirty-Sixth AAAI Conference on Artificial Intelligence, AAAI 2022, Thirty-Fourth Conference on Innovative Applications of Artificial Intelligence, IAAI 2022, The Twelveth Symposium on Educational Advances in Artificial Intelligence, EAAI 2022 Virtual Event, February 22 - March 1, 2022, pp. 2613\u20132623.","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"2599_CR2","doi-asserted-by":"crossref","unstructured":"Otani, M., Nakashima, Y., Rahtu, E., & Heikkil\u00e4, J. (2020). Uncovering hidden challenges in query-based video moment retrieval. In: 31st British Machine Vision Conference 2020, BMVC 2020, Virtual Event, UK, September 7-10, 2020.","DOI":"10.5244\/C.34.84"},{"key":"2599_CR3","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Lan, X., Wang, X., Chen, L., Wang, Z., & Zhu, W. (2021). A closer look at temporal sentence grounding in videos: Dataset and metric. In: Proceedings of the 2nd International Workshop on Human-centric Multimedia Analysis, pp. 13\u201321.","DOI":"10.1145\/3475723.3484247"},{"key":"2599_CR4","doi-asserted-by":"crossref","unstructured":"Li, J., Xie, J., Qian, L., Zhu, L., Tang, S., Wu, F., Yang, Y., Zhuang, Y., & Wang, X.E. (2022). Compositional temporal grounding with structured variational cross-graph correspondence learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18-24, 2022, pp. 3022\u20133031.","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"2599_CR5","doi-asserted-by":"crossref","unstructured":"Rasheed, H.A., Khattak, M.U., Maaz, M., Khan, S.H., & Khan, F.S. (2023). Fine-tuned CLIP models are efficient video learners. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 6545\u20136554.","DOI":"10.1109\/CVPR52729.2023.00633"},{"key":"2599_CR6","doi-asserted-by":"crossref","unstructured":"Zheng, M., Li, S., Chen, Q., Peng, Y., & Liu, Y. (2023). Phrase-level temporal relationship mining for temporal sentence localization. In: Williams, B., Chen, Y., Neville, J. (eds.) Thirty-Seventh AAAI Conference on Artificial Intelligence, AAAI 2023, Thirty-Fifth Conference on Innovative Applications of Artificial Intelligence, IAAI 2023, Thirteenth Symposium on Educational Advances in Artificial Intelligence, EAAI 2023, Washington, DC, USA, February 7-14, 2023, pp. 3669\u20133677.","DOI":"10.1609\/aaai.v37i3.25478"},{"key":"2599_CR7","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., & Nevatia, R. (2017). TALL: temporal activity localization via language query. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22-29, 2017, pp. 5277\u20135285.","DOI":"10.1109\/ICCV.2017.563"},{"key":"2599_CR8","doi-asserted-by":"crossref","unstructured":"Li, H., Cao, M., Cheng, X., Li, Y., Zhu, Z., & Zou, Y. (2023). G2L: semantically aligned and uniform video grounding via geodesic and game theory. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1-6, 2023, pp. 11998\u201312008.","DOI":"10.1109\/ICCV51070.2023.01105"},{"key":"2599_CR9","doi-asserted-by":"crossref","unstructured":"Lin, K.Q., Zhang, P., Chen, J., Pramanick, S., Gao, D., Wang, A.J., Yan, R., & Shou, M.Z. (2023). Univtg: Towards unified video-language temporal grounding. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1-6, 2023, pp. 
2782\u20132792.","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"2599_CR10","doi-asserted-by":"crossref","unstructured":"Jang, J., Park, J., Kim, J., Kwon, H., & Sohn, K. (2023). Knowing where to focus: Event-aware transformer for video grounding. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1-6, 2023, pp. 13800\u201313810.","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"2599_CR11","doi-asserted-by":"crossref","unstructured":"Fang, X., Liu, D., Zhou, P., & Nan, G. (2023). You can ground earlier than see: An effective and efficient pipeline for temporal sentence grounding in compressed videos. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 2448\u20132460.","DOI":"10.1109\/CVPR52729.2023.00242"},{"key":"2599_CR12","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., & Luo, J. (2020). Learning 2d temporal adjacent networks for moment localization with natural language. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2020, New York, NY, USA, February 7-12, 2020, pp. 12870\u201312877.","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"2599_CR13","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., & Zhou, J.T. (2020). Span-based localizing network for natural language video localization. In: Jurafsky, D., Chai, J., Schluter, N., Tetreault, J. (eds.) Proc. of ACL, Online, pp. 6543\u20136554.","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"2599_CR14","doi-asserted-by":"crossref","unstructured":"Rodriguez-Opazo, C., Marrese-Taylor, E., Fernando, B., Li, H., & Gould, S. (2021). Dori: Discovering object relationships for moment localization of a natural language query in a video. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 1079\u20131088.","DOI":"10.1109\/WACV48630.2021.00112"},{"key":"2599_CR15","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Zhou, P., & Liu, Y. (2022). Exploring motion and appearance information for temporal sentence grounding. In: Thirty-Sixth AAAI Conference on Artificial Intelligence, AAAI 2022, Thirty-Fourth Conference on Innovative Applications of Artificial Intelligence, IAAI 2022, The Twelveth Symposium on Educational Advances in Artificial Intelligence, EAAI 2022 Virtual Event, February 22 - March 1, 2022, pp. 1674\u20131682.","DOI":"10.1609\/aaai.v36i2.20059"},{"key":"2599_CR16","doi-asserted-by":"crossref","unstructured":"Mun, J., Cho, M., & Han, B. (2020). Local-global video-text interactions for temporal grounding. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13-19, 2020, pp. 10807\u201310816.","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"2599_CR17","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Di, X., Cheng, Y., Xu, Z., & Zhou, P. (2022). Memory-guided semantic learning network for temporal sentence grounding. In: Thirty-Sixth AAAI Conference on Artificial Intelligence, AAAI 2022, Thirty-Fourth Conference on Innovative Applications of Artificial Intelligence, IAAI 2022, The Twelveth Symposium on Educational Advances in Artificial Intelligence, EAAI 2022 Virtual Event, February 22 - March 1, 2022, pp. 
1665\u20131673.","DOI":"10.1609\/aaai.v36i2.20058"},{"key":"2599_CR18","doi-asserted-by":"crossref","unstructured":"Huang, J., Jin, H., Gong, S., & Liu, Y. (2022). Video activity localisation with uncertainties in temporal boundary. In: European Conference on Computer Vision, pp. 724\u2013740. Springer.","DOI":"10.1007\/978-3-031-19830-4_41"},{"key":"2599_CR19","doi-asserted-by":"crossref","unstructured":"Yang, L., Kong, Q., Yang, H., Kehl, W., Sato, Y., & Kobori, N. (2023). Deco: Decomposition and reconstruction for compositional temporal grounding via coarse-to-fine contrastive ranking. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 23130\u201323140.","DOI":"10.1109\/CVPR52729.2023.02215"},{"issue":"1","key":"2599_CR20","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/j.sigpro.2012.07.029","volume":"93","author":"X Song","year":"2013","unstructured":"Song, X., Jiao, L. C., Yang, S., Zhang, X., & Shang, F. (2013). Sparse coding and classifier ensemble based multi-instance learning for image categorization. Signal Processing,93(1), 1\u201311.","journal-title":"Signal Processing"},{"key":"2599_CR21","unstructured":"Xu, H., Venugopalan, S., Ramanishka, V., Rohrbach, M., & Saenko, K. (2015). A multi-scale multiple instance video description network. ArXiv preprint arXiv:abs\/1505.05914"},{"issue":"3","key":"2599_CR22","doi-asserted-by":"publisher","first-page":"591","DOI":"10.1016\/j.media.2014.01.010","volume":"18","author":"Y Xu","year":"2014","unstructured":"Xu, Y., Zhu, J.-Y., Chang, E.I.-C., Lai, M., & Tu, Z. (2014). Weakly supervised histopathology cancer image segmentation and classification. Medical Image Analysis,18(3), 591\u2013604.","journal-title":"Medical Image Analysis"},{"key":"2599_CR23","doi-asserted-by":"crossref","unstructured":"Huang, J., Liu, Y., Gong, S., & Jin, H. (2021). Cross-sentence temporal and semantic relations in video activity localisation. In: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021, pp. 7179\u20137188.","DOI":"10.1109\/ICCV48922.2021.00711"},{"key":"2599_CR24","doi-asserted-by":"publisher","first-page":"3252","DOI":"10.1109\/TIP.2021.3058614","volume":"30","author":"W Yang","year":"2021","unstructured":"Yang, W., Zhang, T., Zhang, Y., & Wu, F. (2021). Local correspondence network for weakly supervised temporal sentence grounding. IEEE Transactions on Image Processing,30, 3252\u20133262.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2599_CR25","doi-asserted-by":"crossref","unstructured":"Zheng, M., Huang, Y., Chen, Q., & Liu, Y. (2022). Weakly supervised video moment localization with contrastive negative sample mining. In: Thirty-Sixth AAAI Conference on Artificial Intelligence, AAAI 2022, Thirty-Fourth Conference on Innovative Applications of Artificial Intelligence, IAAI 2022, The Twelveth Symposium on Educational Advances in Artificial Intelligence, EAAI 2022 Virtual Event, February 22 - March 1, 2022, pp. 3517\u20133525.","DOI":"10.1609\/aaai.v36i3.20263"},{"key":"2599_CR26","doi-asserted-by":"crossref","unstructured":"Zheng, M., Huang, Y., Chen, Q., Peng, Y., & Liu, Y. (2022). Weakly supervised temporal sentence grounding with gaussian-based contrastive proposal learning. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18-24, 2022, pp. 
15534\u201315543.","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"2599_CR27","doi-asserted-by":"crossref","unstructured":"Huang, Y., Yang, L., & Sato, Y. (2023). Weakly supervised temporal sentence grounding with uncertainty-guided self-training. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 18908\u201318918.","DOI":"10.1109\/CVPR52729.2023.01813"},{"key":"2599_CR28","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., & Sutskever, I. (2021). Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proc. of ICML. Proceedings of Machine Learning Research, vol. 139, pp. 8748\u20138763."},{"key":"2599_CR29","unstructured":"Li, J., Li, D., Xiong, C., & Hoi, S.C.H. (2022). BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesv\u00e1ri, C., Niu, G., Sabato, S. (eds.) Proc. of ICML. Proceedings of Machine Learning Research, vol. 162, pp. 12888\u201312900."},{"key":"2599_CR30","unstructured":"Zeng, Y., Zhang, X., & Li, H. (2022). Multi-grained vision language pre-training: Aligning texts with visual concepts. In: Chaudhuri, K., Jegelka, S., Song, L., Szepesv\u00e1ri, C., Niu, G., Sabato, S. (eds.) Proc. of ICML. Proceedings of Machine Learning Research, vol. 162, pp. 25994\u201326009."},{"key":"2599_CR31","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., & Schmid, C. (2019). Videobert: A joint model for video and language representation learning. In: 2019 IEEE\/CVF International Conference on Computer Vision, ICCV 2019, Seoul, Korea (South), October 27 - November 2, 2019, pp. 7463\u20137472.","DOI":"10.1109\/ICCV.2019.00756"},{"key":"2599_CR32","doi-asserted-by":"crossref","unstructured":"Lei, J., Li, L., Zhou, L., Gan, Z., Berg, T.L., Bansal, M., & Liu, J. (2021). Less is more: Clipbert for video-and-language learning via sparse sampling. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, Virtual, June 19-25, 2021, pp. 7331\u20137341.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"2599_CR33","doi-asserted-by":"crossref","unstructured":"Xu, H., Ghosh, G., Huang, P.-Y., Arora, P., Aminzadeh, M., Feichtenhofer, C., Metze, F., & Zettlemoyer, L. (2021). VLM: Task-agnostic video-language model pre-training for video understanding. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, Online, pp. 4227\u20134239.","DOI":"10.18653\/v1\/2021.findings-acl.370"},{"key":"2599_CR34","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., & Ji, R. (2022). X-CLIP: End-to-end multi-grained contrastive learning for video-text retrieval. ArXiv preprint arXiv:abs\/2207.07285","DOI":"10.1145\/3503161.3547910"},{"key":"2599_CR35","unstructured":"Weng, Z., Yang, X., Li, A., Wu, Z., & Jiang, Y. (2023). Open-vclip: Transforming CLIP to an open-vocabulary video model via interpolated weight optimization. In: Krause, A., Brunskill, E., Cho, K., Engelhardt, B., Sabato, S., Scarlett, J. (eds.) Proc. of ICML. Proceedings of Machine Learning Research, vol. 202, pp. 36978\u201336989."},{"key":"2599_CR36","doi-asserted-by":"crossref","unstructured":"Subramanian, S., Merrill, W., Darrell, T., Gardner, M., Singh, S., & Rohrbach, A. (2022). 
ReCLIP: A strong zero-shot baseline for referring expression comprehension. In: Muresan, S., Nakov, P., Villavicencio, A. (eds.) Proc. of ACL, Dublin, Ireland, pp. 5198\u20135215.","DOI":"10.18653\/v1\/2022.acl-long.357"},{"key":"2599_CR37","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhang, J., Chen, Q., & Peng, Y. (2023). Confidence-aware pseudo-label learning for weakly supervised visual grounding. In: IEEE\/CVF International Conference on Computer Vision, ICCV 2023, Paris, France, October 1-6, 2023, pp. 2816\u20132826.","DOI":"10.1109\/ICCV51070.2023.00265"},{"key":"2599_CR38","doi-asserted-by":"crossref","unstructured":"Luo, D., Huang, J., Gong, S., Jin, H., & Liu, Y. (2023). Towards generalisable video moment retrieval: Visual-dynamic injection to image-text pre-training. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 23045\u201323055.","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"2599_CR39","doi-asserted-by":"crossref","unstructured":"Zheng, M., Gong, S., Jin, H., Peng, Y., & Liu, Y. (2023). Generating structured pseudo labels for noise-resistant zero-shot video sentence localization. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proc. of ACL, Toronto, Canada, pp. 14197\u201314209.","DOI":"10.18653\/v1\/2023.acl-long.794"},{"key":"2599_CR40","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., & Schiele, B. (2016). Grounding of textual phrases in images by reconstruction. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part I 14, pp. 817\u2013834. Springer.","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"2599_CR41","doi-asserted-by":"crossref","unstructured":"Ryu, H., Kang, S., Kang, H., & Yoo, C.D. (2021). Semantic grouping network for video captioning. In: Thirty-Fifth AAAI Conference on Artificial Intelligence, AAAI 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, IAAI 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, EAAI 2021, Virtual Event, February 2-9, 2021, pp. 2514\u20132522.","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"2599_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, J., & Peng, Y. (2019). Hierarchical vision-language alignment for video captioning. In: MultiMedia Modeling: 25th International Conference, MMM 2019, Thessaloniki, Greece, January 8\u201311, 2019, Proceedings, Part I 25, pp. 42\u201354. Springer.","DOI":"10.1007\/978-3-030-05710-7_4"},{"key":"2599_CR43","doi-asserted-by":"crossref","unstructured":"Li, S., Li, C., Zheng, M., & Liu, Y. (2022). Phrase-level prediction for video temporal localization. In: International Conference on Multimedia Retrieval (ICMR), pp. 360\u2013368.","DOI":"10.1145\/3512527.3531382"},{"key":"2599_CR44","doi-asserted-by":"crossref","unstructured":"Mu, F., Mo, S., & Li, Y. (2024). Snag: Scalable and accurate video grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18930\u201318940.","DOI":"10.1109\/CVPR52733.2024.01791"},{"key":"2599_CR45","doi-asserted-by":"crossref","unstructured":"Pan, Y., He, X., Gong, B., Lv, Y., Shen, Y., Peng, Y., & Zhao, D. (2023). Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos. In: 2023 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 13721\u201313731. 
IEEE Computer Society, Los Alamitos, CA, USA.","DOI":"10.1109\/ICCV51070.2023.01266"},{"key":"2599_CR46","unstructured":"Shi, P., & Lin, J.J. (2019). Simple bert models for relation extraction and semantic role labeling. ArXiv preprint arXiv:abs\/1904.05255"},{"key":"2599_CR47","unstructured":"Sanh, V., Debut, L., Chaumond, J., & Wolf, T. (2019). Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. ArXiv preprint arXiv:abs\/1910.01108"},{"key":"2599_CR48","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., & Polosukhin, I. (2017). Attention is all you need. In: Guyon, I., Luxburg, U., Bengio, S., Wallach, H.M., Fergus, R., Vishwanathan, S.V.N., Garnett, R. (eds.) Proc. of NeurIPS, pp. 5998\u20136008."},{"key":"2599_CR49","doi-asserted-by":"crossref","unstructured":"Lin, T., Goyal, P., Girshick, R.B., He, K., & Doll\u00e1r, P. (2017). Focal loss for dense object detection. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22-29, 2017, pp. 2999\u20133007.","DOI":"10.1109\/ICCV.2017.324"},{"key":"2599_CR50","unstructured":"Oord, A.v.d., Li, Y., & Vinyals, O. (2018). Representation learning with contrastive predictive coding. ArXiv preprint arXiv:abs\/1807.03748"},{"issue":"01","key":"2599_CR51","doi-asserted-by":"publisher","first-page":"8199","DOI":"10.1609\/aaai.v33i01.33018199","volume":"33","author":"S Chen","year":"2019","unstructured":"Chen, S., & Jiang, Y.-G. (2019). Semantic proposal for activity localization in videos via sentence query. Proceedings of the AAAI Conference on Artificial Intelligence,33(01), 8199\u20138206.","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2599_CR52","doi-asserted-by":"crossref","unstructured":"Zhang, D., Dai, X., Wang, X., Wang, Y., & Davis, L.S. (2019). MAN: moment alignment network for natural language moment retrieval via iterative graph adjustment. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2019, Long Beach, CA, USA, June 16-20, 2019, pp. 1247\u20131257.","DOI":"10.1109\/CVPR.2019.00134"},{"key":"2599_CR53","doi-asserted-by":"crossref","unstructured":"Gao, J., & Xu, C. (2021). Fast video moment retrieval. In: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021, pp. 1503\u20131512.","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"2599_CR54","doi-asserted-by":"crossref","unstructured":"Zeng, R., Xu, H., Huang, W., Chen, P., Tan, M., & Gan, C. (2020). Dense regression network for video grounding. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13-19, 2020, pp. 10284\u201310293.","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"2599_CR55","doi-asserted-by":"crossref","unstructured":"Ding, X., Wang, N., Zhang, S., Cheng, D., Li, X., Huang, Z., Tang, M., & Gao, X. (2021). Support-set based cross-supervision for video grounding. In: 2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10-17, 2021, pp. 11553\u201311562.","DOI":"10.1109\/ICCV48922.2021.01137"},{"key":"2599_CR56","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Dong, J., Zhou, P., Cheng, Y., Wei, W., Xu, Z., & Xie, Y. (2021). Context-aware biaffine localizing network for temporal sentence grounding. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, Virtual, June 19-25, 2021, pp. 
11235\u201311244.","DOI":"10.1109\/CVPR46437.2021.01108"},{"key":"2599_CR57","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Zhao, Z., Zhang, Z., & Lin, Z. (2021). Cascaded prediction network via segment tree for temporal video grounding. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, Virtual, June 19-25, 2021, pp. 4197\u20134206.","DOI":"10.1109\/CVPR46437.2021.00418"},{"key":"2599_CR58","doi-asserted-by":"crossref","unstructured":"Wei, Z., Jiang, X., Wang, Z., Shen, F., & Xu, X. (2024). Ptan: Principal token-aware adjacent network for compositional temporal grounding. In: Proceedings of the 2024 International Conference on Multimedia Retrieval. ICMR \u201924, pp. 618\u2013627. Association for Computing Machinery, New York, NY, USA.","DOI":"10.1145\/3652583.3658113"},{"key":"2599_CR59","doi-asserted-by":"crossref","unstructured":"Xiao, S., Chen, L., Zhang, S., Ji, W., Shao, J., Ye, L., & Xiao, J. (2021). Boundary proposal network for two-stage natural language video localization. In: Thirty-Fifth AAAI Conference on Artificial Intelligence, AAAI 2021, Thirty-Third Conference on Innovative Applications of Artificial Intelligence, IAAI 2021, The Eleventh Symposium on Educational Advances in Artificial Intelligence, EAAI 2021, Virtual Event, February 2-9, 2021, pp. 2986\u20132994.","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"2599_CR60","doi-asserted-by":"crossref","unstructured":"Zhou, H., Zhang, C., Luo, Y., Chen, Y., & Hu, C. (2021). Embracing uncertainty: Decoupling and de-bias for robust temporal grounding. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, Virtual, June 19-25, 2021, pp. 8445\u20138454.","DOI":"10.1109\/CVPR46437.2021.00834"},{"key":"2599_CR61","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhen, L., Zhou, J.T., & Goh, S.M.R. (2021). Parallel attention network with sequence matching for video grounding. In: Zong, C., Xia, F., Li, W., Navigli, R. (eds.) Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021, Online, pp. 776\u2013790.","DOI":"10.18653\/v1\/2021.findings-acl.69"},{"key":"2599_CR62","doi-asserted-by":"crossref","unstructured":"Qu, X., Tang, P., Zou, Z., Cheng, Y., Dong, J., Zhou, P., & Xu, Z. (2020). Fine-grained iterative attention network for temporal language localization in videos. In: MM \u201920: The 28th ACM International Conference on Multimedia, Virtual Event \/ Seattle, WA, USA, October 12-16, 2020, pp. 4280\u20134288.","DOI":"10.1145\/3394171.3414053"},{"key":"2599_CR63","doi-asserted-by":"crossref","unstructured":"Wang, H., Zha, Z., Li, L., Liu, D., & Luo, J. (2021). Structured multi-level interaction network for video moment localization via language query. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, Virtual, June 19-25, 2021, pp. 7026\u20137035.","DOI":"10.1109\/CVPR46437.2021.00695"},{"key":"2599_CR64","doi-asserted-by":"crossref","unstructured":"Seol, M., Kim, J., & Moon, J. (2023). Bmrn: Boundary matching and refinement network for temporal moment localization with natural language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5570\u20135578.","DOI":"10.1109\/CVPRW59228.2023.00589"},{"key":"2599_CR65","doi-asserted-by":"crossref","unstructured":"Jing, W., Sun, A., Zhang, H., & Li, X. (2023). MS-DETR: Natural language video localization with sampling moment-moment interaction. In: Rogers, A., Boyd-Graber, J., Okazaki, N. (eds.) Proc. of ACL, Toronto, Canada, pp. 
1387\u20131400.","DOI":"10.18653\/v1\/2023.acl-long.77"},{"key":"2599_CR66","doi-asserted-by":"publisher","first-page":"8265","DOI":"10.1109\/TIP.2021.3113791","volume":"30","author":"Z Zhang","year":"2021","unstructured":"Zhang, Z., Han, X., Song, X., Yan, Y., & Nie, L. (2021). Multi-modal interaction graph convolutional network for temporal language localization in videos. IEEE Transactions on Image Processing,30, 8265\u20138277.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2599_CR67","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, X., Xu, M., Zhou, X., & Ghanem, B. (2021). Relation-aware video reading comprehension for temporal language grounding. In: Moens, M.-F., Huang, X., Specia, L., Yih, S.W.-t. (eds.) Proc. of EMNLP, Online and Punta Cana, Dominican Republic, pp. 3978\u20133988.","DOI":"10.18653\/v1\/2021.emnlp-main.324"},{"key":"2599_CR68","unstructured":"Duan, X., Huang, W., Gan, C., Wang, J., Zhu, W., & Huang, J. (2018). Weakly supervised dense event captioning in videos. In: Bengio, S., Wallach, H.M., Larochelle, H., Grauman, K., Cesa-Bianchi, N., Garnett, R. (eds.) Proc. of NeurIPS, pp. 3063\u20133073."},{"key":"2599_CR69","doi-asserted-by":"crossref","unstructured":"Wu, J., Li, G., Liu, S., & Lin, L. (2020). Tree-structured policy based progressive reinforcement learning for temporally language grounding in video. In: The Thirty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2020, The Thirty-Second Innovative Applications of Artificial Intelligence Conference, IAAI 2020, The Tenth AAAI Symposium on Educational Advances in Artificial Intelligence, EAAI 2020, New York, NY, USA, February 7-12, 2020, pp. 12386\u201312393.","DOI":"10.1609\/aaai.v34i07.6924"},{"key":"2599_CR70","doi-asserted-by":"publisher","unstructured":"Yang, L., Kong, Q., Yang, H.-K., Kehl, W., Sato, Y., & Kobori, N. (2023). Deco: Decomposition and reconstruction for compositional temporal grounding via coarse-to-fine contrastive ranking. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23130\u201323140. https:\/\/doi.org\/10.1109\/CVPR52729.2023.02215","DOI":"10.1109\/CVPR52729.2023.02215"},{"key":"2599_CR71","doi-asserted-by":"crossref","unstructured":"Liu, B., Yeung, S., Chou, E., Huang, D.-A., Fei-Fei, L., & Niebles, J.C. (2018). Temporal modular networks for retrieving complex compositional activities in videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 552\u2013568.","DOI":"10.1007\/978-3-030-01219-9_34"},{"key":"2599_CR72","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., & Gupta, A.K. (2016). Hollywood in homes: Crowdsourcing data collection for activity understanding. ArXiv preprint arXiv:abs\/1604.01753","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"2599_CR73","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., & Niebles, J.C. (2017). Dense-captioning events in videos. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22-29, 2017, pp. 706\u2013715.","DOI":"10.1109\/ICCV.2017.83"},{"key":"2599_CR74","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Liang, X., Wang, X., Yeung, D., & Gupta, A. (2017). Temporal dynamic graph LSTM for action-driven video object detection. In: IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22-29, 2017, pp. 
1819\u20131828.","DOI":"10.1109\/ICCV.2017.200"},{"key":"2599_CR75","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In: Bengio, Y., LeCun, Y. (eds.) Proc. of ICLR."},{"key":"2599_CR76","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L.D., Fergus, R., Torresani, L., & Paluri, M. (2015). Learning spatiotemporal features with 3d convolutional networks. In: 2015 IEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, December 7-13, 2015, pp. 4489\u20134497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"2599_CR77","unstructured":"Wolf, T., Debut, L., Sanh, V., Chaumond, J., Delangue, C., Moi, A., Cistac, P., Rault, T., Louf, R., & Funtowicz, M., et al. (2019). Huggingface\u2019s transformers: State-of-the-art natural language processing. ArXiv preprint arXiv:abs\/1910.03771"},{"key":"2599_CR78","unstructured":"Loshchilov, I., & Hutter, F. (2019). Decoupled weight decay regularization. In: Proc. of ICLR."},{"key":"2599_CR79","doi-asserted-by":"crossref","unstructured":"Wang, L., Mittal, G., Sajeev, S., Yu, Y., Hall, M., Boddeti, V.N., & Chen, M. (2023). Prot\u00e9g\u00e9: Untrimmed pretraining for video temporal grounding by video temporal grounding. In: IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2023, Vancouver, BC, Canada, June 17-24, 2023, pp. 6575\u20136585.","DOI":"10.1109\/CVPR52729.2023.00636"},{"key":"2599_CR80","doi-asserted-by":"crossref","unstructured":"Soldan, M., Pardo, A., Alc\u00e1zar, J.L., Caba, F., Zhao, C., Giancola, S., & Ghanem, B. (2022). Mad: A scalable dataset for language grounding in videos from movie audio descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5026\u20135035.","DOI":"10.1109\/CVPR52688.2022.00497"},{"key":"2599_CR81","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L.D., Fergus, R., Torresani, L., & Paluri, M. (2015). Learning spatiotemporal features with 3d convolutional networks. In: 2015 IEEE International Conference on Computer Vision, ICCV 2015, Santiago, Chile, December 7-13, 2015, pp. 4489\u20134497.","DOI":"10.1109\/ICCV.2015.510"},{"key":"2599_CR82","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Zisserman, A. (2017). Quo vadis, action recognition? A new model and the kinetics dataset. In: 2017 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2017, Honolulu, HI, USA, July 21-26, 2017, pp. 
4724\u20134733.","DOI":"10.1109\/CVPR.2017.502"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02599-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02599-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02599-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,17]],"date-time":"2026-02-17T15:20:06Z","timestamp":1771341606000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02599-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,10]]},"references-count":82,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["2599"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02599-w","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,10]]},"assertion":[{"value":"16 January 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"53"}}