{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T20:28:34Z","timestamp":1773260914235,"version":"3.50.1"},"reference-count":68,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2025,2,13]],"date-time":"2025-02-13T00:00:00Z","timestamp":1739404800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,2,13]],"date-time":"2025-02-13T00:00:00Z","timestamp":1739404800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,4]]},"DOI":"10.1007\/s00530-024-01587-3","type":"journal-article","created":{"date-parts":[[2025,2,12]],"date-time":"2025-02-12T22:24:01Z","timestamp":1739399041000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Dual-guided multi-modal bias removal strategy for temporal sentence grounding in video"],"prefix":"10.1007","volume":"31","author":[{"given":"Xiaowen","family":"Ruan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhaobo","family":"Qi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuanrong","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Weigang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,2,13]]},"reference":[{"key":"1587_CR1","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: Temporal activity localization via language query. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5267\u20135275 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"1587_CR2","doi-asserted-by":"crossref","unstructured":"Anne Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"1587_CR3","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., He, X., Chen, B., Chua, T.-S.: Attentive moment retrieval in videos. In: Proceedings of the 41st International ACM SIGIR Conference on Research & Development in Information Retrieval, pp. 15\u201324 (2018)","DOI":"10.1145\/3209978.3210003"},{"issue":"5","key":"1587_CR4","first-page":"2725","volume":"44","author":"Y Yuan","year":"2019","unstructured":"Yuan, Y., Ma, L., Wang, J., Liu, W., Zhu, W.: Semantic conditioned dynamic modulation for temporal sentence grounding in videos. IEEE Trans Pattern Anal Mach Intell 44(5), 2725\u20132741 (2019)","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"1587_CR5","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2d temporal adjacent networks for moment localization with natural language. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 12870\u201312877 (2020)","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"1587_CR6","unstructured":"Otani, M., Nakashima, Y., Rahtu, E., Heikkil\u00e4, J.: Uncovering hidden challenges in query-based video moment retrieval. In: Proceedings of the British Machine Vision Conference, pp. 1\u201312 (2020)"},{"key":"1587_CR7","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"1587_CR8","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1016\/j.patrec.2022.12.010","volume":"166","author":"N Dagaev","year":"2023","unstructured":"Dagaev, N., Roads, B.D., Luo, X., Barry, D.N., Patil, K.R., Love, B.C.: A too-good-to-be-true prior to reduce shortcut reliance. Pattern Recogn Lett, 166, 164\u2013171 (2023)","journal-title":"Pattern Recognit. Lett."},{"key":"1587_CR9","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Hu, W.: Reducing the vision and language bias for temporal sentence grounding. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4092\u20134101 (2022)","DOI":"10.1145\/3503161.3547969"},{"key":"1587_CR10","doi-asserted-by":"crossref","unstructured":"Hao, J., Sun, H., Ren, P., Wang, J., Qi, Q., Liao, J.: Can shuffling video benefit temporal bias problem: a novel training framework for temporal grounding. In: Proceedings of the 30th European Conference on Computer Vision, pp. 130\u2013147 (2022)","DOI":"10.1007\/978-3-031-20059-5_8"},{"key":"1587_CR11","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Towards debiasing temporal sentence grounding in video. arXiv preprint arXiv:2111.04321 (2021)"},{"key":"1587_CR12","doi-asserted-by":"crossref","unstructured":"Qi, Z., Yuan, Y., Ruan, X., Wang, S., Zhang, W., Huang, Q.: Bias-conflict sample synthesis and adversarial removal debias strategy for temporal sentence grounding in video. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 4533\u20134541 (2024)","DOI":"10.1609\/aaai.v38i5.28252"},{"issue":"11","key":"1587_CR13","doi-asserted-by":"publisher","first-page":"10972","DOI":"10.1109\/TCSVT.2024.3413074","volume":"34","author":"Z Qi","year":"2024","unstructured":"Qi, Z., Yuan, Y., Ruan, X., Wang, S., Zhang, W., Huang, Q.: Collaborative debias strategy for temporal sentence grounding in video. IEEE Trans Circ Syst Video Technol 34(11), 10972\u201310986 (2024)","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"1587_CR14","unstructured":"Zhou, H., Zhang, C., Chen, Y., Hu, C.: Towards diverse temporal grounding under single positive labels. arXiv preprint arXiv:2303.06545 (2023)"},{"key":"1587_CR15","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1016\/j.neucom.2022.01.085","volume":"483","author":"J Hao","year":"2022","unstructured":"Hao, J., Sun, H., Ren, P., Wang, J., Qi, Q., Liao, J.: Query-aware video encoder for video moment retrieval. Neurocomputing, 483, 72\u201386 (2022)","journal-title":"Neurocomputing"},{"key":"1587_CR16","doi-asserted-by":"crossref","unstructured":"Ghosh, S., Agarwal, A., Parekh, Z., Hauptmann, A.: Excl: Extractive clip localization using natural language descriptions. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics, pp. 1984\u20131990 (2019)","DOI":"10.18653\/v1\/N19-1198"},{"key":"1587_CR17","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Span-based localizing network for natural language video localization. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics, pp. 6543\u20136554 (2020)","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"1587_CR18","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with temporal language. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 1380\u20131390 (2018)","DOI":"10.18653\/v1\/D18-1168"},{"key":"1587_CR19","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., Tian, Q., Chen, B., Chua, T.-S.: Cross-modal moment localization in videos. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 843\u2013851 (2018)","DOI":"10.1145\/3240508.3240549"},{"key":"1587_CR20","doi-asserted-by":"crossref","unstructured":"Jiang, B., Huang, X., Yang, C., Yuan, J.: Cross-modal video moment retrieval with spatial and language-temporal attention. In: Proceedings of the 2019 on International Conference on Multimedia Retrieval, pp. 217\u2013225 (2019)","DOI":"10.1145\/3323873.3325019"},{"key":"1587_CR21","doi-asserted-by":"crossref","unstructured":"Chen, J., Chen, X., Ma, L., Jie, Z., Chua, T.-S.: Temporally grounding natural sentence in video. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 162\u2013171 (2018)","DOI":"10.18653\/v1\/D18-1015"},{"key":"1587_CR22","doi-asserted-by":"crossref","unstructured":"Wu, Z., Gao, J., Huang, S., Xu, C.: Diving into the relations: Leveraging semantic and visual structures for video moment retrieval. In: Proceedings of the 2021 IEEE International Conference on Multimedia and Expo, pp. 1\u20136 (2021)","DOI":"10.1109\/ICME51207.2021.9428369"},{"key":"1587_CR23","doi-asserted-by":"crossref","unstructured":"Chen, J., Ma, L., Chen, X., Jie, Z., Luo, J.: Localizing natural language in videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8175\u20138182 (2019)","DOI":"10.1609\/aaai.v33i01.33018175"},{"key":"1587_CR24","doi-asserted-by":"crossref","unstructured":"Lu, C., Chen, L., Tan, C., Li, X., Xiao, J.: Debug: A dense bottom-up grounding approach for natural language video localization. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp. 5144\u20135153 (2019)","DOI":"10.18653\/v1\/D19-1518"},{"key":"1587_CR25","unstructured":"Yu, A.W., Dohan, D., Luong, M.-T., Zhao, R., Chen, K., Norouzi, M., Le, Q.V.: Qanet: combining local convolution with global self-attention for reading comprehension. In: Proceedings of the 6th International Conference on Learning Representations (2018)"},{"key":"1587_CR26","doi-asserted-by":"crossref","unstructured":"He, D., Zhao, X., Huang, J., Li, F., Liu, X., Wen, S.: Read, watch, and move: Reinforcement learning for temporally grounding natural language descriptions in videos. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 8393\u20138400 (2019)","DOI":"10.1609\/aaai.v33i01.33018393"},{"key":"1587_CR27","doi-asserted-by":"crossref","unstructured":"Wu, J., Li, G., Liu, S., Lin, L.: Tree-structured policy based progressive reinforce- ment learning for temporally language grounding in video. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 12386\u201312393 (2020)","DOI":"10.1609\/aaai.v34i07.6924"},{"key":"1587_CR28","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Paul, S., Roy-Chowdhury, A.K.: Weakly supervised video moment retrieval from text queries. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11592\u201311601 (2019)","DOI":"10.1109\/CVPR.2019.01186"},{"key":"1587_CR29","doi-asserted-by":"crossref","unstructured":"Gao, M., Davis, L.S., Socher, R., Xiong, C.: Wslln: Weakly supervised natural language localization networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp. 1481\u20131487 (2019)","DOI":"10.18653\/v1\/D19-1157"},{"key":"1587_CR30","doi-asserted-by":"crossref","unstructured":"Huang, J., Liu, Y., Gong, S., Jin, H.: Cross-sentence temporal and semantic relations in video activity localisation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7199\u20137208 (2021)","DOI":"10.1109\/ICCV48922.2021.00711"},{"key":"1587_CR31","doi-asserted-by":"crossref","unstructured":"Lin, Z., Zhao, Z., Zhang, Z., Wang, Q., Liu, H.: Weakly-supervised video moment retrieval via semantic completion network. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 11539\u201311546 (2020)","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"1587_CR32","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Proceedings of the European Conference on Computer Vision, pp. 213\u2013229 (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1587_CR33","doi-asserted-by":"crossref","unstructured":"Sun, H., Zhou, M., Chen, W., Xie, W.: Tr-detr: task-reciprocal transformer for joint moment retrieval and highlight detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 4998\u20135007 (2024)","DOI":"10.1609\/aaai.v38i5.28304"},{"key":"1587_CR34","unstructured":"Moon, W., Hyun, S., Lee, S., Heo, J.-P.: Correlation-guided query-dependency calibration in video representation learning for temporal grounding. arXiv preprint arXiv:2311.08835 (2023)"},{"key":"1587_CR35","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.-P.: Query-dependent video representation for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23023\u201323033 (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"1587_CR36","unstructured":"Jung, M., Jang, Y., Choi, S., Kim, J., Kim, J.-H., Zhang, B.-T.: Overcoming weak visual-textual alignment for video moment retrieval. arXiv preprint arXiv:2312.00083 (2023)"},{"key":"1587_CR37","doi-asserted-by":"crossref","unstructured":"Lee, P., Byun, H.: Bam-detr: Boundary-aligned moment detection transformer for temporal sentence grounding in videos. In: Proceedings of the European Conference on Computer Vision, pp. 220\u2013238 (2023)","DOI":"10.1007\/978-3-031-72627-9_13"},{"key":"1587_CR38","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, X., Nagrani, A., Arnab, A., Wang, Z., Ge, W., Ross, D., Schmid, C.: Unloc: A unified framework for video localization tasks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13623\u201313633 (2023)","DOI":"10.1109\/ICCV51070.2023.01253"},{"key":"1587_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C.-W., Shan, Y., Qie, X.: Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3042\u20133051 (2022)","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"1587_CR40","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Luo, Z., Liu, Y., Ma, Y., Bian, H., Ji, Y., Yang, Y., Li, X.: Bridging the gap: A unified video comprehension framework for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18709\u201318719 (2023)","DOI":"10.1109\/CVPR52733.2024.01770"},{"key":"1587_CR41","unstructured":"Li, P., Xie, C.-W., Xie, H., Zhao, L., Zhang, L., Zheng, Y., Zhao, D., Zhang, Y.: Momentdiff: generative video moment retrieval from random to real. In: Proceedings of Advances in Neural Information Processing Systems, pp. 65948\u201365966 (2024)"},{"key":"1587_CR42","doi-asserted-by":"crossref","unstructured":"Barrios, W., Soldan, M., Ceballos-Arroyo, A.M., Heilbron, F.C., Ghanem, B.: Localizing moments in long video via multimodal guidance. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13667\u201313678 (2023)","DOI":"10.1109\/ICCV51070.2023.01257"},{"key":"1587_CR43","doi-asserted-by":"crossref","unstructured":"Jang, J., Park, J., Kim, J., Kwon, H., Sohn, K.: Knowing where to focus: Event-aware transformer for video grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13846\u201313856 (2023)","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"1587_CR44","first-page":"20450","volume":"35","author":"A F\u00fcrst","year":"2022","unstructured":"F\u00fcrst, A., Rumetshofer, E., Lehner, J., Tran, V.T., Tang, F., Ramsauer, H., Kreil, D., Kopp, M., Klambauer, G., Bitto, A., et al.: Cloob: modern hopfield networks with infoloob outperform clip. In: Proceedings of Advances in Neural Information Processing Systems,  35, 20450\u201320468 (2022)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1587_CR45","unstructured":"Jia, C., Yang, Y., Xia, Y., Chen, Y.-T., Parekh, Z., Pham, H., Le, Q., Sung, Y.-H., Li, Z., Duerig, T.: Scaling up visual and vision-language representation learning with noisy text supervision. In: Proceedings of the International Conference on Machine Learning, pp. 4904\u20134916 (2021)"},{"key":"1587_CR46","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"1587_CR47","doi-asserted-by":"crossref","unstructured":"Luo, D., Huang, J., Gong, S., Jin, H., Liu, Y.: Towards generalisable video moment retrieval: Visual-dynamic injection to image-text pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23045\u201323055 (2023)","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"1587_CR48","doi-asserted-by":"crossref","unstructured":"Wang, G., Wu, X., Liu, Z., Yan, J.: Prompt-based zero-shot video moment retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 413\u2013421 (2022)","DOI":"10.1145\/3503161.3548004"},{"key":"1587_CR49","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the v in vqa matter: Elevating the role of image understanding in visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6904\u20136913 (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"1587_CR50","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Batra, D., Parikh, D., Kembhavi, A.: Don\u2019t just assume; look and answer: Overcoming priors for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4971\u20134980 (2018)","DOI":"10.1109\/CVPR.2018.00522"},{"key":"1587_CR51","unstructured":"Ramakrishnan, S., Agrawal, A., Lee, S.: Overcoming language priors in visual question answering with adversarial regularization. In: Proceedings of Advances in Neural Information Processing Systems, pp. 1548\u20131558 (2018)"},{"key":"1587_CR52","doi-asserted-by":"crossref","unstructured":"Grand, G., Belinkov, Y.: Adversarial regularization for visual question answering: strengths, shortcomings, and side effects. In: Proceedings of the Second Workshop on Shortcomings in Vision and Language, pp. 1\u201313 (2019)","DOI":"10.18653\/v1\/W19-1801"},{"key":"1587_CR53","unstructured":"Cadene, R., Dancette, C., Cord, M., Parikh, D., et al.: Rubi: Reducing unimodal biases for visual question answering. In: Proceedings of Advances in Neural Information Processing Systems, pp. 841\u2013852 (2019)"},{"key":"1587_CR54","doi-asserted-by":"crossref","unstructured":"Clark, C., Yatskar, M., Zettlemoyer, L.: Don\u2019t take the easy way out: ensemble based methods for avoiding known dataset biases. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp. 4069\u20134082 (2019)","DOI":"10.18653\/v1\/D19-1418"},{"key":"1587_CR55","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Lan, X., Wang, X., Chen, L., Wang, Z., Zhu, W.: A closer look at temporal sentence grounding in videos: Dataset and metric. In: Proceedings of the 2nd International Workshop on Human-centric Multimedia Analysis, pp. 13\u201321 (2021)","DOI":"10.1145\/3475723.3484247"},{"key":"1587_CR56","doi-asserted-by":"crossref","unstructured":"Yang, X., Feng, F., Ji, W., Wang, M., Chua, T.-S.: Deconfounded video moment retrieval with causal intervention. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1\u201310 (2021)","DOI":"10.1145\/3404835.3462823"},{"key":"1587_CR57","doi-asserted-by":"crossref","unstructured":"Bao, P., Mu, Y.: Learning sample importance for cross-scenario video temporal grounding. In: Proceedings of the 2022 International Conference on Multimedia Retrieval, pp. 322\u2013329 (2022)","DOI":"10.1145\/3512527.3531403"},{"key":"1587_CR58","doi-asserted-by":"crossref","unstructured":"Zhu, J., Liu, D., Zhou, P., Di, X., Cheng, Y., Yang, S., Xu, W., Xu, Z., Wan, Y., Sun, L., et al.: Rethinking the video sampling and reasoning strategies for temporal sentence grounding. In: Proceedings of the Findings of the Association for Computational Linguistics, pp. 590\u2013600 (2023)","DOI":"10.18653\/v1\/2022.findings-emnlp.41"},{"key":"1587_CR59","doi-asserted-by":"crossref","unstructured":"Wang, Y., Bilinski, P., Bremond, F., Dantcheva, A.: G3an: Disentangling appearance and motion for video generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5264\u20135273 (2020)","DOI":"10.1109\/CVPR42600.2020.00531"},{"key":"1587_CR60","doi-asserted-by":"crossref","unstructured":"Gao, J., Chen, M., Xu, C.: Vectorized evidential learning for weakly-supervised temporal action localization. IEEE Trans Pattern Anal Mach Intell, 45(12), 15949\u201315963 (2023)","DOI":"10.1109\/TPAMI.2023.3311447"},{"key":"1587_CR61","doi-asserted-by":"crossref","unstructured":"Gao, J., Zhang, T., Xu, C.: Learning to model relationships for zero-shot video classification. IEEE Trans Pattern Anal Mach Intell, 43(10), 3476\u20133491 (2021)","DOI":"10.1109\/TPAMI.2020.2985708"},{"key":"1587_CR62","doi-asserted-by":"crossref","unstructured":"Hu, Y., Gao, J., Dong, J., Fan, B., Liu, H.: Exploring rich semantics for open-set action recognition. IEEE Trans Multimedia, 26, 5410\u20135421 (2023)","DOI":"10.1109\/TMM.2023.3333206"},{"key":"1587_CR63","doi-asserted-by":"crossref","unstructured":"Pennington,J., Socher, R., Manning,C.D.: Glove: Global vectorsfor word representation.In: Proceedings of the 2014 EMNLP Conference on Empirical Methods in Natural Language Processing, pp.1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"1587_CR64","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman,A.: Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"1587_CR65","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. In: Proceedings of the 2nd International Conference on Learning Representations (2014)"},{"key":"1587_CR66","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Mei, T., Zhu, W.: To find where you talk: temporal sentence localization in video with attention based location regression. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 9159\u20139166 (2019)","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"1587_CR67","doi-asserted-by":"crossref","unstructured":"Zeng, R., Xu, H., Huang, W., Chen, P., Tan, M., Gan, C.: Dense regression network for video grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10287\u201310296 (2020)","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"1587_CR68","doi-asserted-by":"crossref","unstructured":"Mun, J., Cho, M., Han, B.: Local-global video-text interactions for temporal grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10810\u201310819 (2020)","DOI":"10.1109\/CVPR42600.2020.01082"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01587-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-024-01587-3\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01587-3.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,21]],"date-time":"2025-04-21T15:36:36Z","timestamp":1745249796000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-024-01587-3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,2,13]]},"references-count":68,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2025,4]]}},"alternative-id":["1587"],"URL":"https:\/\/doi.org\/10.1007\/s00530-024-01587-3","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-4934017\/v1","asserted-by":"object"}]},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,2,13]]},"assertion":[{"value":"18 August 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 November 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 February 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"108"}}