{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T12:02:38Z","timestamp":1774440158059,"version":"3.50.1"},"reference-count":67,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2026,1,4]],"date-time":"2026-01-04T00:00:00Z","timestamp":1767484800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,4]],"date-time":"2026-01-04T00:00:00Z","timestamp":1767484800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1007\/s00530-025-02120-w","type":"journal-article","created":{"date-parts":[[2026,1,4]],"date-time":"2026-01-04T02:42:24Z","timestamp":1767494544000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal-guided mixture-of-experts bias removal strategy for natural language video localization"],"prefix":"10.1007","volume":"32","author":[{"given":"Xiaowen","family":"Ruan","sequence":"first","affiliation":[]},{"given":"Zhaobo","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Ruisi","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yuanrong","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Beichen","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Weigang","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,4]]},"reference":[{"key":"2120_CR1","doi-asserted-by":"crossref","unstructured":"Guo, S., Zhong, H., Wang, Q., Chen, Z., Gao, Y., Yuan, J., Zhang, C., Xie, R., Song, L.: A new people-object interaction dataset and nvs benchmarks. In: 2024 IEEE International Conference on Image Processing (ICIP), pp. 8\u201314 (2024). IEEE","DOI":"10.1109\/ICIP51287.2024.10647515"},{"key":"2120_CR2","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Zhang, G., Tan, J., Wu, G., Wang, L.: Dual detrs for multi-label temporal action detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18559\u201318569 (2024)","DOI":"10.1109\/CVPR52733.2024.01756"},{"key":"2120_CR3","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, C.-L., Zhao, C., Ghanem, B.: End-to-end temporal action detection with 1b parameters across 1000 frames. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18591\u201318601 (2024)","DOI":"10.1109\/CVPR52733.2024.01759"},{"key":"2120_CR4","doi-asserted-by":"crossref","unstructured":"Xu, J., Huang, Y., Hou, J., Chen, G., Zhang, Y., Feng, R., Xie, W.: Retrieval-augmented egocentric video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13525\u201313536 (2024)","DOI":"10.1109\/CVPR52733.2024.01284"},{"key":"2120_CR5","doi-asserted-by":"crossref","unstructured":"Islam, M.M., Ho, N., Yang, X., Nagarajan, T., Torresani, L., Bertasius, G.: Video recap: Recursive captioning of hour-long videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18198\u201318208 (2024)","DOI":"10.1109\/CVPR52733.2024.01723"},{"key":"2120_CR6","doi-asserted-by":"crossref","unstructured":"Lin, K., Li, L., Lin, C.-C., Ahmed, F., Gan, Z., Liu, Z., Lu, Y., Wang, L.: Swinbert: End-to-end transformers with sparse attention for video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17949\u201317958 (2022)","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"2120_CR7","doi-asserted-by":"crossref","unstructured":"Wang, H., Xu, Z., Cheng, Y., Diao, S., Zhou, Y., Cao, Y., Wang, Q., Ge, W., Huang, L.: Grounded-videollm: Sharpening fine-grained temporal grounding in video large language models. arXiv preprint arXiv:2410.03290 (2024)","DOI":"10.18653\/v1\/2025.findings-emnlp.50"},{"key":"2120_CR8","doi-asserted-by":"crossref","unstructured":"Li, H., Shu, X., He, S., Qiao, R., Wen, W., Guo, T., Gan, B., Sun, X.: D3g: Exploring gaussian prior for temporal sentence grounding with glance annotation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13734\u201313746 (2023)","DOI":"10.1109\/ICCV51070.2023.01263"},{"key":"2120_CR9","unstructured":"Liang, R., Yang, Y., Lu, H., Li, L.: Efficient temporal sentence grounding in videos with multi-teacher knowledge distillation. arXiv preprint arXiv:2308.03725 (2023)"},{"key":"2120_CR10","doi-asserted-by":"crossref","unstructured":"Otani, M., Nakashima, Y., Rahtu, E., Heikkil\u00e4, J.: Uncovering hidden challenges in query-based video moment retrieval. arXiv preprint arXiv:2009.00325 (2020)","DOI":"10.5244\/C.34.84"},{"key":"2120_CR11","doi-asserted-by":"publisher","first-page":"164","DOI":"10.1016\/j.patrec.2022.12.010","volume":"166","author":"N Dagaev","year":"2023","unstructured":"Dagaev, N., Roads, B.D., Luo, X., Barry, D.N., Patil, K.R., Love, B.C.: A too-good-to-be-true prior to reduce shortcut reliance. Pattern Recogn. Lett. 166, 164\u2013171 (2023)","journal-title":"Pattern Recogn. Lett."},{"key":"2120_CR12","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Hu, W.: Reducing the vision and language bias for temporal sentence grounding. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 4092\u20134101 (2022)","DOI":"10.1145\/3503161.3547969"},{"key":"2120_CR13","first-page":"130","volume-title":"European Conference on Computer Vision","author":"J Hao","year":"2022","unstructured":"Hao, J., Sun, H., Ren, P., Wang, J., Qi, Q., Liao, J.: Can shuffling video benefit temporal bias problem: A novel training framework for temporal grounding. In: European Conference on Computer Vision, pp. 130\u2013147. Springer, Singapore (2022)"},{"key":"2120_CR14","doi-asserted-by":"crossref","unstructured":"Zhou, H., Zhang, C., Luo, Y., Chen, Y., Hu, C.: Embracing uncertainty: Decoupling and de-bias for robust temporal grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8445\u20138454 (2021)","DOI":"10.1109\/CVPR46437.2021.00834"},{"key":"2120_CR15","unstructured":"Zhou, H., Zhang, C., Chen, Y., Hu, C.: Towards diverse temporal grounding under single positive labels. arXiv preprint arXiv:2303.06545 (2023)"},{"key":"2120_CR16","doi-asserted-by":"publisher","first-page":"65948","DOI":"10.52202\/075280-2880","volume":"36","author":"P Li","year":"2023","unstructured":"Li, P., Xie, C.-W., Xie, H., Zhao, L., Zhang, L., Zheng, Y., Zhao, D., Zhang, Y.: Momentdiff: Generative video moment retrieval from random to real. Adv. Neural. Inf. Process. Syst. 36, 65948\u201365966 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2120_CR17","unstructured":"Ghosh, S., Agarwal, A., Parekh, Z., Hauptmann, A.: ExCL: Extractive Clip Localization Using Natural Language Descriptions. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 1984\u20131990. Association for Computational Linguistics, ??? (2019)"},{"key":"2120_CR18","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"2120_CR19","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1016\/j.neucom.2022.01.085","volume":"483","author":"J Hao","year":"2022","unstructured":"Hao, J., Sun, H., Ren, P., Wang, J., Qi, Q., Liao, J.: Query-aware video encoder for video moment retrieval. Neurocomputing 483, 72\u201386 (2022)","journal-title":"Neurocomputing"},{"key":"2120_CR20","doi-asserted-by":"crossref","unstructured":"Mu, F., Mo, S., Li, Y.: Snag: Scalable and accurate video grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18930\u201318940 (2024)","DOI":"10.1109\/CVPR52733.2024.01791"},{"key":"2120_CR21","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: Temporal activity localization via language query. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5267\u20135275 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"2120_CR22","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"2120_CR23","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with temporal language. arXiv preprint arXiv:1809.01337 (2018)","DOI":"10.18653\/v1\/D18-1168"},{"key":"2120_CR24","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., Tian, Q., Chen, B., Chua, T.-S.: Cross-modal moment localization in videos. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 843\u2013851 (2018)","DOI":"10.1145\/3240508.3240549"},{"key":"2120_CR25","doi-asserted-by":"crossref","unstructured":"Jiang, B., Huang, X., Yang, C., Yuan, J.: Cross-modal video moment retrieval with spatial and language-temporal attention. In: Proceedings of the 2019 on International Conference on Multimedia Retrieval, pp. 217\u2013225 (2019)","DOI":"10.1145\/3323873.3325019"},{"key":"2120_CR26","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-c3d: Region convolutional 3d network for temporal activity detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5783\u20135792 (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"2120_CR27","first-page":"9062","volume":"33","author":"H Xu","year":"2019","unstructured":"Xu, H., He, K., Plummer, B.A., Sigal, L., Sclaroff, S., Saenko, K.: Multilevel language and vision integration for text-to-clip retrieval. Proceed. AAAI Conf. Artif. Intell. 33, 9062\u20139069 (2019)","journal-title":"Proceed. AAAI Conf. Artif. Intell."},{"key":"2120_CR28","first-page":"2986","volume":"35","author":"S Xiao","year":"2021","unstructured":"Xiao, S., Chen, L., Zhang, S., Ji, W., Shao, J., Ye, L., Xiao, J.: Boundary proposal network for two-stage natural language video localization. Proc. AAAI Conf. Artif. Intell. 35, 2986\u20132994 (2021)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"2120_CR29","doi-asserted-by":"crossref","unstructured":"Chen, J., Chen, X., Ma, L., Jie, Z., Chua, T.-S.: Temporally grounding natural sentence in video. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 162\u2013171 (2018)","DOI":"10.18653\/v1\/D18-1015"},{"key":"2120_CR30","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Ma, L., Wang, J., Liu, W., Zhu, W.: Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Advances in Neural Information Processing Systems 32 (2019)","DOI":"10.1109\/TPAMI.2020.3038993"},{"key":"2120_CR31","first-page":"12870","volume":"34","author":"S Zhang","year":"2020","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2d temporal adjacent networks for moment localization with natural language. Proc. AAAI Conf. Artif. Intell. 34, 12870\u201312877 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"issue":"2","key":"2120_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3544493","volume":"19","author":"Q Zheng","year":"2023","unstructured":"Zheng, Q., Dong, J., Qu, X., Yang, X., Wang, Y., Zhou, P., Liu, B., Wang, X.: Progressive localization networks for language-based moment localization. ACM Trans. Multimed. Comput. Commun. Appl. 19(2), 1\u201321 (2023)","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"2120_CR33","doi-asserted-by":"crossref","unstructured":"Wu, Z., Gao, J., Huang, S., Xu, C.: Diving into the relations: Leveraging semantic and visual structures for video moment retrieval. In: 2021 IEEE International Conference on Multimedia and Expo (ICME), pp. 1\u20136 (2021). IEEE","DOI":"10.1109\/ICME51207.2021.9428369"},{"key":"2120_CR34","doi-asserted-by":"crossref","unstructured":"Lu, C., Chen, L., Tan, C., Li, X., Xiao, J.: Debug: A dense bottom-up grounding approach for natural language video localization. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 5144\u20135153 (2019)","DOI":"10.18653\/v1\/D19-1518"},{"key":"2120_CR35","first-page":"8175","volume":"33","author":"J Chen","year":"2019","unstructured":"Chen, J., Ma, L., Chen, X., Jie, Z., Luo, J.: Localizing natural language in videos. Proc. AAAI. Conf. Artif. Intell. 33, 8175\u20138182 (2019)","journal-title":"Proc. AAAI. Conf. Artif. Intell."},{"key":"2120_CR36","first-page":"8393","volume":"33","author":"D He","year":"2019","unstructured":"He, D., Zhao, X., Huang, J., Li, F., Liu, X., Wen, S.: Read, watch, and move: Reinforcement learning for temporally grounding natural language descriptions in videos. Proc. AAAI Conf. Artif. Intell. 33, 8393\u20138400 (2019)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"2120_CR37","first-page":"12386","volume":"34","author":"J Wu","year":"2020","unstructured":"Wu, J., Li, G., Liu, S., Lin, L.: Tree-structured policy based progressive reinforcement learning for temporally language grounding in video. Proc. AAAI Conf. Artif. Intell. 34, 12386\u201312393 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"2120_CR38","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Paul, S., Roy-Chowdhury, A.K.: Weakly supervised video moment retrieval from text queries. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11592\u201311601 (2019)","DOI":"10.1109\/CVPR.2019.01186"},{"key":"2120_CR39","doi-asserted-by":"crossref","unstructured":"Gao, M., Davis, L.S., Socher, R., Xiong, C.: Wslln: Weakly supervised natural language localization networks. arXiv preprint arXiv:1909.00239 (2019)","DOI":"10.18653\/v1\/D19-1157"},{"key":"2120_CR40","doi-asserted-by":"crossref","unstructured":"Huang, J., Liu, Y., Gong, S., Jin, H.: Cross-sentence temporal and semantic relations in video activity localisation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7199\u20137208 (2021)","DOI":"10.1109\/ICCV48922.2021.00711"},{"key":"2120_CR41","first-page":"11539","volume":"34","author":"Z Lin","year":"2020","unstructured":"Lin, Z., Zhao, Z., Zhang, Z., Wang, Q., Liu, H.: Weakly-supervised video moment retrieval via semantic completion network. Proc. AAAI Conf. Artif. Intell. 34, 11539\u201311546 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"2120_CR42","first-page":"213","volume-title":"European Conference on Computer Vision","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: European Conference on Computer Vision, pp. 213\u2013229. Springer, Singapore (2020)"},{"key":"2120_CR43","unstructured":"Jung, M., Jang, Y., Choi, S., Kim, J., Kim, J.-H., Zhang, B.-T.: Overcoming weak visual-textual alignment for video moment retrieval. arXiv preprint arXiv:2306.02728 (2023)"},{"key":"2120_CR44","unstructured":"Moon, W., Hyun, S., Lee, S., Heo, J.-P.: Correlation-guided query-dependency calibration for video temporal grounding. arXiv preprint arXiv:2311.08835 (2023)"},{"key":"2120_CR45","doi-asserted-by":"crossref","unstructured":"Lee, P., Byun, H.: Bam-detr: Boundary-aligned moment detection transformer for temporal sentence grounding in videos. In: European Conference on Computer Vision, pp. 220\u2013238 (2024). Springer","DOI":"10.1007\/978-3-031-72627-9_13"},{"key":"2120_CR46","doi-asserted-by":"publisher","first-page":"4998","DOI":"10.1609\/aaai.v38i5.28304","volume":"38","author":"H Sun","year":"2024","unstructured":"Sun, H., Zhou, M., Chen, W., Xie, W.: Tr-detr: Task-reciprocal transformer for joint moment retrieval and highlight detection. Proceedings of the AAAI Conference on Artificial Intelligence 38, 4998\u20135007 (2024)","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2120_CR47","doi-asserted-by":"crossref","unstructured":"Yang, J., Wei, P., Li, H., Ren, Z.: Task-driven exploration: Decoupling and inter-task feedback for joint moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18308\u201318318 (2024)","DOI":"10.1109\/CVPR52733.2024.01733"},{"key":"2120_CR48","first-page":"11846","volume":"34","author":"J Lei","year":"2021","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Detecting moments and highlights in videos via natural language queries. Adv. Neural. Inf. Process. Syst. 34, 11846\u201311858 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2120_CR49","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C.-W., Shan, Y., Qie, X.: Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3042\u20133051 (2022)","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"2120_CR50","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.-P.: Query-dependent video representation for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23023\u201323033 (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"2120_CR51","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Luo, Z., Liu, Y., Ma, Y., Bian, H., Ji, Y., Yang, Y., Li, X.: Bridging the gap: A unified video comprehension framework for moment retrieval and highlight detection. arXiv preprint arXiv:2311.16464 (2023)","DOI":"10.1109\/CVPR52733.2024.01770"},{"key":"2120_CR52","doi-asserted-by":"crossref","unstructured":"Ren, S., Yao, L., Li, S., Sun, X., Hou, L.: Timechat: A time-sensitive multimodal large language model for long video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14313\u201314323 (2024)","DOI":"10.1109\/CVPR52733.2024.01357"},{"key":"2120_CR53","doi-asserted-by":"crossref","unstructured":"Huang, B., Wang, X., Chen, H., Song, Z., Zhu, W.: Vtimellm: Empower llm to grasp video moments. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14271\u201314280 (2024)","DOI":"10.1109\/CVPR52733.2024.01353"},{"key":"2120_CR54","doi-asserted-by":"crossref","unstructured":"Huang, D.-A., Liao, S., Radhakrishnan, S., Yin, H., Molchanov, P., Yu, Z., Kautz, J.: Lita: Language instructed temporal-localization assistant. In: European Conference on Computer Vision, pp. 202\u2013218 (2024). Springer","DOI":"10.1007\/978-3-031-73039-9_12"},{"key":"2120_CR55","doi-asserted-by":"crossref","unstructured":"Guo, Y., Liu, J., Li, M., Cheng, D., Tang, X., Sui, D., Liu, Q., Chen, X., Zhao, K.: Vtg-llm: Integrating timestamp knowledge into video llms for enhanced video temporal grounding. arXiv preprint arXiv:2405.13382 (2024)","DOI":"10.1609\/aaai.v39i3.32341"},{"key":"2120_CR56","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Lan, X., Wang, X., Chen, L., Wang, Z., Zhu, W.: A closer look at temporal sentence grounding in videos: Dataset and metric. In: Proceedings of the 2nd International Workshop on Human-centric Multimedia Analysis, pp. 13\u201321 (2021)","DOI":"10.1145\/3475723.3484247"},{"key":"2120_CR57","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Towards debiasing temporal sentence grounding in video. arXiv preprint arXiv:2111.04321 (2021)"},{"key":"2120_CR58","doi-asserted-by":"crossref","unstructured":"Qi, Z., Yuan, Y., Ruan, X., Wang, S., Zhang, W., Huang, Q.: Bias-conflict sample synthesis and adversarial removal debias strategy for temporal sentence grounding in video. arXiv preprint arXiv:2401.07567 (2024)","DOI":"10.1609\/aaai.v38i5.28252"},{"key":"2120_CR59","doi-asserted-by":"crossref","unstructured":"Qi, Z., Yuan, Y., Ruan, X., Wang, S., Zhang, W., Huang, Q.: Collaborative debias strategy for temporal sentence grounding in video. IEEE Transactions on Circuits and Systems for Video Technology (2024)","DOI":"10.1109\/TCSVT.2024.3413074"},{"key":"2120_CR60","unstructured":"Shazeer, N., Mirhoseini, A., Maziarz, K., Davis, A., Le, Q., Hinton, G., Dean, J.: Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538 (2017)"},{"key":"2120_CR61","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"2120_CR62","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: Global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"issue":"9","key":"2120_CR63","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0106531","volume":"9","author":"H Poormohammadi","year":"2014","unstructured":"Poormohammadi, H., Eslahchi, C., Tusserkani, R.: Tripnet: a method for constructing rooted phylogenetic networks from rooted triplets. PLoS ONE 9(9), 106531 (2014)","journal-title":"PLoS ONE"},{"key":"2120_CR64","unstructured":"Rodriguez, C., Marrese-Taylor, E., Saleh, F.S., Li, H., Gould, S.: Proposal-free temporal moment localization of a natural-language query in video using guided attention. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2464\u20132473 (2020)"},{"key":"2120_CR65","doi-asserted-by":"crossref","unstructured":"Mun, J., Cho, M., Han, B.: Local-global video-text interactions for temporal grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10810\u201310819 (2020)","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"2120_CR66","first-page":"28442","volume":"34","author":"Y-W Chen","year":"2021","unstructured":"Chen, Y.-W., Tsai, Y.-H., Yang, M.-H.: End-to-end multi-modal video temporal grounding. Adv. Neural. Inf. Process. Syst. 34, 28442\u201328453 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2120_CR67","doi-asserted-by":"crossref","unstructured":"Zeng, R., Xu, H., Huang, W., Chen, P., Tan, M., Gan, C.: Dense regression network for video grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10287\u201310296 (2020)","DOI":"10.1109\/CVPR42600.2020.01030"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-02120-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-02120-w","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-02120-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,25]],"date-time":"2026-03-25T08:43:36Z","timestamp":1774428216000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-02120-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,4]]},"references-count":67,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2026,2]]}},"alternative-id":["2120"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-02120-w","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,1,4]]},"assertion":[{"value":"31 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 November 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"4 January 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"61"}}