{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:09:45Z","timestamp":1777871385570,"version":"3.51.4"},"reference-count":56,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/100012542","name":"Sichuan Provincial Science and Technology Support Program","doi-asserted-by":"publisher","award":["2024NSFSC1463"],"award-info":[{"award-number":["2024NSFSC1463"]}],"id":[{"id":"10.13039\/100012542","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of 
China","doi-asserted-by":"publisher","award":["62306067"],"award-info":[{"award-number":["62306067"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62220106008"],"award-info":[{"award-number":["62220106008"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2025A1515010108"],"award-info":[{"award-number":["2025A1515010108"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Knowledge-Based Systems"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.knosys.2026.115856","type":"journal-article","created":{"date-parts":[[2026,3,24]],"date-time":"2026-03-24T16:04:19Z","timestamp":1774368259000},"page":"115856","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["GASE: Generalized adaptive static enhancement for temporal sentence 
grounding"],"prefix":"10.1016","volume":"342","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9350-1389","authenticated-orcid":false,"given":"Ran","family":"Ran","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9769-3141","authenticated-orcid":false,"given":"Kaiwen","family":"Shen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3912-1742","authenticated-orcid":false,"given":"Jiwei","family":"Wei","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6248-9938","authenticated-orcid":false,"given":"Ruikun","family":"Chai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6287-6969","authenticated-orcid":false,"given":"Shiyuan","family":"He","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1846-8889","authenticated-orcid":false,"given":"Zeyu","family":"Ma","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4915-0486","authenticated-orcid":false,"given":"Yuyang","family":"Zhou","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2345-0974","authenticated-orcid":false,"given":"Malu","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5070-4511","authenticated-orcid":false,"given":"Yang","family":"Yang","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.knosys.2026.115856_bib0001","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"5267","article-title":"Tall: temporal activity localization via language query","author":"Gao","year":"2017"},{"key":"10.1016\/j.knosys.2026.115856_bib0002","article-title":"Semantic conditioned dynamic modulation for temporal sentence grounding in videos","volume":"32","author":"Yuan","year":"2019","journal-title":"Proceedings of the Conference on Neural Inforation Processing 
Systems (NeurIPS)"},{"key":"10.1016\/j.knosys.2026.115856_bib0003","series-title":"Proceedings of the ACM International Conference on Multimedia","first-page":"3041","article-title":"Unifying two-stream encoders with transformers for cross-modal retrieval","author":"Bin","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0004","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111621","article-title":"Adversarial temporal sentence grounding by learning from external data","volume":"165","author":"Han","year":"2025","journal-title":"Pattern Recognit."},{"issue":"5","key":"10.1016\/j.knosys.2026.115856_bib0005","doi-asserted-by":"crossref","first-page":"2491","DOI":"10.1109\/TCSVT.2022.3223725","article-title":"Few-shot temporal sentence grounding via memory-guided semantic learning","volume":"33","author":"Liu","year":"2022","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"10.1016\/j.knosys.2026.115856_bib0006","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2025.3575957","article-title":"Fine-grained alignment and interaction for video grounding with cross-modal semantic hierarchical graph","author":"Ran","year":"2025","journal-title":"IEEE Trans. Circ. Syst. 
Video Technol."},{"key":"10.1016\/j.knosys.2026.115856_bib0007","series-title":"Proceedings of the Empirical Methods in Natural Language Processing (EMNLP)","first-page":"162","article-title":"Temporally grounding natural sentence in video","author":"Chen","year":"2018"},{"key":"10.1016\/j.knosys.2026.115856_bib0008","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"12032","article-title":"G2L: Semantically aligned and uniform video grounding via geodesic and game theory","author":"Li","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0009","doi-asserted-by":"crossref","first-page":"3263","DOI":"10.1109\/TMM.2023.3309551","article-title":"Hierarchical local-global transformer for temporal sentence grounding","volume":"26","author":"Fang","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2026.115856_bib0010","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10810","article-title":"Local-global video-text interactions for temporal grounding","author":"Mun","year":"2020"},{"key":"10.1016\/j.knosys.2026.115856_bib0011","series-title":"Proceedings of the ACM International Conference on Multimedia","first-page":"4092","article-title":"Reducing the vision and language bias for temporal sentence grounding","author":"Liu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115856_bib0012","series-title":"Proceedings of the Empirical Methods in Natural Language Processing (EMNLP)","first-page":"9810","article-title":"On pursuit of designing multi-modal transformer for video grounding","author":"Cao","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0013","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"2794","article-title":"Univtg: towards unified video-language temporal 
grounding","author":"Lin","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0014","series-title":"Proceedings of the Empirical Methods in Natural Language Processing (EMNLP)","first-page":"590","article-title":"Rethinking the video sampling and reasoning strategies for temporal sentence grounding","author":"Zhu","year":"2022"},{"key":"10.1016\/j.knosys.2026.115856_bib0015","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"23045","article-title":"Towards generalisable video moment retrieval: visual-dynamic injection to image-text pre-training","author":"Luo","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"14794","article-title":"Text-visual prompting for efficient 2d temporal video grounding","author":"Zhang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0017","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0018","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"23034","article-title":"Vita-CLIP: video and text adaptive CLIP via multimodal prompting","author":"Wasim","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109985","article-title":"Temporal segment dropout for human action video recognition","volume":"146","author":"Zhang","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115856_bib0020","series-title":"Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI)","first-page":"2256","article-title":"Bliva: a simple multimodal llm for better handling 
of text-rich visual questions","volume":"38","author":"Hu","year":"2024"},{"issue":"10","key":"10.1016\/j.knosys.2026.115856_bib0021","doi-asserted-by":"crossref","first-page":"6534","DOI":"10.1109\/TPAMI.2021.3088863","article-title":"Universal weighting metric learning for cross-modal retrieval","volume":"44","author":"Wei","year":"2022","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115856_bib0022","series-title":"Proceedings of the 29th ACM International Conference on Multimedia","first-page":"3835","article-title":"Meta self-paced learning for cross-modal matching","author":"Wei","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0023","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"5803","article-title":"Localizing moments in video with natural language","author":"Anne Hendricks","year":"2017"},{"issue":"8","key":"10.1016\/j.knosys.2026.115856_bib0024","doi-asserted-by":"crossref","first-page":"10443","DOI":"10.1109\/TPAMI.2023.3258628","article-title":"Temporal sentence grounding in videos: a survey and future directions","volume":"45","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.knosys.2026.115856_bib0025","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110819","article-title":"Triadic temporal-semantic alignment for weakly-supervised video moment retrieval","volume":"156","author":"Liu","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115856_bib0026","article-title":"Adaptive prototype learning for weakly-supervised temporal action localization","author":"Luo","year":"2024","journal-title":"IEEE Trans. 
Image Process."},{"key":"10.1016\/j.knosys.2026.115856_bib0027","series-title":"Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI)","first-page":"6684","article-title":"CDTR: Semantic alignment for video moment retrieval using concept decomposition transformer","volume":"39","author":"Ran","year":"2025"},{"key":"10.1016\/j.knosys.2026.115856_bib0028","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"11235","article-title":"Context-aware biaffine localizing network for temporal sentence grounding","author":"Liu","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0029","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"6543","article-title":"Span-based localizing network for natural language video localization","author":"Zhang","year":"2020"},{"key":"10.1016\/j.knosys.2026.115856_bib0030","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"2765","article-title":"Interventional video grounding with dual contrastive learning","author":"Nan","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0031","series-title":"Proceedings of the International Conference on Learning Representations (ICLR)","article-title":"ImagenHub: standardizing the evaluation of conditional image generation models","author":"Ku","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0032","series-title":"Proceedings of the ACM International Conference on Multimedia","first-page":"426","article-title":"Cross-modality representation interactive learning for multimodal sentiment analysis","author":"Huang","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0033","series-title":"Proceedings of the 32nd ACM International Conference on Multimedia","first-page":"7249","article-title":"Prior knowledge integration via llm encoding and pseudo event 
regulation for video moment retrieval","author":"Jiang","year":"2024"},{"key":"10.1016\/j.knosys.2026.115856_bib0034","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111583","article-title":"Layerlink: bridging remote sensing object detection and large vision models with efficient fine-tuning","volume":"165","author":"Zhu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115856_bib0035","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111460","article-title":"Prompt-Ladder: memory-efficient prompt tuning for vision-language models on edge devices","volume":"163","author":"Cai","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115856_bib0036","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111300","article-title":"CTPT: continual test-time prompt tuning for vision-language models","volume":"161","author":"Wang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.knosys.2026.115856_bib0037","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"19113","article-title":"Maple: multi-modal prompt learning","author":"Khattak","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0038","article-title":"Language-aware spatial-temporal collaboration for referring video segmentation","author":"Hui","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"8","key":"10.1016\/j.knosys.2026.115856_bib0039","doi-asserted-by":"crossref","first-page":"10055","DOI":"10.1109\/TPAMI.2023.3262578","article-title":"Local-global context aware transformer for language-guided video segmentation","volume":"45","author":"Liang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. 
Intell."},{"key":"10.1016\/j.knosys.2026.115856_bib0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"658","article-title":"Generalized intersection over union: a metric and a loss for bounding box regression","author":"Rezatofighi","year":"2019"},{"key":"10.1016\/j.knosys.2026.115856_bib0041","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"706","article-title":"Dense-captioning events in videos","author":"Krishna","year":"2017"},{"key":"10.1016\/j.knosys.2026.115856_bib0042","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"510","article-title":"Hollywood in homes: crowdsourcing data collection for activity understanding","author":"Sigurdsson","year":"2016"},{"key":"10.1016\/j.knosys.2026.115856_bib0043","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"3032","article-title":"Compositional temporal grounding with structured variational cross-graph correspondence learning","author":"Li","year":"2022"},{"key":"10.1016\/j.knosys.2026.115856_bib0044","series-title":"European Conference on Computer Vision","first-page":"398","article-title":"SHINE: Saliency-aware HIerarchical NEgative ranking for compositional temporal grounding","author":"Cheng","year":"2024"},{"key":"10.1016\/j.knosys.2026.115856_bib0045","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"10287","article-title":"Dense regression network for video grounding","author":"Zeng","year":"2020"},{"key":"10.1016\/j.knosys.2026.115856_bib0046","series-title":"Proceedings of the ACM International Conference on Multimedia","first-page":"4116","article-title":"Dual path interaction network for video moment 
localization","author":"Wang","year":"2020"},{"key":"10.1016\/j.knosys.2026.115856_bib0047","series-title":"Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI)","first-page":"2613","article-title":"Negative sample matters: a renaissance of metric learning for temporal grounding","volume":"36","author":"Wang","year":"2022"},{"key":"10.1016\/j.knosys.2026.115856_bib0048","doi-asserted-by":"crossref","first-page":"8297","DOI":"10.1109\/TMM.2023.3303712","article-title":"Relational network via cascade CRF for video language grounding","volume":"26","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.knosys.2026.115856_bib0049","series-title":"Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI)","first-page":"12870","article-title":"Learning 2d temporal adjacent networks for moment localization with natural language","volume":"34","author":"Zhang","year":"2020"},{"key":"10.1016\/j.knosys.2026.115856_bib0050","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"11573","article-title":"Support-set based cross-supervision for video grounding","author":"Ding","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0051","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"1523","article-title":"Fast video moment retrieval","author":"Gao","year":"2021"},{"key":"10.1016\/j.knosys.2026.115856_bib0052","series-title":"Proceedings of the Association for the Advancement of Artificial Intelligence (AAAI)","article-title":"Phrase-level temporal relationship mining for temporal sentence localization","author":"Zheng","year":"2023"},{"key":"10.1016\/j.knosys.2026.115856_bib0053","article-title":"Context-enhanced video moment retrieval with large language models","author":"Liu","year":"2025","journal-title":"IEEE Trans. 
Multimed."},{"key":"10.1016\/j.knosys.2026.115856_bib0054","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","first-page":"6299","article-title":"Quo vadis, action recognition? A new model and the kinetics dataset","author":"Carreira","year":"2017"},{"key":"10.1016\/j.knosys.2026.115856_bib0055","series-title":"Proceedings of the IEEE International Conference on Computer Vision (ICCV)","first-page":"6202","article-title":"Slowfast networks for video recognition","author":"Feichtenhofer","year":"2019"},{"key":"10.1016\/j.knosys.2026.115856_bib0056","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"17675","article-title":"MC-bench: a benchmark for multi-context visual grounding in the era of mllms","author":"Xu","year":"2025"}],"container-title":["Knowledge-Based Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126005824?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0950705126005824?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T17:15:10Z","timestamp":1777569310000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0950705126005824"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":56,"alternative-id":["S0950705126005824"],"URL":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115856","relation":{},"ISSN":["0950-7051"],"issn-type":[{"value":"0950-7051","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"GASE: Generalized adaptive static enhancement 
for temporal sentence grounding","name":"articletitle","label":"Article Title"},{"value":"Knowledge-Based Systems","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.knosys.2026.115856","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115856"}}