{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T12:15:03Z","timestamp":1775132103011,"version":"3.50.1"},"reference-count":53,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:00:00Z","timestamp":1770076800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T00:00:00Z","timestamp":1770076800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s00530-025-02147-z","type":"journal-article","created":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T03:41:49Z","timestamp":1770090109000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Distinguishing semantically similar queries in temporal video grounding via LLM-generated query"],"prefix":"10.1007","volume":"32","author":[{"given":"Yibo","family":"Dang","sequence":"first","affiliation":[]},{"given":"Zhaobo","family":"Qi","sequence":"additional","affiliation":[]},{"given":"Xinyan","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xinzhe","family":"Han","sequence":"additional","affiliation":[]},{"given":"Fei","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Weigang","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,3]]},"reference":[{"key":"2147_CR1","unstructured":"Badamdorj, T., Rochan, M., Wang, Y., Cheng, L.: Contrastive learning for unsupervised video highlight detection"},{"key":"2147_CR2","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: Temporal activity localization via language query. In: 2017 IEEE International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"2147_CR3","doi-asserted-by":"crossref","unstructured":"Liu, W., Mei, T., Zhang, Y., Che, C., Luo, J.: Multi-task deep visual-semantic embedding for video thumbnail selection. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298994"},{"key":"2147_CR4","unstructured":"Escorcia, V., Soldan, M., Sivic, J., Ghanem, B., Russell, B.: Temporal localization of moments in video collections with natural language. Vision and Pattern Recognition, arXiv Computer Vision and Pattern Recognition. arXiv Computer (2019)"},{"key":"2147_CR5","unstructured":"Lei, J., Berg, T., Bansal, M.: Detecting moments and highlights in videos via natural language queries. Neur Inform Process Syst (2021)"},{"key":"2147_CR6","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End Object Detection with Transformers. In: Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I, pp. 213\u2013229. (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"2147_CR7","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.-P.: Query-dependent video representation for moment retrieval and highlight detection (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"2147_CR8","doi-asserted-by":"crossref","unstructured":"Xu, Y., Sun, Y., Li, Y., Shi, Y., Zhu, X., Du, S.: Mh-detr: Video moment and highlight detection with cross-modal transformer (2023)","DOI":"10.1109\/IJCNN60899.2024.10650814"},{"key":"2147_CR9","doi-asserted-by":"crossref","unstructured":"Lin, K., Zhang, P., Chen, J., Pramanick, S., Gao, D., Wang, A., Yan, R., Shou, M.: Univtg: Towards unified video-language temporal grounding (2023)","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"2147_CR10","unstructured":"Lei, J., Berg, T., Bansal, M.: Qvhighlights: Detecting moments and highlights in videos via natural language queries. Cornell University - arXiv, Cornell University (2021)"},{"key":"2147_CR11","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C., Shan, Y., Qie, X.: Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection"},{"key":"2147_CR12","doi-asserted-by":"crossref","unstructured":"Wang, Z., Wang, L., Wu, T., Li, T., Wu, G.: Negative sample matters: A renaissance of metric learning for temporal grounding. Proceedings of the AAAI Conference on Artificial Intelligence, 2613\u20132623 (2022)","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"2147_CR13","doi-asserted-by":"crossref","unstructured":"Nan, G., Qiao, R., Xiao, Y., Liu, J., Leng, S., Zhang, H., Lu, W.: Interventional video grounding with dual contrastive learning. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00279"},{"key":"2147_CR14","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Ma, L., Wang, J., Liu, W., Zhu, W.: Semantic conditioned dynamic modulation for temporal sentence grounding in videos. IEEE Transactions on Pattern Analysis and Machine Intelligence, 1\u20131 (2020)","DOI":"10.1109\/TPAMI.2020.3038993"},{"key":"2147_CR15","doi-asserted-by":"crossref","unstructured":"Li, P., Xie, C.-W., Xie, H., Zhao, L., Zhang, L., Zheng, Y., Zhao, D., Zhang, Y.: Momentdiff: Generative video moment retrieval from random to real (2023)","DOI":"10.52202\/075280-2880"},{"key":"2147_CR16","unstructured":"Moon, W., Hyun, S., Lee, S., Heo, J.-P.: Correlation-guided query-dependency calibration in video representation learning for temporal grounding (2023)"},{"key":"2147_CR17","doi-asserted-by":"crossref","unstructured":"Xiao, Y., Luo, Z., Liu, Y., Ma, Y., Bian, H., Ji, Y., Yang, Y., Li, X.: Bridging the gap: A unified video comprehension framework for moment retrieval and highlight detection (2023)","DOI":"10.1109\/CVPR52733.2024.01770"},{"key":"2147_CR18","unstructured":"Paul, D., Parvez, M.R., Mohammed, N., Rahman, S.: Videolights: Feature refinement and cross-task alignment transformer for joint video highlight detection and moment retrieval. arXiv preprint arXiv:2412.01558 (2024)"},{"issue":"11","key":"2147_CR19","doi-asserted-by":"publisher","first-page":"10972","DOI":"10.1109\/TCSVT.2024.3413074","volume":"34","author":"Z Qi","year":"2024","unstructured":"Qi, Z., Yuan, Y., Ruan, X., Wang, S., Zhang, W., Huang, Q.: Collaborative debias strategy for temporal sentence grounding in video. IEEE Trans. Circuits Syst. Video Technol. 34(11), 10972\u201310986 (2024)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"2147_CR20","doi-asserted-by":"crossref","unstructured":"Qi, Z., Yuan, Y., Ruan, X., Wang, S., Zhang, W., Huang, Q.: Bias-conflict sample synthesis and adversarial removal debias strategy for temporal sentence grounding in video. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 4533\u20134541 (2024)","DOI":"10.1609\/aaai.v38i5.28252"},{"key":"2147_CR21","doi-asserted-by":"crossref","unstructured":"Badamdorj, T., Rochan, M., Wang, Y., Cheng, L.: Joint visual and audio learning for video highlight detection. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00802"},{"key":"2147_CR22","doi-asserted-by":"crossref","unstructured":"Xu, M., Wang, H., Ni, B., Zhu, R., Sun, Z., Wang, C.: Cross-category video highlight detection via set-based learning. Cornell University - arXiv, Cornell University (2021)","DOI":"10.1109\/ICCV48922.2021.00787"},{"key":"2147_CR23","doi-asserted-by":"crossref","unstructured":"Sun, M., Farhadi, A., Seitz, S.: Ranking domain-specific highlights by analyzing Edited Videos, pp. 787\u2013802 (2014)","DOI":"10.1007\/978-3-319-10590-1_51"},{"key":"2147_CR24","doi-asserted-by":"crossref","unstructured":"Hong, F.-T., Huang, X., Li, W.-H., Zheng, W.-S.: MINI-Net: multiple instance ranking network for video highlight detection, pp. 345\u2013360 (2020)","DOI":"10.1007\/978-3-030-58601-0_21"},{"key":"2147_CR25","unstructured":"Song, Y., Vallmitjana, J., Stent, A., Jaimes, A.: Tvsum: Summarizing web videos using titles. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)"},{"key":"2147_CR26","doi-asserted-by":"crossref","unstructured":"Gygli, M., Grabner, H., Riemenschneider, H., Van\u00a0Gool, L.: Creating summaries from user videos, pp. 505\u2013520 (2014)","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"2147_CR27","unstructured":"Radford, A., Kim, J., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Amanda, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I.: Learning transferable visual models from natural language supervision. Cornell University - arXiv, Cornell University (2021)"},{"key":"2147_CR28","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: 2019 IEEE\/CVF International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"2147_CR29","doi-asserted-by":"crossref","unstructured":"Sun, H., Zhou, M., Chen, W., Xie, W.: Tr-detr: Task-reciprocal transformer for joint moment retrieval and highlight detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 38, pp. 4998\u20135007 (2024)","DOI":"10.1609\/aaai.v38i5.28304"},{"key":"2147_CR30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2025.3558854","author":"J Hu","year":"2025","unstructured":"Hu, J., Guo, D., Li, K., Si, Z., Yang, X., Chang, X., Wang, M.: Unified static and dynamic network: efficient temporal filtering for video grounding. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025). https:\/\/doi.org\/10.1109\/TPAMI.2025.3558854","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2147_CR31","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110898","volume":"157","author":"Y Su","year":"2025","unstructured":"Su, Y., Tan, Y., An, S., Xing, M., Feng, Z.: Semantic-driven dual consistency learning for weakly supervised video anomaly detection. Pattern Recognition 157, 110898 (2025)","journal-title":"Pattern Recognition"},{"key":"2147_CR32","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.111978","volume":"299","author":"Y Su","year":"2024","unstructured":"Su, Y., Tan, Y., Xing, M., An, S.: Vpe-wsvad: visual prompt exemplars for weakly-supervised video anomaly detection. Knowledge-Based Systems 299, 111978 (2024)","journal-title":"Knowledge-Based Systems"},{"issue":"7","key":"2147_CR33","doi-asserted-by":"publisher","first-page":"5177","DOI":"10.1109\/TPAMI.2024.3415087","volume":"47","author":"K Liu","year":"2024","unstructured":"Liu, K., Qu, M., Liu, Y., Wei, Y., Zhe, W., Zhao, Y., Liu, W.: Single-frame supervision for spatio-temporal video grounding. IEEE Trans. Pattern Anal. Mach. Intell. 47(7), 5177\u20135191 (2024)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2147_CR34","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. Vision and Pattern Recognition, arXiv Computer Vision and Pattern Recognition. arXiv Computer (2020)"},{"key":"2147_CR35","doi-asserted-by":"crossref","unstructured":"Dai, X., Chen, Y., Yang, J., Zhang, P., Yuan, L., Zhang, L.: Dynamic detr: End-to-end object detection with dynamic attention. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00298"},{"key":"2147_CR36","unstructured":"Li, F., Zhang, H., Liu, S., Guo, J., Ni, L., Zhang, L.: Dn-detr: Accelerate detr training by introducing query denoising"},{"key":"2147_CR37","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: Deformable transformers for end-to-end object detection. arXiv Computer Vision and Pattern Recognition, arXiv Computer Vision and Pattern Recognition (2020)"},{"key":"2147_CR38","unstructured":"Huang, K.-C., Wu, T.-H., Su, H.-T., Hsu, W.: Monodtr: Monocular 3d object detection with depth-aware transformer"},{"key":"2147_CR39","unstructured":"Cheng, B., Schwing, A., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. Cornell University - arXiv, Cornell University - arXiv (2021)"},{"key":"2147_CR40","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"2147_CR41","unstructured":"Zhang, R., Qiu, H., Wang, T., Xu, X., Guo, Z., Qiao, Y., Gao, P., Li, H.: Monodetr: depth-aware transformer for monocular 3d object detection"},{"key":"2147_CR42","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Tubedetr: Spatio-temporal video grounding with transformers"},{"key":"2147_CR43","unstructured":"Liu, S., Li, F., Zhang, H., Yang, X., Qi, X., Su, H., Zhu, J., Zhang, L.: Dab-detr: Dynamic anchor boxes are better queries for detr"},{"key":"2147_CR44","doi-asserted-by":"crossref","unstructured":"Meng, D., Chen, X., Fan, Z., Zeng, G., Li, H., Yuan, Y., Sun, L., Wang, J.: Conditional detr for fast training convergence. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00363"},{"key":"2147_CR45","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: Mdetr - modulated detection for end-to-end multi-modal understanding. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"2147_CR46","unstructured":"Li, K., He, Y., Wang, Y., Li, Y., Wang, W., Luo, P., Wang, Y., Wang, L., Qiao, Y.: Videochat: Chat-centric video understanding. arXiv preprint arXiv:2305.06355 (2023)"},{"key":"2147_CR47","doi-asserted-by":"crossref","unstructured":"Lin, B., Ye, Y., Zhu, B., Cui, J., Ning, M., Jin, P., Yuan, L.: Video-llava: Learning united visual representation by alignment before projection. In: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pp. 5971\u20135984 (2024)","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2147_CR48","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., Khan, F.: Video-chatgpt: Towards detailed video understanding via large vision and language models. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 12585\u201312602 (2024)","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2147_CR49","doi-asserted-by":"crossref","unstructured":"Wang, J., Zhang, Z., Liu, Z., Li, Y., Ge, J., Xie, H., Zhang, Y.: Spacevllm: Endowing multimodal large language model with spatio-temporal video grounding capability. arXiv preprint arXiv:2503.13983 (2025)","DOI":"10.1609\/aaai.v40i12.37956"},{"key":"2147_CR50","doi-asserted-by":"crossref","unstructured":"Cai, C., Zhang, R., Gao, J., Wu, K., Yap, K.-H., Wang, Y.: Temporal sentence grounding with temporally global textual knowledge. In: 2024 IEEE International Conference on Multimedia and Expo (ICME), IEEE. pp. 1\u20136 (2024)","DOI":"10.1109\/ICME57554.2024.10687646"},{"key":"2147_CR51","doi-asserted-by":"crossref","unstructured":"Zheng, M., Cai, X., Chen, Q., Peng, Y., Liu, Y.: Training-free video temporal grounding using large-scale pre-trained models. In: European Conference on Computer Vision, pp. 20\u201337, Springer (2024)","DOI":"10.1007\/978-3-031-73007-8_2"},{"key":"2147_CR52","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1162\/tacl_a_00207","volume":"1","author":"M Regneri","year":"2013","unstructured":"Regneri, M., Rohrbach, M., Wetzel, D., Thater, S., Schiele, B., Pinkal, M.: Grounding action descriptions in videos. Transactions of the Association for Computational Linguistics 1, 25\u201336 (2013)","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"2147_CR53","unstructured":"Liu, A., Feng, B., Xue, B., Wang, B., Wu, B., Lu, C., Zhao, C., Deng, C., Zhang, C., Ruan, C., et al.: Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-02147-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-02147-z","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-02147-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T11:38:20Z","timestamp":1775129900000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-02147-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,3]]},"references-count":53,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["2147"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-02147-z","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,3]]},"assertion":[{"value":"31 August 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 December 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"98"}}