{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T13:20:57Z","timestamp":1772284857218,"version":"3.50.1"},"reference-count":43,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T00:00:00Z","timestamp":1772236800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T00:00:00Z","timestamp":1772236800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["4244082"],"award-info":[{"award-number":["4244082"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62402034"],"award-info":[{"award-number":["62402034"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int. J. Mach. Learn. &amp; Cyber."],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s13042-026-03008-5","type":"journal-article","created":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T12:34:50Z","timestamp":1772282090000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Length matters: Length-Aware Transformer for temporal sentence grounding"],"prefix":"10.1007","volume":"17","author":[{"given":"Ziyi","family":"Liu","sequence":"first","affiliation":[]},{"given":"Yifan","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Jiawei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Liufeng","family":"Li","sequence":"additional","affiliation":[]},{"given":"Xiaolong","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Hongmin","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,2,28]]},"reference":[{"issue":"3","key":"3008_CR1","doi-asserted-by":"publisher","first-page":"1873","DOI":"10.1007\/s13042-024-02368-0","volume":"16","author":"Q He","year":"2025","unstructured":"He Q, Shi R, Chen L, Huo L (2025) Video anomaly detection based on multi-scale optical flow spatio-temporal enhancement and normality mining. Int J Mach Learn Cybern 16(3):1873\u20131888","journal-title":"Int J Mach Learn Cybern"},{"issue":"5","key":"3008_CR2","doi-asserted-by":"publisher","first-page":"1826","DOI":"10.1109\/TITS.2019.2913998","volume":"21","author":"L Claussmann","year":"2019","unstructured":"Claussmann L, Revilloud M, Gruyer D, Glaser S (2019) A review of motion planning for highway autonomous driving. IEEE Trans Intell Transp Syst 21(5):1826\u20131848","journal-title":"IEEE Trans Intell Transp Syst"},{"key":"3008_CR3","doi-asserted-by":"crossref","unstructured":"Gao J, Sun C, Yang Z, Nevatia R (2017) Tall: temporal activity localization via language query. In: Proceedings of the IEEE International Conference on computer vision, pp 5267\u20135275","DOI":"10.1109\/ICCV.2017.563"},{"key":"3008_CR4","doi-asserted-by":"crossref","unstructured":"Zhang S, Peng H, Fu J, Luo J (2020) Learning 2d temporal adjacent networks for moment localization with natural language. In: Proceedings of the AAAI Conference on artificial intelligence, vol 34, pp 12870\u201312877","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"3008_CR5","doi-asserted-by":"crossref","unstructured":"Carion, N, Massa, F, Synnaeve, G, Usunier, N, Kirillov, A, Zagoruyko S (2020) End-to-end object detection with transformers. In: European Conference on computer vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"3008_CR6","doi-asserted-by":"crossref","unstructured":"Moon W, Hyun S, Park S, Park D, Heo J-P (2023) Query-dependent video representation for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 23023\u201323033","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"3008_CR7","doi-asserted-by":"crossref","unstructured":"Xiao Y, Luo Z, Liu Y, Ma Y, Bian H, Ji Y, Yang Y, Li X (2024) Bridging the gap: A unified video comprehension framework for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 18709\u201318719","DOI":"10.1109\/CVPR52733.2024.01770"},{"key":"3008_CR8","doi-asserted-by":"publisher","first-page":"5002","DOI":"10.1109\/TMM.2022.3185900","volume":"25","author":"J Jiang","year":"2023","unstructured":"Jiang J, Liu Z, Zheng N (2023) Livlr: a lightweight visual-linguistic reasoning framework for video question answering. IEEE Trans Multimed 25:5002\u20135013","journal-title":"IEEE Trans Multimed"},{"key":"3008_CR9","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1007\/s11263-023-01858-y","volume":"132","author":"J Jiang","year":"2024","unstructured":"Jiang J, Liu Z, Zheng N (2024) Correlation information bottleneck: towards adapting pretrained multimodal models for robust visual question answering. Int J Comput Vis 132:185\u2013207","journal-title":"Int J Comput Vis"},{"key":"3008_CR10","doi-asserted-by":"crossref","unstructured":"Jiang J, Liu Z, Liu Y, Nan Z, Zheng N (2021) X-ggm: graph generative modeling for out-of-distribution generalization in visual question answering. In: ACM International Conference on multimedia, pp 199\u2013208","DOI":"10.1145\/3474085.3475350"},{"issue":"5","key":"3008_CR11","doi-asserted-by":"publisher","first-page":"3711","DOI":"10.1007\/s13042-024-02476-x","volume":"16","author":"S Artham","year":"2025","unstructured":"Artham S, Shaikh SH (2025) A transformer-based convolutional local attention (convloa) method for temporal action localization. Int J Mach Learn Cybern 16(5):3711\u20133728","journal-title":"Int J Mach Learn Cybern"},{"key":"3008_CR12","doi-asserted-by":"crossref","unstructured":"Zhu Z, Wang L, Tang W, Liu Z, Zheng N, Hua G (2022) Learning disentangled classification and localization representations for temporal action localization. In: Proceedings of the AAAI Conference on artificial intelligence, vol 36, pp 3644\u20133652","DOI":"10.1609\/aaai.v36i3.20277"},{"key":"3008_CR13","doi-asserted-by":"crossref","unstructured":"Liu Z, Liu Y (2025) Bridge the gap: from weak to full supervision for temporal action localization with pseudoformer. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 8711\u20138720","DOI":"10.1109\/CVPR52734.2025.00814"},{"key":"3008_CR14","doi-asserted-by":"crossref","unstructured":"Liu M, Wang X, Nie L, Tian Q, Chen B, Chua T-S (2018) Cross-modal moment localization in videos. In: Proceedings of the 26th ACM International Conference on multimedia, pp 843\u2013851","DOI":"10.1145\/3240508.3240549"},{"key":"3008_CR15","doi-asserted-by":"crossref","unstructured":"Wang Z, Wang L, Wu T, Li T, Wu G (2022) Negative sample matters: a renaissance of metric learning for temporal grounding. In: Proceedings of the AAAI Conference on artificial intelligence, vol 36, pp 2613\u20132623","DOI":"10.1609\/aaai.v36i3.20163"},{"key":"3008_CR16","doi-asserted-by":"crossref","unstructured":"Sun X, Wang X, Gao J, Liu Q, Zhou X (2022) You need to read again: multi-granularity perception network for moment retrieval in videos. In: Proceedings of the 45th International ACM SIGIR Conference on research and development in information retrieval, pp 1022\u20131032","DOI":"10.1145\/3477495.3532083"},{"key":"3008_CR17","doi-asserted-by":"crossref","unstructured":"Lu C, Chen L, Tan C, Li X, Xiao J (2019) Debug: a dense bottom-up grounding approach for natural language video localization. In: Proceedings of the 2019 Conference on empirical methods in natural language processing and the 9th International Joint Conference on natural language processing (EMNLP-IJCNLP), pp 5144\u20135153","DOI":"10.18653\/v1\/D19-1518"},{"key":"3008_CR18","doi-asserted-by":"crossref","unstructured":"Chen L, Lu C, Tang S, Xiao J, Zhang D, Tan C, Li X (2020) Rethinking the bottom-up framework for query-based video localization. In: Proceedings of the AAAI Conference on artificial intelligence, vol 34, pp 10551\u201310558","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"3008_CR19","doi-asserted-by":"crossref","unstructured":"Yang S, Wu X (2022) Entity-aware and motion-aware transformers for language-driven action localization. In: Proceedings of the thirty-first International Joint Conference on artificial intelligence, LD Raedt, Ed, pp 1552\u20131558","DOI":"10.24963\/ijcai.2022\/216"},{"key":"3008_CR20","doi-asserted-by":"crossref","unstructured":"Zhang B, Yang C, Jiang B, Komamizu T, Ide I (2025) Multi-proposal collaboration and multi-task training for weakly-supervised video moment retrieval. Int J Mach Learn Cybern, 16:1\u201316","DOI":"10.1007\/s13042-024-02520-w"},{"key":"3008_CR21","doi-asserted-by":"crossref","unstructured":"Ji W, Shi R, Wei Y, Zhao S, Zimmermann R (2024) Weakly supervised video moment retrieval via location-irrelevant proposal learning. In: Companion proceedings of the ACM Web Conference 2024, pp 1595\u20131603","DOI":"10.1145\/3589335.3651942"},{"key":"3008_CR22","first-page":"11846","volume":"34","author":"J Lei","year":"2021","unstructured":"Lei J, Berg TL, Bansal M (2021) Detecting moments and highlights in videos via natural language queries. Adv Neural Inf Process Syst 34:11846\u201311858","journal-title":"Adv Neural Inf Process Syst"},{"key":"3008_CR23","doi-asserted-by":"crossref","unstructured":"Liu D, Qu X, Hu W (2022) Reducing the vision and language bias for temporal sentence grounding. In: Proceedings of the ACM International Conference on multimedia, pp 4092\u20134101","DOI":"10.1145\/3503161.3547969"},{"key":"3008_CR24","unstructured":"Sun X, Wang L, Zhou S, Shi L, Xia K, Liu M, Wang Y, Hua G (2025) Moment quantization for video temporal grounding. arXiv preprint arXiv:2504.02286"},{"key":"3008_CR25","doi-asserted-by":"crossref","unstructured":"Lee P, Byun H (2024) Bam-detr: boundary-aligned moment detection transformer for temporal sentence grounding in videos. In: European Conference on computer vision, Springer, pp 220\u2013238","DOI":"10.1007\/978-3-031-72627-9_13"},{"key":"3008_CR26","unstructured":"Zhu X, Su W, Lu L, Li B, Wang X, Dai J (2020) Deformable detr: Deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159"},{"key":"3008_CR27","first-page":"6531","volume":"35","author":"S Shi","year":"2022","unstructured":"Shi S, Jiang L, Dai D, Schiele B (2022) Motion transformer with global intention localization and local movement refinement. Adv Neural Inf Process Syst 35:6531\u20136543","journal-title":"Adv Neural Inf Process Syst"},{"key":"3008_CR28","doi-asserted-by":"crossref","unstructured":"Wang Y, Zhang X, Yang T, Sun J (2022) Anchor detr: Query design for transformer-based detector. In: Proceedings of the AAAI Conference on artificial intelligence, vol 36, pp 2567\u20132575","DOI":"10.1609\/aaai.v36i3.20158"},{"key":"3008_CR29","unstructured":"Liu S, Li F, Zhang H, Yang X, Qi X, Su H, Zhu J, Zhang L (2022) Dab-detr: Dynamic anchor boxes are better queries for detr. arXiv preprint arXiv:2201.12329"},{"key":"3008_CR30","unstructured":"Zhang H, Li F, Liu S, Zhang L, Su H, Zhu J, Ni LM, Shum H-Y (2022) Dino: Detr with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605"},{"key":"3008_CR31","doi-asserted-by":"crossref","unstructured":"Sun X, Shi L, Wang L, Zhou S, Xia K, Wang Y, Hua G (2024) Diversifying query: Region-guided transformer for temporal sentence grounding. arXiv preprint arXiv:2406.00143","DOI":"10.1609\/aaai.v39i7.32766"},{"key":"3008_CR32","first-page":"65948","volume":"36","author":"P Li","year":"2024","unstructured":"Li P, Xie C-W, Xie H, Zhao L, Zhang L, Zheng Y, Zhao D, Zhang Y (2024) Momentdiff: generative video moment retrieval from random to real. Adv Neural Inf Process Syst 36:65948","journal-title":"Adv Neural Inf Process Syst"},{"key":"3008_CR33","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li J, Selvaraju R, Gotmare A, Joty S, Xiong C, Hoi SCH (2021) Align before fuse: vision and language representation learning with momentum distillation. Adv Neural Inf Process Syst 34:9694\u20139705","journal-title":"Adv Neural Inf Process Syst"},{"key":"3008_CR34","doi-asserted-by":"crossref","unstructured":"Sun H, Zhou M, Chen W, Xie W (2024) Tr-detr: Task-reciprocal transformer for joint moment retrieval and highlight detection. arXiv preprint arXiv:2401.02309","DOI":"10.1609\/aaai.v38i5.28304"},{"key":"3008_CR35","unstructured":"Dosovitskiy A, Beyer L, Kolesnikov A, Weissenborn D, Zhai X, Unterthiner T, Dehghani M, Minderer M, Heigold G, Gelly S, et al (2020) An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"3008_CR36","doi-asserted-by":"crossref","unstructured":"Liu Y, Li S, Wu Y, Chen C-W, Shan Y, Qie X (2022) Umt: unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp 3042\u20133051","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"3008_CR37","doi-asserted-by":"crossref","unstructured":"Lin KQ, Zhang P, Chen J, Pramanick S, Gao D, Wang AJ, Yan R, Shou MZ (2023) Univtg: towards unified video-language temporal grounding. In: Proceedings of the IEEE\/CVF International Conference on computer vision, pp 2794\u20132804","DOI":"10.1109\/ICCV51070.2023.00262"},{"key":"3008_CR38","doi-asserted-by":"crossref","unstructured":"Yang J, Wei P, Li H, Ren Z (2024) Task-driven exploration: Decoupling and inter-task feedback for joint moment retrieval and highlight detection. arXiv preprint arXiv:2404.09263","DOI":"10.1109\/CVPR52733.2024.01733"},{"key":"3008_CR39","unstructured":"Moon W, Hyun S, Lee S, Heo J-P (2023) Correlation-guided query-dependency calibration in video representation learning for temporal grounding. arXiv preprint arXiv:2311.08835"},{"key":"3008_CR40","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1162\/tacl_a_00207","volume":"1","author":"M Regneri","year":"2013","unstructured":"Regneri M, Rohrbach M, Wetzel D, Thater S, Schiele B, Pinkal M (2013) Grounding action descriptions in videos. Trans Assoc Comput Linguist 1:25\u201336","journal-title":"Trans Assoc Comput Linguist"},{"key":"3008_CR41","unstructured":"Diwan A, Peng P, Mooney R (2023) Zero-shot video moment retrieval with off-the-shelf models. In: Transfer learning for natural language processing workshop, PMLR, pp 10\u201321"},{"key":"3008_CR42","doi-asserted-by":"crossref","unstructured":"Feichtenhofer C, Fan H, Malik J, He K (2019) Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on computer vision, pp 6202\u20136211","DOI":"10.1109\/ICCV.2019.00630"},{"key":"3008_CR43","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J, et al (2021) Learning transferable visual models from natural language supervision. In: International Conference on machine learning, PMLR, pp 8748\u20138763"}],"container-title":["International Journal of Machine Learning and Cybernetics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-026-03008-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13042-026-03008-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13042-026-03008-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,28]],"date-time":"2026-02-28T12:35:04Z","timestamp":1772282104000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13042-026-03008-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,28]]},"references-count":43,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["3008"],"URL":"https:\/\/doi.org\/10.1007\/s13042-026-03008-5","relation":{},"ISSN":["1868-8071","1868-808X"],"issn-type":[{"value":"1868-8071","type":"print"},{"value":"1868-808X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,28]]},"assertion":[{"value":"7 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 January 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 February 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no competing interests.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"167"}}