{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T11:07:29Z","timestamp":1777633649617,"version":"3.51.4"},"reference-count":57,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T00:00:00Z","timestamp":1770940800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T00:00:00Z","timestamp":1770940800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"the Shanghai Municipal Fund for Promoting the Development of the Cultural and Creative Industries"},{"name":"the Shanghai Natural Science Foundation"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1007\/s00138-026-01799-9","type":"journal-article","created":{"date-parts":[[2026,2,13]],"date-time":"2026-02-13T16:06:15Z","timestamp":1770998775000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Guided by structure: boundary-aware modeling for moment retrieval and highlight detection"],"prefix":"10.1007","volume":"37","author":[{"given":"Bing","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Youxian","family":"Di","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenzhen","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingyu","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Youdong","family":"Ding","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongjin","family":"Huang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2026,2,13]]},"reference":[{"key":"1799_CR1","first-page":"11846","volume":"34","author":"J Lei","year":"2021","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Detecting moments and highlights in videos via natural language queries. Adv. Neural Inform. Process. Syst. (NeurIPS) 34, 11846\u201311858 (2021)","journal-title":"Adv. Neural Inform. Process. Syst. (NeurIPS)"},{"key":"1799_CR2","doi-asserted-by":"publisher","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.-P.: Query-dependent video representation for moment retrieval and highlight detection. In: The IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 23023\u201323033 (2023). https:\/\/doi.org\/10.1109\/CVPR52729.2023.02205","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"1799_CR3","unstructured":"Zhao, P., He, Z., Zhang, F., Lin, S., Zhou, F.: Ld-detr: Loop decoder detection transformer for video moment retrieval and highlight detection. arXiv:2501.10787 (2025)"},{"key":"1799_CR4","doi-asserted-by":"publisher","first-page":"4998","DOI":"10.1609\/aaai.v38i5.28304","volume":"38","author":"H Sun","year":"2024","unstructured":"Sun, H., Zhou, M., Chen, W., Xie, W.: TR-DETR: task-reciprocal transformer for joint moment retrieval and highlight detection. Proc. AAAI Conf. Artif. Intell. (AAAI) 38, 4998\u20135007 (2024). https:\/\/doi.org\/10.1609\/aaai.v38i5.28304","journal-title":"Proc. AAAI Conf. Artif. Intell. (AAAI)"},{"key":"1799_CR5","doi-asserted-by":"publisher","unstructured":"Yang, J., Wei, P., Li, H., Ren, Z.: Task-driven exploration: Decoupling and inter-task feedback for joint moment retrieval and highlight detection. In: Proceedings of IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18308\u201318318 (2024). https:\/\/doi.org\/10.1109\/CVPR52733.2024.01733","DOI":"10.1109\/CVPR52733.2024.01733"},{"key":"1799_CR6","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2024.110898","volume":"157","author":"Y Su","year":"2025","unstructured":"Su, Y., Tan, Y., An, S., Xing, M., Feng, Z.: Semantic-driven dual consistency learning for weakly supervised video anomaly detection. Pattern Recogn. 157, 110898 (2025). https:\/\/doi.org\/10.1016\/j.patcog.2024.110898","journal-title":"Pattern Recogn."},{"key":"1799_CR7","doi-asserted-by":"publisher","unstructured":"Liu, M., Wang, X., Nie, L., He, X., Chen, B., Chua, T.-S.: Attentive moment retrieval in videos. In: Proceedings 41st International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR), pp. 15\u201324 (2018). https:\/\/doi.org\/10.1145\/3209978.3210003","DOI":"10.1145\/3209978.3210003"},{"key":"1799_CR8","doi-asserted-by":"publisher","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with temporal language. In: Proceedings 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1380\u20131390. Association for Computational Linguistics, Brussels, Belgium (2018). https:\/\/doi.org\/10.18653\/v1\/D18-1168","DOI":"10.18653\/v1\/D18-1168"},{"key":"1799_CR9","doi-asserted-by":"publisher","unstructured":"Zhang, D., Dai, X., Wang, X., Wang, Y.-F., Davis, L.S.: Man: Moment alignment network for natural language moment retrieval via iterative graph adjustment. In: Proceedings of the IEEE conference on computer vision and pattern Recognition (CVPR), pp. 1247\u20131257 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00134","DOI":"10.1109\/CVPR.2019.00134"},{"key":"1799_CR10","doi-asserted-by":"publisher","unstructured":"Gao, J., Sun, X., Xu, M., Zhou, X., Ghanem, B.: Relation-aware video reading comprehension for temporal language grounding. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 3978\u20133988. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic (2021). https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.324","DOI":"10.18653\/v1\/2021.emnlp-main.324"},{"key":"1799_CR11","doi-asserted-by":"publisher","first-page":"9159","DOI":"10.1609\/aaai.v33i01.33019159","volume":"33","author":"Y Yuan","year":"2019","unstructured":"Yuan, Y., Mei, T., Zhu, W.: To find where you talk: temporal sentence localization in video with attention based location regression. Proc. AAAI Conf. Artif. Intell. (AAAI) 33, 9159\u20139166 (2019). https:\/\/doi.org\/10.1609\/aaai.v33i01.33019159","journal-title":"Proc. AAAI Conf. Artif. Intell. (AAAI)"},{"key":"1799_CR12","doi-asserted-by":"publisher","unstructured":"Rodriguez, C., Marrese-Taylor, E., Saleh, F.S., Li, H., Gould, S.: Proposal-free temporal moment localization of a natural-language query in video using guided attention. In: Proceedings IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 2464\u20132473 (2020). https:\/\/doi.org\/10.1109\/WACV45572.2020.9093328","DOI":"10.1109\/WACV45572.2020.9093328"},{"key":"1799_CR13","doi-asserted-by":"crossref","unstructured":"Gygli, M., Song, Y., Cao, L.: Video2gif: Automatic generation of animated gifs from video. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1001\u20131009 (2016)","DOI":"10.1109\/CVPR.2016.114"},{"key":"1799_CR14","doi-asserted-by":"publisher","unstructured":"Rochan, M., Krishna\u00a0Reddy, M.K., Ye, L., Wang, Y.: Adaptive video highlight detection by learning from user history. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J. (eds.) Computer Vision \u2013 ECCV 2020. Lecture Notes in Computer Science, 12366, pp. 261\u2013278. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_16","DOI":"10.1007\/978-3-030-58589-1_16"},{"key":"1799_CR15","doi-asserted-by":"publisher","unstructured":"Xiong, B., Kalantidis, Y., Ghadiyaram, D., Grauman, K.: Less is more: Learning highlight detection from video duration. In: Proceedings IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR), pp. 1258\u20131267 (2019). https:\/\/doi.org\/10.1109\/CVPR.2019.00135","DOI":"10.1109\/CVPR.2019.00135"},{"key":"1799_CR16","doi-asserted-by":"publisher","unstructured":"Escorcia, V., Soldan, M., Sivic, J., Ghanem, B., Russell, B.: Finding moments in video collections using natural language. arXiv:1907.12763 (2019) https:\/\/doi.org\/10.48550\/arXiv.1907.12763","DOI":"10.48550\/arXiv.1907.12763"},{"key":"1799_CR17","doi-asserted-by":"publisher","unstructured":"Wei, F., Wang, B., Ge, T., Jiang, Y., Li, W., Duan, L.: Learning pixel-level distinctions for video highlight detection. In: Proceedings IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3073\u20133082 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00308","DOI":"10.1109\/CVPR52688.2022.00308"},{"key":"1799_CR18","doi-asserted-by":"publisher","unstructured":"Badamdorj, T., Rochan, M., Wang, Y., Cheng, L.: Joint visual and audio learning for video highlight detection. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 8127\u20138137 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00802","DOI":"10.1109\/ICCV48922.2021.00802"},{"key":"1799_CR19","doi-asserted-by":"publisher","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C.W., Shan, Y., Qie, X.: Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3032\u20133041 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.00305","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"1799_CR20","doi-asserted-by":"publisher","unstructured":"Xiao, Y., Luo, Z., Liu, Y., Ma, Y., Bian, H., Ji, Y., Yang, Y., Li, X.: Bridging the gap: A unified video comprehension framework for moment retrieval and highlight detection. In: Proceedings IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 18709\u201318719 (2024). https:\/\/doi.org\/10.1109\/CVPR52733.2024.01770","DOI":"10.1109\/CVPR52733.2024.01770"},{"key":"1799_CR21","unstructured":"Moon, W., Hyun, S., Lee, S., Heo, J.-P.: Correlation-guided query-dependency calibration for video temporal grounding. arXiv:2311.08835 (2023)"},{"key":"1799_CR22","doi-asserted-by":"publisher","first-page":"7473","DOI":"10.1609\/aaai.v39i7.32804","volume":"39","author":"SJ Um","year":"2025","unstructured":"Um, S.J., Kim, D., Lee, S., Kim, J.U.: Watch video, catch keyword: context-aware keyword attention for moment retrieval and highlight detection. Proc. AAAI Conf. Artif. Intell. (AAAI) 39, 7473\u20137481 (2025). https:\/\/doi.org\/10.1609\/aaai.v39i7.32804","journal-title":"Proc. AAAI Conf. Artif. Intell. (AAAI)"},{"key":"1799_CR23","doi-asserted-by":"crossref","unstructured":"Xu, Y., Sun, Y., Zhai, B., Xie, Z., Jia, Y., Du, S.: Multi-modal fusion and query refinement network for video moment retrieval and highlight detection. In: 2024 IEEE International Conference on Multimedia and Expo (ICME), pp. 1\u20136 (2024). IEEE","DOI":"10.1109\/ICME57554.2024.10687844"},{"key":"1799_CR24","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3542081","author":"J Liu","year":"2025","unstructured":"Liu, J., He, Z., Nie, W., Zhang, Z., Su, Y.: What and where: semantic grasping and contextual scanning for moment retrieval and highlight detection. IEEE Trans. Circuits Syst. Video Technol. (2025). https:\/\/doi.org\/10.1109\/TCSVT.2025.3542081","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1799_CR25","doi-asserted-by":"publisher","unstructured":"Ma, H., Wang, G., Yu, F., Jia, Q., Ding, S.: Ms-detr: Towards effective video moment retrieval and highlight detection by joint motion-semantic learning. In: Proceedings of the ACM International Conference on Multimedia (ACM MM) (2025). https:\/\/doi.org\/10.48550\/arXiv.2507.12062 . Accepted; to appear","DOI":"10.48550\/arXiv.2507.12062"},{"issue":"5","key":"1799_CR26","doi-asserted-by":"publisher","first-page":"3955","DOI":"10.1109\/TCSVT.2024.3510950","volume":"35","author":"X Jiang","year":"2025","unstructured":"Jiang, X., Zhu, L., Xu, X., Shen, F., Yang, Y., Shen, H.T.: Query as supervision: toward low-cost and robust video moment and highlight retrieval. IEEE Trans. Circuits Syst. Video Technol. 35(5), 3955\u20133968 (2025). https:\/\/doi.org\/10.1109\/TCSVT.2024.3510950","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1799_CR27","unstructured":"Zhao, H., Lin, K., Yan, R., Li, Z.: Diffusionvmr: Diffusion model for video moment retrieval. arXiv: 2308.15109 (2023)"},{"key":"1799_CR28","unstructured":"Zhu, H., Wu, H., Li, Y., Zhang, Z., Chen, B., Zhu, L., Fang, Y., Zhai, G., Lin, W., Wang, S.: Adaptive image quality assessment via teaching large multimodal model to compare. In: Advances in Neural Information Processing Systems (2024)"},{"key":"1799_CR29","unstructured":"Zhu, L., Zeng, X., Chen, B., Chen, P., Li, Y.-H., Wang, S.: Leveraging diffusion knowledge for generative image compression with fractal frequency-aware band learning. arXiv:2503.11321 (2025) arXiv:2503.11321 [cs.CV]"},{"key":"1799_CR30","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Computer Vision \u2013 ECCV 2020, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1799_CR31","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., Uszkoreit, J., Houlsby, N.: An image is worth 16x16 words: Transformers for image recognition at scale. In: International Conference on Learning Representations (ICLR) (2021)"},{"key":"1799_CR32","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., J\u00e9gou, H.: Training data-efficient image transformers & distillation through attention. In: Meila, M., Zhang, T. (eds.) Proceedings of the 38th International Conference on Machine Learning (ICML). Proc. Mach. Learn. Res., vol. 139, pp. 10347\u201310357. PMLR, Online (2021)"},{"key":"1799_CR33","doi-asserted-by":"publisher","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: MDETR: Modulated detection for end-to-end multi-modal understanding. In: Proceedings IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1780\u20131790 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00180","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"1799_CR34","doi-asserted-by":"publisher","unstructured":"Ye, J., Tian, J., Yan, M., Yang, X., Wang, X., Zhang, J., He, L., Lin, X.: Shifting more attention to visual backbone: Query-modulated refinement networks for end-to-end visual grounding. In: Proceedings IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15481\u201315491. IEEE, Piscataway, NJ, USA (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01506","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"1799_CR35","doi-asserted-by":"crossref","unstructured":"Sun, Z., Cao, S., Yang, Y., Kitani, K.M.: Rethinking transformer-based set prediction for object detection. In: Proceedings IEEE\/CVF International Conference on Computer Vision. pp. 3611\u20133620 (2021)","DOI":"10.1109\/ICCV48922.2021.00359"},{"key":"1799_CR36","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., Krueger, G., Sutskever, I.: Learning transferable visual models from natural language supervision. In: Meila, M., Zhang, T. (eds.) Proceedings of 38th International Conference on Machine Learning (ICML). Proc. Mach. Learn. Res., vol. 139, pp. 8748\u20138763. PMLR, Online (2021)"},{"key":"1799_CR37","doi-asserted-by":"publisher","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 6202\u20136211 (2019). https:\/\/doi.org\/10.1109\/ICCV.2019.00630","DOI":"10.1109\/ICCV.2019.00630"},{"issue":"2","key":"1799_CR38","doi-asserted-by":"publisher","first-page":"339","DOI":"10.1007\/s10115-016-0987-z","volume":"51","author":"S Aminikhanghahi","year":"2017","unstructured":"Aminikhanghahi, S., Cook, D.J.: A survey of methods for time series change point detection. Knowl. Inf. Syst. 51(2), 339\u2013367 (2017). https:\/\/doi.org\/10.1007\/s10115-016-0987-z","journal-title":"Knowl. Inf. Syst."},{"key":"1799_CR39","doi-asserted-by":"crossref","unstructured":"Foote, J.: Automatic audio segmentation using a measure of audio novelty. In: Proceedings IEEE International Conference on Multimedia and Expo (ICME), pp. 452\u2013455. IEEE, New York, NY, USA (2000)","DOI":"10.1109\/ICME.2000.869637"},{"key":"1799_CR40","doi-asserted-by":"publisher","unstructured":"Song, Y., Vallmitjana, J., Stent, A., Jaimes, A.: Tvsum: Summarizing web videos using titles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5179\u20135187 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7299154","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"1799_CR41","doi-asserted-by":"publisher","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: Temporal activity localization via language query. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 5267\u20135275 (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.563","DOI":"10.1109\/ICCV.2017.563"},{"key":"1799_CR42","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"1799_CR43","doi-asserted-by":"publisher","unstructured":"Song, Y., Redi, M., Vallmitjana, J., Jaimes, A.: To click or not to click: Automatic selection of beautiful thumbnails from videos. In: Proceedings of the 25th ACM International Conference on Information and Knowledge Management (CIKM), pp. 659\u2013668 (2016). https:\/\/doi.org\/10.1145\/2983323.2983349","DOI":"10.1145\/2983323.2983349"},{"key":"1799_CR44","doi-asserted-by":"publisher","unstructured":"Liu, W., Mei, T., Zhang, Y., Che, C., Luo, J.: Multi-task deep visual-semantic embedding for video thumbnail selection. In: Proceedings of the IEEE conference on computer vision and pattern Recognition (CVPR), pp. 3707\u20133715 (2015). https:\/\/doi.org\/10.1109\/CVPR.2015.7298994","DOI":"10.1109\/CVPR.2015.7298994"},{"key":"1799_CR45","doi-asserted-by":"publisher","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV), pp. 5803\u20135812. IEEE, Piscataway, NJ, USA (2017). https:\/\/doi.org\/10.1109\/ICCV.2017.618","DOI":"10.1109\/ICCV.2017.618"},{"key":"1799_CR46","doi-asserted-by":"publisher","unstructured":"Lei, J., Yu, L., Berg, T.L., Bansal, M.: Tvr: A large-scale dataset for video-subtitle moment retrieval. In: European Conference on Computer Vision (ECCV), pp. 447\u2013463. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58589-1_27","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"1799_CR47","doi-asserted-by":"crossref","unstructured":"Lin, K.Q., Zhang, P., Chen, J., Pramanick, S., Gao, D., Wang, A.J., Yan, R., Shou, M.Z.: Univtg: Towards unified video-language temporal grounding. In: Proceedings of IEEE\/CVF International Conference on Computer Vision. pp. 2794\u20132804 (2023)","DOI":"10.1109\/ICCV51070.2023.00262"},{"issue":"11","key":"1799_CR48","doi-asserted-by":"publisher","first-page":"11271","DOI":"10.1109\/TCSVT.2024.3409897","volume":"34","author":"S Zhou","year":"2024","unstructured":"Zhou, S., Zhang, F., Wang, R., Zhou, F., Su, Z.: Subtask prior-driven optimized mechanism on joint video moment retrieval and highlight detection. IEEE Trans. Circuits Syst. Video Technol. 34(11), 11271\u201311285 (2024). https:\/\/doi.org\/10.1109\/TCSVT.2024.3409897","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"1799_CR49","doi-asserted-by":"crossref","unstructured":"Mahasseni, B., Lam, M., Todorovic, S.: Unsupervised video summarization with adversarial lstm networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 202\u2013211 (2017)","DOI":"10.1109\/CVPR.2017.318"},{"key":"1799_CR50","doi-asserted-by":"publisher","unstructured":"Wang, L., Liu, D., Puri, R., Metaxas, D.N.: Learning trailer moments in full-length movies with co-contrastive attention. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J. (eds.) Computer Vision \u2013 ECCV 2020. Lecture Notes in Computer Science, vol. 12363, pp. 300\u2013316. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58523-5_18","DOI":"10.1007\/978-3-030-58523-5_18"},{"key":"1799_CR51","doi-asserted-by":"crossref","unstructured":"Xu, M., Wang, H., Ni, B., Zhu, R., Sun, Z., Wang, C.: Cross-category video highlight detection via set-based learning. In: Proceedings ofIEEE\/CVF International Conference on Computer Vision, pp. 7970\u20137979 (2021)","DOI":"10.1109\/ICCV48922.2021.00787"},{"key":"1799_CR52","doi-asserted-by":"publisher","unstructured":"Chen, S., Jiang, Y.-G.: Semantic proposal for activity localization in videos via sentence query. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 33, pp. 8199\u20138206. AAAI Press, Palo Alto, CA, USA (2019). https:\/\/doi.org\/10.1609\/aaai.v33i01.33018199","DOI":"10.1609\/aaai.v33i01.33018199"},{"key":"1799_CR53","doi-asserted-by":"publisher","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2d temporal adjacent networks for moment localization with natural language. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 34, pp. 12870\u201312877. AAAI Press, Palo Alto, CA, USA (2020). https:\/\/doi.org\/10.1609\/aaai.v34i07.6984","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"1799_CR54","doi-asserted-by":"publisher","unstructured":"Gao, J., Xu, C.: Fast video moment retrieval. In: Proceedings of IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1523\u20131532 (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.00155","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"1799_CR55","doi-asserted-by":"publisher","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of IEEE Conference on Computer Vision Pattern Recognition (CVPR), pp. 4724\u20134733 (2017). https:\/\/doi.org\/10.1109\/CVPR.2017.502","DOI":"10.1109\/CVPR.2017.502"},{"key":"1799_CR56","doi-asserted-by":"crossref","unstructured":"Xu, A., Zheng, W.: Efficient and effective weakly-supervised action segmentation via action-transition-aware boundary alignment. In: Proceedings of IEEE\/CVF Conference on Computer Vision Pattern Recognition (CVPR), pp. 18253\u201318262 (2024)","DOI":"10.1109\/CVPR52733.2024.01728"},{"key":"1799_CR57","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Zhong, Y., Feng, C., Ma, L.: Unimd: Towards unifying moment retrieval and temporal action detection. In: European Conference on Computer Vision. (ECCV) (2024)","DOI":"10.1007\/978-3-031-72952-2_17"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-026-01799-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-026-01799-9","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-026-01799-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T18:12:45Z","timestamp":1773079965000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-026-01799-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2,13]]},"references-count":57,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2026,3]]}},"alternative-id":["1799"],"URL":"https:\/\/doi.org\/10.1007\/s00138-026-01799-9","relation":{},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2,13]]},"assertion":[{"value":"26 October 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 December 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 January 2026","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 February 2026","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"34"}}