{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T15:06:36Z","timestamp":1780931196353,"version":"3.54.1"},"reference-count":45,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.113971","type":"journal-article","created":{"date-parts":[[2026,5,19]],"date-time":"2026-05-19T15:11:46Z","timestamp":1779203506000},"page":"113971","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"PA","title":["VideoAligner: Text-driven feature decomposition for precise video\u2013text alignment"],"prefix":"10.1016","volume":"180","author":[{"given":"Zhanzhou","family":"Feng","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shunan","family":"Mao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yaowei","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9053-9314","authenticated-orcid":false,"given":"Shiliang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"issue":"4","key":"10.1016\/j.patcog.2026.113971_b1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3633781","article-title":"Efficient video transformers via spatial-temporal token merging for action recognition","volume":"20","author":"Feng","year":"2024","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.patcog.2026.113971_b2","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.113971_b3","doi-asserted-by":"crossref","unstructured":"J. Xu, T. Mei, T. Yao, Y. Rui, Msr-vtt: A large video description dataset for bridging video and language, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016, pp. 5288\u20135296.","DOI":"10.1109\/CVPR.2016.571"},{"key":"10.1016\/j.patcog.2026.113971_b4","series-title":"Frontiers of Multimedia Research","first-page":"3","article-title":"Deep learning for video classification and captioning","author":"Wu","year":"2017"},{"key":"10.1016\/j.patcog.2026.113971_b5","doi-asserted-by":"crossref","unstructured":"L. Anne Hendricks, O. Wang, E. Shechtman, J. Sivic, T. Darrell, B. Russell, Localizing moments in video with natural language, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 5803\u20135812.","DOI":"10.1109\/ICCV.2017.618"},{"key":"10.1016\/j.patcog.2026.113971_b6","first-page":"38655","article-title":"Text-adaptive multiple visual prototype matching for video-text retrieval","volume":"35","author":"Lin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113971_b7","doi-asserted-by":"crossref","unstructured":"P. Li, C.-W. Xie, L. Zhao, H. Xie, J. Ge, Y. Zheng, D. Zhao, Y. Zhang, Progressive spatio-temporal prototype matching for text-video retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 4100\u20134110.","DOI":"10.1109\/ICCV51070.2023.00379"},{"key":"10.1016\/j.patcog.2026.113971_b8","doi-asserted-by":"crossref","unstructured":"Z. Wang, Y.-L. Sung, F. Cheng, G. Bertasius, M. Bansal, Unified Coarse-to-Fine Alignment for Video-Text Retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 2816\u20132827.","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"10.1016\/j.patcog.2026.113971_b9","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IV 16","first-page":"214","article-title":"Multi-modal transformer for video retrieval","author":"Gabeur","year":"2020"},{"key":"10.1016\/j.patcog.2026.113971_b10","doi-asserted-by":"crossref","unstructured":"I. Croitoru, S.-V. Bogolin, M. Leordeanu, H. Jin, A. Zisserman, S. Albanie, Y. Liu, Teachtext: Crossmodal generalized distillation for text-video retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 11583\u201311593.","DOI":"10.1109\/ICCV48922.2021.01138"},{"key":"10.1016\/j.patcog.2026.113971_b11","first-page":"30291","article-title":"Expectation-maximization contrastive learning for compact video-and-language representations","volume":"35","author":"Jin","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113971_b12","doi-asserted-by":"crossref","unstructured":"S. Liu, H. Fan, S. Qian, Y. Chen, W. Ding, Z. Wang, Hit: Hierarchical transformer with momentum contrast for video-text retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 11915\u201311925.","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"10.1016\/j.patcog.2026.113971_b13","doi-asserted-by":"crossref","unstructured":"M. Wray, D. Larlus, G. Csurka, D. Damen, Fine-grained action retrieval through multiple parts-of-speech embeddings, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 450\u2013459.","DOI":"10.1109\/ICCV.2019.00054"},{"issue":"2","key":"10.1016\/j.patcog.2026.113971_b14","doi-asserted-by":"crossref","first-page":"1013","DOI":"10.1109\/TPAMI.2024.3490776","article-title":"Evolved hierarchical masking for self-supervised learning","volume":"47","author":"Feng","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.113971_b15","doi-asserted-by":"crossref","unstructured":"Z. Feng, S. Zhang, Evolved part masking for self-supervised learning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 10386\u201310395.","DOI":"10.1109\/CVPR52729.2023.01001"},{"key":"10.1016\/j.patcog.2026.113971_b16","doi-asserted-by":"crossref","unstructured":"J. Lei, L. Li, L. Zhou, Z. Gan, T.L. Berg, M. Bansal, J. Liu, Less is more: Clipbert for video-and-language learning via sparse sampling, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 7331\u20137341.","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"10.1016\/j.patcog.2026.113971_b17","doi-asserted-by":"crossref","unstructured":"M. Bain, A. Nagrani, G. Varol, A. Zisserman, Frozen in time: A joint video and image encoder for end-to-end retrieval, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 1728\u20131738.","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"10.1016\/j.patcog.2026.113971_b18","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","article-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning","volume":"508","author":"Luo","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2026.113971_b19","doi-asserted-by":"crossref","unstructured":"S. Zhao, L. Zhu, X. Wang, Y. Yang, Centerclip: Token clustering for efficient text-video retrieval, in: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, 2022, pp. 970\u2013981.","DOI":"10.1145\/3477495.3531950"},{"issue":"4","key":"10.1016\/j.patcog.2026.113971_b20","doi-asserted-by":"crossref","first-page":"739","DOI":"10.3390\/electronics15040739","article-title":"Causal visual\u2013semantic enhancement for video-text retrieval","volume":"15","author":"Lan","year":"2026","journal-title":"Electronics"},{"key":"10.1016\/j.patcog.2026.113971_b21","article-title":"Htvr: Hierarchical text-to-video retrieval based on relative similarity","author":"Zhang","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.113971_b22","doi-asserted-by":"crossref","unstructured":"S.K. Gorti, N. Vouitsis, J. Ma, K. Golestan, M. Volkovs, A. Garg, G. Yu, X-pool: Cross-modal language-video attention for text-video retrieval, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5006\u20135015.","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"10.1016\/j.patcog.2026.113971_b23","doi-asserted-by":"crossref","unstructured":"S. Huang, B. Gong, Y. Pan, J. Jiang, Y. Lv, Y. Li, D. Wang, VoP: Text-Video Co-operative Prompt Tuning for Cross-Modal Retrieval, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 6565\u20136574.","DOI":"10.1109\/CVPR52729.2023.00635"},{"key":"10.1016\/j.patcog.2026.113971_b24","doi-asserted-by":"crossref","unstructured":"P. Jin, J. Huang, P. Xiong, S. Tian, C. Liu, X. Ji, L. Yuan, J. Chen, Video-text as game players: Hierarchical banzhaf interaction for cross-modal representation learning, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 2472\u20132482.","DOI":"10.1109\/CVPR52729.2023.00244"},{"key":"10.1016\/j.patcog.2026.113971_b25","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103151","article-title":"TC-MGC: Text-conditioned multi-grained contrastive learning for text\u2013video retrieval","volume":"121","author":"Jing","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.patcog.2026.113971_b26","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"13323","article-title":"Temporal calibrating and distilling for scene-text aware text-video retrieval","volume":"vol. 40","author":"Zhao","year":"2026"},{"key":"10.1016\/j.patcog.2026.113971_b27","doi-asserted-by":"crossref","unstructured":"Y. Ma, G. Xu, X. Sun, M. Yan, J. Zhang, R. Ji, X-clip: End-to-end multi-grained contrastive learning for video-text retrieval, in: Proceedings of the 30th ACM International Conference on Multimedia, 2022, pp. 638\u2013647.","DOI":"10.1145\/3503161.3547910"},{"key":"10.1016\/j.patcog.2026.113971_b28","doi-asserted-by":"crossref","unstructured":"Y. Ge, Y. Ge, X. Liu, D. Li, Y. Shan, X. Qie, P. Luo, Bridging video-text retrieval with multiple choice questions, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 16167\u201316176.","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"10.1016\/j.patcog.2026.113971_b29","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3976","article-title":"SCAN: Self-calibrated autoregression for high-quality visual generation","volume":"vol. 40","author":"Feng","year":"2026"},{"key":"10.1016\/j.patcog.2026.113971_b30","doi-asserted-by":"crossref","unstructured":"Z. Feng, Q. Guo, X. Xiao, R. Xu, M. Yang, S. Zhang, Unified Video Generation via Next-Set Prediction in Continuous Domain, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2025, pp. 19427\u201319438.","DOI":"10.1109\/ICCV51701.2025.01806"},{"key":"10.1016\/j.patcog.2026.113971_b31","doi-asserted-by":"crossref","unstructured":"A. Reddy, A. Martin, E. Yang, A. Yates, K. Sanders, K. Murray, R. Kriz, C.M. De Melo, B. Van Durme, R. Chellappa, Video-colbert: Contextualized late interaction for text-to-video retrieval, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 19691\u201319701.","DOI":"10.1109\/CVPR52734.2025.01834"},{"key":"10.1016\/j.patcog.2026.113971_b32","doi-asserted-by":"crossref","unstructured":"Z. Bian, C. Jiang, F. Zhu, Z. Zhang, Selective multi-grained alignment for text-video retrieval, in: Companion Proceedings of the ACM on Web Conference 2025, 2025, pp. 873\u2013877.","DOI":"10.1145\/3701716.3715525"},{"key":"10.1016\/j.patcog.2026.113971_b33","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.113971_b34","doi-asserted-by":"crossref","unstructured":"C.D. Manning, M. Surdeanu, J. Bauer, J.R. Finkel, S. Bethard, D. McClosky, The Stanford CoreNLP natural language processing toolkit, in: Proceedings of 52nd Annual Meeting of the Association for Computational Linguistics: System Demonstrations, 2014, pp. 55\u201360.","DOI":"10.3115\/v1\/P14-5010"},{"key":"10.1016\/j.patcog.2026.113971_b35","series-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss","author":"Cheng","year":"2021"},{"key":"10.1016\/j.patcog.2026.113971_b36","doi-asserted-by":"crossref","unstructured":"S.-V. Bogolin, I. Croitoru, H. Jin, Y. Liu, S. Albanie, Cross modal retrieval with querybank normalisation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5194\u20135205.","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"10.1016\/j.patcog.2026.113971_b37","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.patcog.2026.113971_b38","series-title":"European Conference on Computer Vision","first-page":"319","article-title":"Ts2-net: Token shift and selection transformer for text-video retrieval","author":"Liu","year":"2022"},{"key":"10.1016\/j.patcog.2026.113971_b39","doi-asserted-by":"crossref","unstructured":"R. Liu, J. Huang, G. Li, J. Feng, X. Wu, T.H. Li, Revisiting temporal modeling for clip-based image-to-video knowledge transferring, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 6555\u20136564.","DOI":"10.1109\/CVPR52729.2023.00634"},{"key":"10.1016\/j.patcog.2026.113971_b40","series-title":"European Conference on Computer Vision","first-page":"76","article-title":"Ea-vtr: Event-aware video-text retrieval","author":"Ma","year":"2024"},{"key":"10.1016\/j.patcog.2026.113971_b41","doi-asserted-by":"crossref","unstructured":"H. Zhang, P. Zeng, L. Gao, J. Song, H.T. Shen, Mpt: Multi-grained prompt tuning for text-video retrieval, in: Proceedings of the 32nd ACM International Conference on Multimedia, 2024, pp. 1206\u20131214.","DOI":"10.1145\/3664647.3680839"},{"key":"10.1016\/j.patcog.2026.113971_b42","doi-asserted-by":"crossref","unstructured":"P. Jin, H. Li, Z. Cheng, K. Li, X. Ji, C. Liu, L. Yuan, J. Chen, Diffusionret: Generative text-video retrieval with diffusion model, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 2470\u20132481.","DOI":"10.1109\/ICCV51070.2023.00234"},{"key":"10.1016\/j.patcog.2026.113971_b43","series-title":"European Conference on Computer Vision","first-page":"444","article-title":"Lightweight attentional feature fusion: A new baseline for text-to-video retrieval","author":"Hu","year":"2022"},{"key":"10.1016\/j.patcog.2026.113971_b44","unstructured":"I. Loshchilov, F. Hutter, SGDR: Stochastic Gradient Descent with Warm Restarts, in: International Conference on Learning Representations, 2017."},{"key":"10.1016\/j.patcog.2026.113971_b45","doi-asserted-by":"crossref","unstructured":"E. Loper, S. Bird, Nltk: The natural language toolkit, in: Proceedings of the ACL-02 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing and Computational Linguistics, 2002, pp. 63\u201370.","DOI":"10.3115\/1118108.1118117"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009362?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0031320326009362?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T14:49:39Z","timestamp":1780930179000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0031320326009362"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":45,"alternative-id":["S0031320326009362"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113971","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"VideoAligner: Text-driven feature decomposition for precise video\u2013text alignment","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113971","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier Ltd.","name":"copyright","label":"Copyright"}],"article-number":"113971"}}