{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T16:30:01Z","timestamp":1781368201131,"version":"3.54.1"},"reference-count":46,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.patcog.2026.114009","type":"journal-article","created":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T16:11:26Z","timestamp":1779466286000},"page":"114009","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"PA","title":["SFA: Scan, Focus, and Amplify toward guidance-aware answering for Video TextVQA"],"prefix":"10.1016","volume":"180","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2004-150X","authenticated-orcid":false,"given":"Haibin","family":"He","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0118-5217","authenticated-orcid":false,"given":"Qihuang","family":"Zhong","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3907-8820","authenticated-orcid":false,"given":"Juhua","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8104-3448","authenticated-orcid":false,"given":"Bo","family":"Du","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9895-394X","authenticated-orcid":false,"given":"Peng","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6595-7661","authenticated-orcid":false,"given":"Jing","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.114009_b1","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3241","article-title":"Dptext-detr: Towards better scene text detection with dynamic points in transformer","volume":"vol. 37","author":"Ye","year":"2023"},{"key":"10.1016\/j.patcog.2026.114009_b2","doi-asserted-by":"crossref","unstructured":"Maoyuan Ye, Jing Zhang, Shanshan Zhao, Juhua Liu, Tongliang Liu, Bo Du, Dacheng Tao, Deepsolo: Let transformer decoder with explicit points solo for text spotting, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 19348\u201319357.","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"10.1016\/j.patcog.2026.114009_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.108214","article-title":"A multimodal attention fusion network with a dynamic vocabulary for TextVQA","volume":"122","author":"Wu","year":"2022","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114009_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109337","article-title":"Beyond OCR+ VQA: Towards end-to-end reading and reasoning for robust and accurate textvqa","volume":"138","author":"Zeng","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114009_b5","doi-asserted-by":"crossref","DOI":"10.1109\/TMM.2025.3613125","article-title":"Tensor completion framework by graph refinement for incomplete multi-view clustering","author":"Wang","year":"2025","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.patcog.2026.114009_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2023.109531","article-title":"FETNet: Feature erasing and transferring network for scene text removal","volume":"140","author":"Lyu","year":"2023","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114009_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111513","article-title":"TextDiff: Enhancing scene text image super-resolution with mask-guided residual diffusion models","volume":"164","author":"Liu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114009_b8","article-title":"ArtGlyphDiffuser: Text-driven artistic glyph generation via style-to-CLIP projection and multi-level controlled diffusion","author":"Lu","year":"2025","journal-title":"Pattern Recognit."},{"issue":"2","key":"10.1016\/j.patcog.2026.114009_b9","doi-asserted-by":"crossref","first-page":"297","DOI":"10.1109\/TAI.2021.3116216","article-title":"Character-level street view text spotting based on deep multisegmentation network for smarter autonomous driving","volume":"3","author":"Zhang","year":"2021","journal-title":"IEEE Trans. Artif. Intell."},{"issue":"10","key":"10.1016\/j.patcog.2026.114009_b10","doi-asserted-by":"crossref","first-page":"2425","DOI":"10.1007\/s11263-022-01657-x","article-title":"Explainability of deep vision-based autonomous driving systems: Review and challenges","volume":"130","author":"Zablocki","year":"2022","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.patcog.2026.114009_b11","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.111099","article-title":"Text\u2013video retrieval re-ranking via multi-grained cross attention and frozen image encoders","volume":"159","author":"Dai","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114009_b12","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110818","article-title":"A large cross-modal video retrieval dataset with reading comprehension","volume":"157","author":"Wu","year":"2025","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.patcog.2026.114009_b13","first-page":"35549","article-title":"Towards video text visual question answering: Benchmark and baseline","volume":"35","author":"Zhao","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114009_b14","series-title":"International Conference on Document Analysis and Recognition","first-page":"137","article-title":"Reading between the lanes: Text videoqa on the road","author":"Tom","year":"2023"},{"key":"10.1016\/j.patcog.2026.114009_b15","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10275","article-title":"Track the answer: Extending textvqa from image to video with spatio-temporal clues","volume":"vol. 39","author":"Zhang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114009_b16","doi-asserted-by":"crossref","unstructured":"Yan Zhang, Gangyan Zeng, Daiqing Wu, Huawen Shen, Binbin Li, Yu Zhou, Can Ma, Xiaojun Bi, Gather and trace: Rethinking video textvqa from an instance-oriented perspective, in: Proceedings of the 33rd ACM International Conference on Multimedia, 2025, pp. 876\u2013885.","DOI":"10.1145\/3746027.3754718"},{"key":"10.1016\/j.patcog.2026.114009_b17","series-title":"InternVideo2. 5: Empowering video MLLMs with long and rich context modeling","author":"Wang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114009_b18","series-title":"Qwen2. 5-vl technical report","author":"Bai","year":"2025"},{"key":"10.1016\/j.patcog.2026.114009_b19","first-page":"25663","article-title":"Gomatching: A simple baseline for video text spotting via long and short term matching","volume":"37","author":"He","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.patcog.2026.114009_b20","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2026.3688478","article-title":"GoMatching++: Parameter-and data-efficient arbitrary-shaped video text spotting and benchmarking","author":"He","year":"2026","journal-title":"IEEE Transactions on Circuits and Systems for Video Technology"},{"key":"10.1016\/j.patcog.2026.114009_b21","doi-asserted-by":"crossref","unstructured":"Dejing Xu, Zhou Zhao, Jun Xiao, Fei Wu, Hanwang Zhang, Xiangnan He, Yueting Zhuang, Video question answering via gradually refined attention over appearance and motion, in: Proceedings of the 25th ACM International Conference on Multimedia, 2017, pp. 1645\u20131653.","DOI":"10.1145\/3123266.3123427"},{"key":"10.1016\/j.patcog.2026.114009_b22","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2804","article-title":"Video as conditional graph hierarchy for multi-granular question answering","volume":"vol. 36","author":"Xiao","year":"2022"},{"key":"10.1016\/j.patcog.2026.114009_b23","unstructured":"Yicong Li, Xiang Wang, Junbin Xiao, Wei Ji, Tat-Seng Chua, Invariant grounding for video question answering, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 2928\u20132937."},{"key":"10.1016\/j.patcog.2026.114009_b24","doi-asserted-by":"crossref","unstructured":"Linjie Li, Yen-Chun Chen, Yu Cheng, Zhe Gan, Licheng Yu, Jingjing Liu, HERO: Hierarchical Encoder for Video+ Language Omni-representation Pre-training, in: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, EMNLP, 2020, pp. 2046\u20132065.","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"10.1016\/j.patcog.2026.114009_b25","doi-asserted-by":"crossref","unstructured":"Antoine Yang, Antoine Miech, Josef Sivic, Ivan Laptev, Cordelia Schmid, Just ask: Learning to answer questions from millions of narrated videos, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 1686\u20131697.","DOI":"10.1109\/ICCV48922.2021.00171"},{"key":"10.1016\/j.patcog.2026.114009_b26","article-title":"GIT: A generative image-to-text transformer for vision and language","author":"Wang","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"10.1016\/j.patcog.2026.114009_b27","doi-asserted-by":"crossref","unstructured":"Jinpeng Wang, Yixiao Ge, Rui Yan, Yuying Ge, Kevin Qinghong Lin, Satoshi Tsutsui, Xudong Lin, Guanyu Cai, Jianping Wu, Ying Shan, et al., All in one: Exploring unified video-language pre-training, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 6598\u20136608.","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"10.1016\/j.patcog.2026.114009_b28","doi-asserted-by":"crossref","unstructured":"Jie Lei, Tamara Berg, Mohit Bansal, Revealing Single Frame Bias for Video-and-Language Learning, in: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), 2023, pp. 487\u2013507.","DOI":"10.18653\/v1\/2023.acl-long.29"},{"key":"10.1016\/j.patcog.2026.114009_b29","article-title":"Hi-SAM: Marrying segment anything model for hierarchical text segmentation","author":"Ye","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.patcog.2026.114009_b30","series-title":"Internlm2 technical report","author":"Cai","year":"2024"},{"key":"10.1016\/j.patcog.2026.114009_b31","series-title":"Qwen2 technical report","author":"Team","year":"2024"},{"key":"10.1016\/j.patcog.2026.114009_b32","series-title":"The llama 3 herd of models","author":"Dubey","year":"2024"},{"key":"10.1016\/j.patcog.2026.114009_b33","series-title":"VideoLLaMA 2: Advancing spatial-temporal modeling and audio understanding in video-LLMs","author":"Cheng","year":"2024"},{"key":"10.1016\/j.patcog.2026.114009_b34","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.patcog.2026.114009_b35","doi-asserted-by":"crossref","unstructured":"Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer, Sigmoid loss for language image pre-training, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 11975\u201311986.","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"10.1016\/j.patcog.2026.114009_b36","doi-asserted-by":"crossref","unstructured":"Ji Lin, Hongxu Yin, Wei Ping, Pavlo Molchanov, Mohammad Shoeybi, Song Han, Vila: On pre-training for visual language models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024, pp. 26689\u201326699.","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"10.1016\/j.patcog.2026.114009_b37","series-title":"Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling","author":"Chen","year":"2024"},{"key":"10.1016\/j.patcog.2026.114009_b38","series-title":"Videollama 3: Frontier multimodal foundation models for image and video understanding","author":"Zhang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114009_b39","series-title":"Emotion-Qwen: Training hybrid experts for unified emotion and general vision-language understanding","first-page":"arXiv","author":"Huang","year":"2025"},{"key":"10.1016\/j.patcog.2026.114009_b40","doi-asserted-by":"crossref","unstructured":"Geng Li, Jinglin Xu, Yunzhen Zhao, Yuxin Peng, Dyfo: A training-free dynamic focus visual search for enhancing lmms in fine-grained visual understanding, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 9098\u20139108.","DOI":"10.1109\/CVPR52734.2025.00850"},{"key":"10.1016\/j.patcog.2026.114009_b41","doi-asserted-by":"crossref","DOI":"10.1016\/j.cviu.2024.103983","article-title":"Cascade transformers with dynamic attention for video question answering","volume":"242","author":"Jiang","year":"2024","journal-title":"Comput. Vis. Image Underst."},{"key":"10.1016\/j.patcog.2026.114009_b42","doi-asserted-by":"crossref","unstructured":"Ali Furkan Biten, Ruben Tito, Andres Mafla, Lluis Gomez, Mar\u00e7al Rusinol, Ernest Valveny, CV Jawahar, Dimosthenis Karatzas, Scene text visual question answering, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4291\u20134301.","DOI":"10.1109\/ICCV.2019.00439"},{"key":"10.1016\/j.patcog.2026.114009_b43","doi-asserted-by":"crossref","unstructured":"Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan, Video-LLaVA: Learning United Visual Representation by Alignment Before Projection, in: Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, 2024, pp. 5971\u20135984.","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"10.1016\/j.patcog.2026.114009_b44","doi-asserted-by":"crossref","unstructured":"Zhijian Liu, Ligeng Zhu, Baifeng Shi, Zhuoyang Zhang, Yuming Lou, Shang Yang, Haocheng Xi, Shiyi Cao, Yuxian Gu, Dacheng Li, et al., Nvila: Efficient frontier visual language models, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 4122\u20134134.","DOI":"10.1109\/CVPR52734.2025.00390"},{"key":"10.1016\/j.patcog.2026.114009_b45","series-title":"Qwen2-VL: Enhancing vision-language model\u2019s perception of the world at any resolution","author":"Wang","year":"2024"},{"issue":"9","key":"10.1016\/j.patcog.2026.114009_b46","doi-asserted-by":"crossref","first-page":"4019","DOI":"10.1007\/s11263-024-02063-1","article-title":"End-to-end video text spotting with transformer","volume":"132","author":"Wu","year":"2024","journal-title":"Int. J. Comput. Vis."}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032600974X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032600974X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,8]],"date-time":"2026-06-08T14:50:30Z","timestamp":1780930230000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S003132032600974X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":46,"alternative-id":["S003132032600974X"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114009","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"SFA: Scan, Focus, and Amplify toward guidance-aware answering for Video TextVQA","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.114009","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"Crown Copyright \u00a9 2026 Published by Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"114009"}}