{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T14:07:17Z","timestamp":1779458837109,"version":"3.53.1"},"reference-count":55,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100006004","name":"Tohoku University","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100006004","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.imavis.2026.106027","type":"journal-article","created":{"date-parts":[[2026,5,20]],"date-time":"2026-05-20T19:32:36Z","timestamp":1779305556000},"page":"106027","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["A real-driven image synthesis framework for adapting pre-trained scene text detectors"],"prefix":"10.1016","volume":"172","author":[{"given":"Yaohou","family":"Fan","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2011-8105","authenticated-orcid":false,"given":"Zhengmi","family":"Tang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tomo","family":"Miyazaki","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yongsong","family":"Huang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shinichiro","family":"Omachi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2026.106027_b1","doi-asserted-by":"crossref","unstructured":"M. Liao, Z. Wan, C. Yao, K. Chen, X. Bai, Real-time scene text detection with differentiable binarization, in: Proceedings of the AAAI Conference on Artificial Intelligence, 2020.","DOI":"10.1609\/aaai.v34i07.6812"},{"key":"10.1016\/j.imavis.2026.106027_b2","doi-asserted-by":"crossref","unstructured":"X. Zhou, C. Yao, H. Wen, Y. Wang, S. Zhou, W. He, J. Liang, East: an efficient and accurate scene text detector, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017.","DOI":"10.1109\/CVPR.2017.283"},{"key":"10.1016\/j.imavis.2026.106027_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2019.107026","article-title":"Realtime multi-scale scene text detection with scale-based region proposal network","author":"He","year":"2020","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.imavis.2026.106027_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2013.10.003","article-title":"Keyword spotting in unconstrained handwritten Chinese documents using contextual word model","author":"Huang","year":"2013","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104840","article-title":"Multi-modal spatial relational attention networks for visual question answering","author":"Yao","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105768","article-title":"CODNet: Context-based object detection network for multimodal image captioning and virtual question answering","author":"Gupta","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b7","doi-asserted-by":"crossref","unstructured":"A. Gupta, A. Vedaldi, A. Zisserman, Synthetic Data for Text Localisation in Natural Images, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2016.","DOI":"10.1109\/CVPR.2016.254"},{"key":"10.1016\/j.imavis.2026.106027_b8","doi-asserted-by":"crossref","unstructured":"W. Wang, E. Xie, X. Li, W. Hou, T. Lu, G. Yu, S. Shao, Shape robust text detection with progressive scale expansion network, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019.","DOI":"10.1109\/CVPR.2019.00956"},{"key":"10.1016\/j.imavis.2026.106027_b9","doi-asserted-by":"crossref","unstructured":"X. Qin, P. Lyu, C. Zhang, Y. Zhou, K. Yao, P. Zhang, H. Lin, W. Wang, Towards robust real-time scene text detection: From semantic to instance representation learning, in: Proceedings of the ACM International Conference on Multimedia, 2023.","DOI":"10.1145\/3581783.3611801"},{"key":"10.1016\/j.imavis.2026.106027_b10","doi-asserted-by":"crossref","unstructured":"C. Xue, W. Zhang, Y. Hao, S. Lu, P.H. Torr, S. Bai, Language matters: A weakly supervised vision-language pre-training approach for scene text detection and spotting, in: Proceedings of the European Conference on Computer Vision, 2022.","DOI":"10.1007\/978-3-031-19815-1_17"},{"key":"10.1016\/j.imavis.2026.106027_b11","doi-asserted-by":"crossref","unstructured":"S. Song, J. Wan, Z. Yang, J. Tang, W. Cheng, X. Bai, C. Yao, Vision-language pre-training for boosting scene text detectors, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022.","DOI":"10.1109\/CVPR52688.2022.01523"},{"key":"10.1016\/j.imavis.2026.106027_b12","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105541","article-title":"Dynamic feature extraction and histopathology domain shift alignment for mitosis detection","author":"Han","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b13","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105727","article-title":"Your image generator is your new private dataset","author":"Resmini","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b14","article-title":"Semantic-aware for point cloud domain adaptation with self-distillation learning","author":"Yang","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b15","doi-asserted-by":"crossref","unstructured":"T. Guan, W. Shen, X. Yang, X. Wang, X. Yang, Bridging synthetic and real worlds for pre-training scene text detectors, in: Proceedings of the European Conference on Computer Vision, 2024.","DOI":"10.1007\/978-3-031-72784-9_24"},{"key":"10.1016\/j.imavis.2026.106027_b16","series-title":"Proceedings of the IEEE International Conference on Multimedia and Expo","article-title":"UNITS: Unsupervised intermediate training stage for scene text detection","author":"Guo","year":"2022"},{"key":"10.1016\/j.imavis.2026.106027_b17","doi-asserted-by":"crossref","unstructured":"D. Karatzas, L. Gomez-Bigorda, A. Nicolaou, S. Ghosh, A. Bagdanov, M. Iwamura, J. Matas, L. Neumann, V.R. Chandrasekhar, S. Lu, et al., ICDAR 2015 competition on robust reading, in: Proceedings of the International Conference on Document Analysis and Recognition, 2015.","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"10.1016\/j.imavis.2026.106027_b18","doi-asserted-by":"crossref","unstructured":"C.-K. Ch\u2019ng, C.S. Chan, C.-L. Liu, Total-text: toward orientation robustness in scene text detection, in: International Journal on Document Analysis and Recognition, 2020.","DOI":"10.1007\/s10032-019-00334-z"},{"key":"10.1016\/j.imavis.2026.106027_b19","series-title":"Detecting curve text in the wild: New dataset and new solution","author":"Yuliang","year":"2017"},{"key":"10.1016\/j.imavis.2026.106027_b20","doi-asserted-by":"crossref","unstructured":"F. Zhan, S. Lu, C. Xue, Verisimilar image synthesis for accurate detection and recognition of texts in scenes, in: Proceedings of the European Conference on Computer Vision, 2018.","DOI":"10.1007\/978-3-030-01237-3_16"},{"key":"10.1016\/j.imavis.2026.106027_b21","doi-asserted-by":"crossref","DOI":"10.1109\/TIP.2023.3326685","article-title":"A scene-text synthesis engine achieved through learning from decomposed real-world data","author":"Tang","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2026.106027_b22","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2025.113306","article-title":"VQ-STE: Scene text erasing with mask refinement and vector-quantized texture dictionary","author":"Tang","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.imavis.2026.106027_b23","doi-asserted-by":"crossref","DOI":"10.1109\/TIP.2021.3125260","article-title":"Stroke-based scene text erasing using synthetic data for training","author":"Tang","year":"2021","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2026.106027_b24","series-title":"Unrealtext: Synthesizing realistic scene text images from the unreal world","author":"Long","year":"2020"},{"key":"10.1016\/j.imavis.2026.106027_b25","series-title":"Twenty Newsgroups","author":"Mitchell","year":"1997"},{"key":"10.1016\/j.imavis.2026.106027_b26","doi-asserted-by":"crossref","unstructured":"Y. Zhu, J. Liu, F. Gao, W. Liu, X. Wang, P. Wang, F. Huang, C. Yao, Z. Yang, Visual text generation in the wild, in: Proceedings of the European Conference on Computer Vision, 2024.","DOI":"10.1007\/978-3-031-73668-1_6"},{"key":"10.1016\/j.imavis.2026.106027_b27","doi-asserted-by":"crossref","unstructured":"L. Zhang, X. Chen, Y. Wang, Y. Lu, Y. Qiao, Brush your text: Synthesize any scene text on images via diffusion model, in: Proceedings of the AAAI Conference on Artificial Intelligence, 2024.","DOI":"10.1609\/aaai.v38i7.28550"},{"key":"10.1016\/j.imavis.2026.106027_b28","series-title":"Glyphdraw: Seamlessly rendering text with intricate spatial structures in text-to-image generation","author":"Ma","year":"2023"},{"key":"10.1016\/j.imavis.2026.106027_b29","series-title":"Anytext: Multilingual visual text generation and editing","author":"Tuo","year":"2023"},{"key":"10.1016\/j.imavis.2026.106027_b30","series-title":"Proceedings of the European Conference on Computer Vision","article-title":"Textdiffuser-2: Unleashing the power of language models for text rendering","author":"Chen","year":"2024"},{"key":"10.1016\/j.imavis.2026.106027_b31","article-title":"Image re-identification: Where self-supervision meets vision-language learning","author":"Wang","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b32","article-title":"Attention head purification: A new perspective to harness CLIP for domain generalization","author":"Wang","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b33","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104751","article-title":"COME: Clip-OCR and master ObjEct for text image captioning","author":"Lv","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b34","doi-asserted-by":"crossref","unstructured":"Q. Wan, H. Ji, L. Shen, Self-attention based text knowledge mining for text detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021.","DOI":"10.1109\/CVPR46437.2021.00592"},{"key":"10.1016\/j.imavis.2026.106027_b35","doi-asserted-by":"crossref","unstructured":"K. Wang, H. Xie, Y. Wang, D. Zhang, Y. Qu, Z. Gao, Y. Zhang, Masked text modeling: a self-supervised pre-training method for scene text detection, in: Proceedings of the ACM International Conference on Multimedia, 2023.","DOI":"10.1145\/3581783.3612370"},{"key":"10.1016\/j.imavis.2026.106027_b36","article-title":"Turning a clip model into a scene text spotter","author":"Yu","year":"2024","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.106027_b37","article-title":"Domain adaptive object detection via synthetically generated intermediate domain and progressive feature alignment","author":"Gao","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.106027_b38","doi-asserted-by":"crossref","unstructured":"A. Das, S. Biswas, A. Banerjee, J. Llad\u00f3s, U. Pal, S. Bhattacharya, Harnessing the power of multi-lingual datasets for pre-training: Towards enhancing text spotting performance, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2024.","DOI":"10.1109\/WACV57701.2024.00077"},{"key":"10.1016\/j.imavis.2026.106027_b39","doi-asserted-by":"crossref","unstructured":"S. Yun, D. Han, S.J. Oh, S. Chun, J. Choe, Y. Yoo, Cutmix: Regularization strategy to train strong classifiers with localizable features, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019.","DOI":"10.1109\/ICCV.2019.00612"},{"key":"10.1016\/j.imavis.2026.106027_b40","doi-asserted-by":"crossref","unstructured":"V. Olsson, W. Tranheden, J. Pinto, L. Svensson, Classmix: Segmentation-based data augmentation for semi-supervised learning, in: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, 2021.","DOI":"10.1109\/WACV48630.2021.00141"},{"key":"10.1016\/j.imavis.2026.106027_b41","article-title":"Deep unsupervised domain adaptation: A review of recent advances and perspectives","author":"Liu","year":"2022","journal-title":"APSIPA Trans. Signal Inf. Process."},{"key":"10.1016\/j.imavis.2026.106027_b42","doi-asserted-by":"crossref","unstructured":"Z. Yang, Y. Lu, J. Wang, X. Yin, D. Florencio, L. Wang, C. Zhang, L. Zhang, J. Luo, Tap: Text-aware pre-training for text-vqa and text-caption, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021.","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"10.1016\/j.imavis.2026.106027_b43","doi-asserted-by":"crossref","unstructured":"Z. Zhao, L. Yang, S. Long, J. Pi, L. Zhou, J. Wang, Augmentation matters: A simple-yet-effective approach to semi-supervised semantic segmentation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023.","DOI":"10.1109\/CVPR52729.2023.01092"},{"key":"10.1016\/j.imavis.2026.106027_b44","doi-asserted-by":"crossref","unstructured":"M. Ye, J. Zhang, S. Zhao, J. Liu, B. Du, D. Tao, DPText-DETR: Towards Better Scene Text Detection with Dynamic Points in Transformer, in: Proceedings of the AAAI Conference on Artificial Intelligence, 2023.","DOI":"10.1609\/aaai.v37i3.25430"},{"key":"10.1016\/j.imavis.2026.106027_b45","doi-asserted-by":"crossref","unstructured":"Y. Fan, T. Miyazaki, Z. Tang, J. Wang, Y. Huang, S. Omachi, Scene Text Reconstructor: A Contextual-Aware Masking Framework for Pre-training Text Detectors, in: International Conference on Neural Information Processing, 2025.","DOI":"10.1007\/978-981-95-4378-6_13"},{"key":"10.1016\/j.imavis.2026.106027_b46","doi-asserted-by":"crossref","unstructured":"C. Duan, P. Fu, S. Guo, Q. Jiang, X. Wei, Odm: A text-image further alignment pre-training approach for scene text detection and spotting, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024.","DOI":"10.1109\/CVPR52733.2024.01476"},{"key":"10.1016\/j.imavis.2026.106027_b47","doi-asserted-by":"crossref","unstructured":"Y. Zhu, J. Chen, L. Liang, Z. Kuang, L. Jin, W. Zhang, Fourier Contour Embedding for Arbitrary-Shaped Text Detection, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021.","DOI":"10.1109\/CVPR46437.2021.00314"},{"key":"10.1016\/j.imavis.2026.106027_b48","article-title":"Inverse-like antagonistic scene text spotting via reading-order estimation and dynamic sampling","author":"Zhang","year":"2024","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2026.106027_b49","doi-asserted-by":"crossref","unstructured":"M. Liang, J.-W. Ma, X. Zhu, J. Qin, X.-C. Yin, LayoutFormer: Hierarchical Text Detection Towards Scene Text Understanding, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2024.","DOI":"10.1109\/CVPR52733.2024.01483"},{"key":"10.1016\/j.imavis.2026.106027_b50","article-title":"Real-time scene text detection with differentiable binarization and adaptive scale fusion","author":"Liao","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.imavis.2026.106027_b51","article-title":"CT-Net: Arbitrary-shaped text detection via contour transformer","author":"Shao","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.106027_b52","doi-asserted-by":"crossref","DOI":"10.1007\/s11263-025-02428-0","article-title":"Swintextspotter v2: Towards better synergy for scene text spotting","author":"Huang","year":"2025","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.imavis.2026.106027_b53","doi-asserted-by":"crossref","unstructured":"N. Nayef, F. Yin, I. Bizid, H. Choi, Y. Feng, D. Karatzas, Z. Luo, U. Pal, C. Rigaud, J. Chazalon, et al., Icdar2017 robust reading challenge on multi-lingual scene text detection and script identification-rrc-mlt, in: Proceedings of the International Conference on Document Analysis and Recognition, 2017.","DOI":"10.1109\/ICDAR.2017.237"},{"key":"10.1016\/j.imavis.2026.106027_b54","doi-asserted-by":"crossref","unstructured":"T. Cao, J. Lyu, W. Zeng, W. Mu, Y. Zhou, The devil is in fine-tuning and long-tailed problems: a new benchmark for scene text detection, in: Proceedings of the International Joint Conference on Artificial Intelligence, 2025.","DOI":"10.24963\/ijcai.2025\/83"},{"key":"10.1016\/j.imavis.2026.106027_b55","series-title":"Paddleocr 3.0 technical report","author":"Cui","year":"2025"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626001344?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626001344?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,22]],"date-time":"2026-05-22T13:29:54Z","timestamp":1779456594000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885626001344"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":55,"alternative-id":["S0262885626001344"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.106027","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"A real-driven image synthesis framework for adapting pre-trained scene text detectors","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.106027","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"106027"}}