{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T08:06:34Z","timestamp":1761897994312,"version":"3.40.3"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031728471"},{"type":"electronic","value":"9783031728488"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72848-8_21","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:36:11Z","timestamp":1732800971000},"page":"357-373","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Leveraging Text Localization for\u00a0Scene Text Removal via\u00a0Text-Aware Masked Image Modeling"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0009-5033","authenticated-orcid":false,"given":"Zixiao","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6249-5315","authenticated-orcid":false,"given":"Hongtao","family":"Xie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0228-6220","authenticated-orcid":false,"given":"YuXin","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0265-5011","authenticated-orcid":false,"given":"Yadong","family":"Qu","sequence":"additional","affiliation":[]},{"given":"Fengjun","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Pengwei","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"21_CR1","doi-asserted-by":"crossref","unstructured":"Baek, Y., Lee, B., Han, D., Yun, S., Lee, H.: Character region awareness for text detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9365\u20139374 (2019)","DOI":"10.1109\/CVPR.2019.00959"},{"key":"21_CR2","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: BERT pre-training of image transformers. arXiv preprint arXiv:2106.08254 (2021)"},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Ch\u2019ng, C.K., Chan, C.S.: Total-text: a comprehensive dataset for scene text detection and recognition. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR), vol.\u00a01, pp. 935\u2013942. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.157"},{"key":"21_CR4","doi-asserted-by":"crossref","unstructured":"Chng, C.K., et\u00a0al.: ICDAR2019 robust reading challenge on arbitrary-shaped text-RRC-art. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1571\u20131576. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00252"},{"key":"21_CR5","doi-asserted-by":"crossref","unstructured":"Du, X., Zhou, Z., Zheng, Y., Ma, T., Wu, X., Jin, C.: Modeling stroke mask for end-to-end text erasing. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 6151\u20136159 (2023)","DOI":"10.1109\/WACV56688.2023.00609"},{"key":"21_CR6","unstructured":"Feng, H., Wang, W., Liu, S., Deng, J., Zhou, W., Li, H.: Deeperaser: deep iterative context mining for generic text eraser. arXiv preprint arXiv:2402.19108 (2024)"},{"key":"21_CR7","doi-asserted-by":"crossref","unstructured":"Ge, J., Xie, H., Min, S., Li, P., Zhang, Y.: Dual part discovery network for zero-shot learning. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 3244\u20133252 (2022)","DOI":"10.1145\/3503161.3547889"},{"key":"21_CR8","doi-asserted-by":"crossref","unstructured":"Ge, J., Xie, H., Min, S., Zhang, Y.: Semantic-guided reinforced region embedding for generalized zero-shot learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 1406\u20131414 (2021)","DOI":"10.1609\/aaai.v35i2.16230"},{"key":"21_CR9","doi-asserted-by":"crossref","unstructured":"Guo, X., Yang, H., Huang, D.: Image inpainting via conditional texture and structure dual generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 14134\u201314143 (2021)","DOI":"10.1109\/ICCV48922.2021.01387"},{"key":"21_CR10","doi-asserted-by":"crossref","unstructured":"Gupta, A., Vedaldi, A., Zisserman, A.: Synthetic data for text localisation in natural images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2315\u20132324 (2016)","DOI":"10.1109\/CVPR.2016.254"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"21_CR12","unstructured":"Hou, Y., Chen, J.J., Wang, Z.: Multi-branch network with ensemble learning for text removal in the wild. In: Proceedings of the Asian Conference on Computer Vision, pp. 1333\u20131349 (2022)"},{"key":"21_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, G., Wang, S., Ge, T., Jiang, Y., Wei, Y., Lian, D.: Self-supervised text erasing with controllable image synthesis. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 1973\u20131983 (2022)","DOI":"10.1145\/3503161.3547905"},{"key":"21_CR14","doi-asserted-by":"crossref","unstructured":"Jiang, L., Dai, B., Wu, W., Loy, C.C.: Focal frequency loss for image reconstruction and synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13919\u201313929 (2021)","DOI":"10.1109\/ICCV48922.2021.01366"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Wang, J., Peng, D., Liu, C., Jin, L.: Revisiting scene text recognition: a data perspective. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20543\u201320554 (2023)","DOI":"10.1109\/ICCV51070.2023.01878"},{"key":"21_CR16","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et\u00a0al.: ICDAR 2015 competition on robust reading. In: 2015 13th International Conference on Document Analysis and Recognition (ICDAR), pp. 1156\u20131160. IEEE (2015)","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"21_CR17","doi-asserted-by":"crossref","unstructured":"Khodadadi, M., Behrad, A.: Text localization, extraction and inpainting in color images. In: 20th Iranian Conference on Electrical Engineering (ICEE 2012), pp. 1035\u20131040. IEEE (2012)","DOI":"10.1109\/IranianCEE.2012.6292505"},{"key":"21_CR18","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"457","DOI":"10.1007\/978-3-031-19787-1_26","volume-title":"ECCV 2022","author":"H Lee","year":"2022","unstructured":"Lee, H., Choi, C.: The surprisingly straightforward scene text removal method with gated attention and region of interest generation: a comprehensive prominent model analysis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13676, pp. 457\u2013472. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19787-1_26"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Li, W., Lin, Z., Zhou, K., Qi, L., Wang, Y., Jia, J.: Mat: mask-aware transformer for large hole image inpainting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10758\u201310768 (2022)","DOI":"10.1109\/CVPR52688.2022.01049"},{"key":"21_CR20","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"409","DOI":"10.1007\/978-3-031-19815-1_24","volume-title":"ECCV 2022","author":"C Liu","year":"2022","unstructured":"Liu, C., et al.: Don\u2019t forget me: accurate background recovery for text removal via modeling local-global context. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13688, pp. 409\u2013426. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_24"},{"key":"21_CR21","doi-asserted-by":"publisher","first-page":"8760","DOI":"10.1109\/TIP.2020.3018859","volume":"29","author":"C Liu","year":"2020","unstructured":"Liu, C., Liu, Y., Jin, L., Zhang, S., Luo, C., Wang, Y.: Erasenet: end-to-end text removal in the wild. IEEE Trans. Image Process. 29, 8760\u20138775 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"21_CR22","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"21_CR23","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109531","volume":"140","author":"G Lyu","year":"2023","unstructured":"Lyu, G., Liu, K., Zhu, A., Uchida, S., Iwana, B.K.: Fetnet: feature erasing and transferring network for scene text removal. Pattern Recogn. 140, 109531 (2023)","journal-title":"Pattern Recogn."},{"key":"21_CR24","doi-asserted-by":"crossref","unstructured":"Lyu, G., Zhu, A.: Psstrnet: progressive segmentation-guided scene text removal network. In: 2022 IEEE International Conference on Multimedia and Expo (ICME), pp.\u00a01\u20136. IEEE (2022)","DOI":"10.1109\/ICME52920.2022.9859792"},{"key":"21_CR25","doi-asserted-by":"crossref","unstructured":"Nayef, N., et\u00a0al.: ICDAR2019 robust reading challenge on multi-lingual scene text detection and recognition\u2014RRC-MLT-2019. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1582\u20131587. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00254"},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Pathak, D., Krahenbuhl, P., Donahue, J., Darrell, T., Efros, A.A.: Context encoders: feature learning by inpainting. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2536\u20132544 (2016)","DOI":"10.1109\/CVPR.2016.278"},{"key":"21_CR27","doi-asserted-by":"crossref","unstructured":"Peng, D., Liu, C., Liu, Y., Jin, L.: Viteraser: harnessing the power of vision transformers for scene text removal with segmim pretraining. arXiv preprint arXiv:2306.12106 (2023)","DOI":"10.1609\/aaai.v38i5.28245"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Singh, A., Pang, G., Toh, M., Huang, J., Galuba, W., Hassner, T.: Textocr: towards large-scale end-to-end reasoning for arbitrary-shaped scene text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8802\u20138812 (2021)","DOI":"10.1109\/CVPR46437.2021.00869"},{"key":"21_CR29","doi-asserted-by":"crossref","unstructured":"Sun, Y., et\u00a0al.: ICDAR 2019 competition on large-scale street view text with partial labeling-RRC-LSVT. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1557\u20131562. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00250"},{"key":"21_CR30","doi-asserted-by":"crossref","unstructured":"Suvorov, R., et al.: Resolution-robust large mask inpainting with Fourier convolutions. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2149\u20132159 (2022)","DOI":"10.1109\/WACV51458.2022.00323"},{"key":"21_CR31","doi-asserted-by":"publisher","first-page":"9306","DOI":"10.1109\/TIP.2021.3125260","volume":"30","author":"Z Tang","year":"2021","unstructured":"Tang, Z., Miyazaki, T., Sugaya, Y., Omachi, S.: Stroke-based scene text erasing using synthetic data for training. IEEE Trans. Image Process. 30, 9306\u20139320 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"21_CR32","unstructured":"Tian, K., Jiang, Y., Diao, Q., Lin, C., Wang, L., Yuan, Z.: Designing BERT for convolutional networks: sparse and hierarchical masked modeling. arXiv preprint arXiv:2301.03580 (2023)"},{"key":"21_CR33","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2020.103066","volume":"201","author":"O Tursun","year":"2020","unstructured":"Tursun, O., Denman, S., Zeng, R., Sivapalan, S., Sridharan, S., Fookes, C.: MTRNet++: one-stage mask-based scene text eraser. Comput. Vis. Image Underst. 201, 103066 (2020)","journal-title":"Comput. Vis. Image Underst."},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Tursun, O., Zeng, R., Denman, S., Sivapalan, S., Sridharan, S., Fookes, C.: MTRNet: a generic scene text eraser. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 39\u201344. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00016"},{"key":"21_CR35","unstructured":"Veit, A., Matera, T., Neumann, L., Matas, J., Belongie, S.: Coco-text: dataset and benchmark for text detection and recognition in natural images. arXiv preprint arXiv:1601.07140 (2016)"},{"key":"21_CR36","doi-asserted-by":"crossref","unstructured":"Wagh, P.D., Patil, D.: Text detection and removal from image using inpainting with smoothing. In: 2015 International Conference on Pervasive Computing (ICPC), pp.\u00a01\u20134. IEEE (2015)","DOI":"10.1109\/PERVASIVE.2015.7087154"},{"key":"21_CR37","doi-asserted-by":"crossref","unstructured":"Wang, K., et al.: Masked text modeling: a self-supervised pre-training method for scene text detection. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 2006\u20132015 (2023)","DOI":"10.1145\/3581783.3612370"},{"key":"21_CR38","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xie, H., Wang, Z., Qu, Y., Zhang, Y.: What is the real need for scene text removal? Exploring the background integrity and erasure exhaustivity properties. IEEE Trans. Image Process. (2023)","DOI":"10.1109\/TIP.2023.3290517"},{"key":"21_CR39","doi-asserted-by":"crossref","unstructured":"Wang, Z., Cun, X., Bao, J., Zhou, W., Liu, J., Li, H.: UFormer: a general u-shaped transformer for image restoration. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 17683\u201317693 (2022)","DOI":"10.1109\/CVPR52688.2022.01716"},{"issue":"4","key":"21_CR40","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A.C., Sheikh, H.R., Simoncelli, E.P.: Image quality assessment: from error visibility to structural similarity. IEEE Trans. Image Process. 13(4), 600\u2013612 (2004)","journal-title":"IEEE Trans. Image Process."},{"key":"21_CR41","doi-asserted-by":"crossref","unstructured":"Xie, Z., et al.: SimMIM: a simple framework for masked image modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9653\u20139663 (2022)","DOI":"10.1109\/CVPR52688.2022.00943"},{"issue":"9","key":"21_CR42","doi-asserted-by":"publisher","first-page":"8934","DOI":"10.1109\/TKDE.2022.3220219","volume":"35","author":"X Yang","year":"2022","unstructured":"Yang, X., Song, Z., King, I., Xu, Z.: A survey on deep semi-supervised learning. IEEE Trans. Knowl. Data Eng. 35(9), 8934\u20138954 (2022)","journal-title":"IEEE Trans. Knowl. Data Eng."},{"key":"21_CR43","doi-asserted-by":"crossref","unstructured":"Zdenek, J., Nakayama, H.: Erasing scene text with weak supervision. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2238\u20132246 (2020)","DOI":"10.1109\/WACV45572.2020.9093544"},{"key":"21_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, R., et\u00a0al.: ICDAR 2019 robust reading challenge on reading Chinese text on signboard. In: 2019 International Conference on Document Analysis and Recognition (ICDAR), pp. 1577\u20131581. IEEE (2019)","DOI":"10.1109\/ICDAR.2019.00253"},{"key":"21_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, S., Liu, Y., Jin, L., Huang, Y., Lai, S.: ENSNet: ensconce text in the wild. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 801\u2013808 (2019)","DOI":"10.1609\/aaai.v33i01.3301801"},{"key":"21_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, W., et al.: Context-aware image inpainting with learned semantic priors. arXiv preprint arXiv:2106.07220 (2021)","DOI":"10.24963\/ijcai.2021\/183"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72848-8_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:10:58Z","timestamp":1732803058000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72848-8_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031728471","9783031728488"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72848-8_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}