{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T01:24:54Z","timestamp":1742952294164,"version":"3.40.3"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031727504"},{"type":"electronic","value":"9783031727511"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72751-1_17","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T09:52:13Z","timestamp":1729849933000},"page":"289-306","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["WeCromCL: Weakly Supervised Cross-Modality Contrastive Learning for\u00a0Transcription-Only Supervised Text Spotting"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1422-8984","authenticated-orcid":false,"given":"Jingjing","family":"Wu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0333-6210","authenticated-orcid":false,"given":"Zhengyao","family":"Fang","sequence":"additional","affiliation":[]},{"given":"Pengyuan","family":"Lyu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8254-5773","authenticated-orcid":false,"given":"Chengquan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Fanglin","family":"Chen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1578-2634","authenticated-orcid":false,"given":"Guangming","family":"Lu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8117-2696","authenticated-orcid":false,"given":"Wenjie","family":"Pei","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"17_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Ch\u2019ng, C.K., Chan, C.S.: Total-text: a comprehensive dataset for scene text detection and recognition. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR). vol.\u00a01. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.157"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Duan, J., et al.: Multi-modal alignment using representation codebook. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01520"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Fang, S., Mao, Z., Xie, H., Wang, Y., Yan, C., Zhang, Y.: ABINet++: autonomous, bidirectional and iterative language modeling for scene text spotting. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 7123\u20137141 (2022)","DOI":"10.1109\/TPAMI.2022.3223908"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Feng, W., He, W., Yin, F., Zhang, X.Y., Liu, C.L.: TextDragon: an end-to-end framework for arbitrary shaped text spotting. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00917"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Huang, M., et al.: EstextSpotter: towards better scene text spotting with explicit synergy in transformer. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01786"},{"key":"17_CR8","unstructured":"Huo, Y et\u00a0al.: WenLan: bridging vision and language by large-scale multi-modal pre-training. arXiv preprint arXiv:2103.06561 (2021)"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Jaderberg, M., Simonyan, K., Vedaldi, A., Zisserman, A.: Reading text in the wild with convolutional neural networks. Int. J. Comput. Vision 116(1), 1\u201320 (2016)","DOI":"10.1007\/s11263-015-0823-z"},{"key":"17_CR10","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: ICML. PMLR (2021)"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et\u00a0al.: ICDAR 2015 competition on robust reading. In: 2015 13th international conference on document analysis and recognition (ICDAR). IEEE (2015)","DOI":"10.1109\/ICDAR.2015.7333942"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Karatzas, D., et al.: ICDAR 2013 robust reading competition. In: 2013 12th International Conference on Document Analysis and Recognition. IEEE (2013)","DOI":"10.1109\/ICDAR.2013.221"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Kittenplon, Y., Lavi, I., Fogel, S., Bar, Y., Manmatha, R., Perona, P.: Towards weakly-supervised text spotting using a multi-task transformer. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00456"},{"key":"17_CR14","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before Fuse: vision and language representation learning with momentum distillation. NeurIPS 34, 9694\u20139705 (2021)"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Li, W., et al.: UNIMO: towards unified-modal understanding and generation via cross-modal contrastive learning. arXiv preprint arXiv:2012.15409 (2020)","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Liao, M., Lyu, P., He, M., Yao, C., Wu, W., Bai, X.: Mask TextSpotter: an end-to-end trainable neural network for spotting text with arbitrary shapes. IEEE Trans. Pattern Anal. and Mach. Intell. 43(2), 532\u2013548 (2019)","DOI":"10.1109\/TPAMI.2019.2937086"},{"key":"17_CR17","doi-asserted-by":"publisher","first-page":"706","DOI":"10.1007\/978-3-030-58621-8_41","volume-title":"Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI","author":"M Liao","year":"2020","unstructured":"Liao, M., Pang, G., Huang, J., Hassner, T., Bai, X.: Mask TextSpotter V3: segmentation Proposal Network for Robust Scene Text Spotting. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) Computer Vision \u2013 ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XI, pp. 706\u2013722. Springer International Publishing, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_41"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Liao, M., Shi, B., Bai, X.: TextBoxes++: a single-shot oriented scene text detector. IEEE Trans. Image Process. 27(8), 3676\u20133690 (2018)","DOI":"10.1109\/TIP.2018.2825107"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Liao, M., Shi, B., Bai, X., Wang, X., Liu, W.: TextBoxes: a fast text detector with a single deep neural network. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, H., Shen, C., He, T., Jin, L., Wang, L.: ABCNet: real-time scene text spotting with adaptive Bezier-curve network. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00983"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Y., Jin, L., Zhang, S., Luo, C., Zhang, S.: Curved scene text detection via transverse and longitudinal sequence connection. Pattern Recogn. 90, 337\u2013345 (2019)","DOI":"10.1016\/j.patcog.2019.02.002"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: ABCNet v2: adaptive Bezier-curve network for real-time end-to-end text spotting. IEEE Trans. Pattern Anal. Mach. Intell. 44(11), 8048\u20138064 (2021)","DOI":"10.1109\/TPAMI.2021.3107437"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., et\u00a0al.: SPTS v2: single-point scene text spotting. arXiv preprint arXiv:2301.01635 (2023)","DOI":"10.1109\/TPAMI.2023.3312285"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Lyu, P., Liao, M., Yao, C., Wu, W., Bai, X.: Mask TextSpotter: an end-to-end trainable neural network for spotting text with arbitrary shapes. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01264-9_5"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Nayef, N., et\u00a0al.: ICDAR2017 robust reading challenge on multi-lingual scene text detection and script identification-RRC-MLT. In: 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR). vol.\u00a01. IEEE (2017)","DOI":"10.1109\/ICDAR.2017.237"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Peng, D., et\u00a0al.: SPTS: single-point text spotting. In: ACM MM (2022)","DOI":"10.1145\/3503161.3547942"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Qiao, L., et al.: MANGO: a mask attention guided one-stage scene text spotter. In: AAAI. vol.\u00a035 (2021)","DOI":"10.1609\/aaai.v35i3.16348"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Qiao, L., et al.: Text Perceptron: towards end-to-end arbitrary-shaped text spotting. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a034 (2020)","DOI":"10.1609\/aaai.v34i07.6864"},{"key":"17_CR29","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML. PMLR (2021)"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Song, S., et al.: Vision-language pre-training for boosting scene text detectors. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01523"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Tan, M., Pang, R., Le, Q.V.: EfficientDet: scalable and efficient object detection. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Tang, J., Qiao, S., Cui, B., Ma, Y., Zhang, S., Kanoulas, D.: You can even annotate text with voice: transcription-only-supervised text spotting. In: ACM MM (2022)","DOI":"10.1145\/3503161.3547787"},{"key":"17_CR33","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: All You Need Is Boundary: toward arbitrary-shaped text spotting. In: AAAI. vol.\u00a034 (2020)","DOI":"10.1609\/aaai.v34i07.6896"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: PAN++: towards efficient and accurate end-to-end spotting of arbitrarily-shaped text. IEEE Trans. Pattern Anal. Mach. Intell. 44(9), 5349\u20135367 (2021)","DOI":"10.1109\/TPAMI.2021.3077555"},{"key":"17_CR35","unstructured":"Wu, J., Lyu, P., Lu, G., Zhang, C., Pei, W.: Single shot self-reliant scene text spotter by decoupled yet collaborative detection and recognition (2023)"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Wu, J., Lyu, P., Lu, G., Zhang, C., Yao, K., Pei, W.: Decoupling recognition from detection: single shot self-reliant scene text spotter. In: ACM MM (2022)","DOI":"10.1145\/3503161.3548266"},{"key":"17_CR37","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1007\/978-3-031-19815-1_17","volume-title":"Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVIII","author":"C Xue","year":"2022","unstructured":"Xue, C., Zhang, W., Hao, Yu., Lu, S., Torr, P.H.S., Bai, S.: Language matters: a weakly supervised vision-language pre-training approach for scene text detection and spotting. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVIII, pp. 284\u2013302. Springer Nature Switzerland, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_17"},{"key":"17_CR38","doi-asserted-by":"publisher","first-page":"284","DOI":"10.1007\/978-3-031-19815-1_17","volume-title":"Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVIII","author":"C Xue","year":"2022","unstructured":"Xue, C., Zhang, W., Hao, Yu., Lu, S., Torr, P.H.S., Bai, S.: Language matters: a weakly supervised vision-language pre-training approach for scene text detection and spotting. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision \u2013 ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXVIII, pp. 284\u2013302. Springer Nature Switzerland, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19815-1_17"},{"key":"17_CR39","doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Vision-language pre-training with triple contrastive learning. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Ye, M., et al.: DeepSolo: let transformer decoder with explicit points solo for text spotting. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01854"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"Yu, W., Liu, Y., Hua, W., Jiang, D., Ren, B., Bai, X.: Turning a CLIP model into a scene text detector. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00674"},{"key":"17_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, X., Su, Y., Tripathi, S., Tu, Z.: Text spotting transformers. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00930"},{"key":"17_CR43","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. In: ICLR"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72751-1_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T10:19:02Z","timestamp":1729851542000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72751-1_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031727504","9783031727511"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72751-1_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}