{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:53:04Z","timestamp":1763196784032,"version":"3.45.0"},"publisher-location":"Singapore","reference-count":24,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533480","type":"print"},{"value":"9789819533497","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3349-7_19","type":"book-chapter","created":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:49:46Z","timestamp":1763196586000},"page":"241-254","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Fine-Grained Contrastive Learning for\u00a0End-to-End Vietnamese Text Image Machine Translation"],"prefix":"10.1007","author":[{"given":"Cunli","family":"Mao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haoxiang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shengxiang","family":"Gao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengtao","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,11,16]]},"reference":[{"key":"19_CR1","doi-asserted-by":"crossref","unstructured":"Aberdam, A., et al.: Sequence-to-sequence contrastive learning for text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15302\u201315312 (2021)","DOI":"10.1109\/CVPR46437.2021.01505"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Baek, J., et al.: What is wrong with scene text recognition model comparisons? dataset and model analysis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4715\u20134723 (2019)","DOI":"10.1109\/ICCV.2019.00481"},{"key":"19_CR3","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: International Conference on Machine Learning, pp. 1597\u20131607. PMLR (2020)"},{"key":"19_CR4","doi-asserted-by":"crossref","unstructured":"Chen, Z., Yin, F., Zhang, X.Y., Yang, Q., Liu, C.L.: Cross-lingual text image recognition via multi-task sequence to sequence learning. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 3122\u20133129. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9412281"},{"issue":"5","key":"19_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3406095","volume":"53","author":"R Dabre","year":"2020","unstructured":"Dabre, R., Chu, C., Kunchukuttan, A.: A survey of multilingual neural machine translation. ACM Comput. Surv. (CSUR) 53(5), 1\u201338 (2020)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Doan, L., Nguyen, L.T., Tran, N.L., Hoang, T., Nguyen, D.Q.: PhoMT: a high-quality and large-scale benchmark dataset for Vietnamese-English machine translation. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 4495\u20134503 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.369"},{"key":"19_CR7","unstructured":"Dong, X., et\u00a0al.: Internlm-xcomposer2: mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420 (2024)"},{"key":"19_CR8","unstructured":"Khosla, P., et al.: Supervised contrastive learning. In: Advances in Neural Information Processing Systems, vol. 33, pp. 18661\u201318673 (2020)"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Luong, M.T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation. In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, pp. 1412\u20131421 (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Ma, C., et al.: Modal contrastive learning based end-to-end text image machine translation. IEEE\/ACM Trans. Audio, Speech Lang. Process. 32, 2153\u20132165 (2023)","DOI":"10.1109\/TASLP.2023.3324540"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Ma, C., et al.: Improving end-to-end text image translation from the auxiliary text translation task. In: 2022 26th International Conference on Pattern Recognition (ICPR), pp. 1664\u20131670. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9956695"},{"key":"19_CR12","doi-asserted-by":"crossref","unstructured":"Ma, C., Zhang, Y., Tu, M., Zhao, Y., Zhou, Y., Zong, C.: CCIM: cross-modal cross-lingual interactive image translation. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 4959\u20134965 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.330"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Ma, C., Zhang, Y., Tu, M., Zhao, Y., Zhou, Y., Zong, C.: E2timt: efficient and effective modal adapter for text image machine translation. In: International Conference on Document Analysis and Recognition, pp. 70\u201388. Springer (2023)","DOI":"10.1007\/978-3-031-41731-3_5"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Ma, C., Zhang, Y., Tu, M., Zhao, Y., Zhou, Y., Zong, C.: Multi-teacher knowledge distillation for end-to-end text image machine translation. In: International Conference on Document Analysis and Recognition, pp. 484\u2013501. Springer (2023)","DOI":"10.1007\/978-3-031-41676-7_28"},{"key":"19_CR15","unstructured":"Ma, C., et al.: Born a Babynet with hierarchical parental supervision for end-to-end text image machine translation. In: Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pp. 2468\u20132479 (2024)"},{"key":"19_CR16","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Shi, B., Wang, X., Lyu, P., Yao, C., Bai, X.: Robust scene text recognition with automatic rectification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4168\u20134176 (2016)","DOI":"10.1109\/CVPR.2016.452"},{"key":"19_CR18","unstructured":"Wang, P., et\u00a0al.: Qwen2-vl: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"19_CR19","unstructured":"Wu, Z., et\u00a0al.: Deepseek-vl2: mixture-of-experts vision-language models for advanced multimodal understanding. arXiv preprint arXiv:2412.10302 (2024)"},{"key":"19_CR20","unstructured":"Xiao, T., Zhu, J., Zhang, H., Li, Q.: Niutrans: an open source toolkit for phrase-based and syntax-based machine translation. In: Proceedings of the ACL 2012 System Demonstrations, pp. 19\u201324 (2012)"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Yao, S., Wan, X.: Multimodal transformer for multimodal machine translation. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 4346\u20134350 (2020)","DOI":"10.18653\/v1\/2020.acl-main.400"},{"key":"19_CR22","unstructured":"Zeng, A., et\u00a0al.: Chatglm: g family of large language models from GLM-130b to GLM-4 all tools. CoRR (2024)"},{"key":"19_CR23","doi-asserted-by":"publisher","first-page":"3922","DOI":"10.1109\/TIP.2021.3066903","volume":"30","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., Nie, S., Liang, S., Liu, W.: Robust text image recognition via adversarial sequence-to-sequence domain adaptation. IEEE Trans. Image Process. 30, 3922\u20133933 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"19_CR24","doi-asserted-by":"crossref","unstructured":"Zhu, S., Li, S., Lei, Y., Xiong, D.: Peit: bridging the modality gap with pre-trained models for end-to-end image translation. In: Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 13433\u201313447 (2023)","DOI":"10.18653\/v1\/2023.acl-long.751"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3349-7_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T08:49:51Z","timestamp":1763196591000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3349-7_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,16]]},"ISBN":["9789819533480","9789819533497"],"references-count":24,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3349-7_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,16]]},"assertion":[{"value":"16 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}