{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T10:15:16Z","timestamp":1743070516950,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031723469"},{"type":"electronic","value":"9783031723476"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72347-6_5","type":"book-chapter","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T13:02:55Z","timestamp":1726491775000},"page":"66-80","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,
"title":["Exploring Interpretable Semantic Alignment for\u00a0Multimodal Machine Translation"],"prefix":"10.1007","author":[{"given":"Guojing","family":"Liu","sequence":"first","affiliation":[]},{"given":"Xiangqian","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Nanzhe","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Huili","family":"Gong","sequence":"additional","affiliation":[]},{"given":"Zhenyu","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Xiangyu","family":"Qu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,17]]},"reference":[{"key":"5_CR1","doi-asserted-by":"crossref","unstructured":"Barrault, L., Bougares, F., Specia, L., Lala, C., Elliott, D., Frank, S.: Findings of the third shared task on multimodal machine translation. In: Third Conference on Machine Translation (WMT 2018), vol.\u00a02, pp. 308\u2013327 (2018)","DOI":"10.18653\/v1\/W18-6402"},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Caglayan, O., et al.: Cross-lingual visual pre-training for multimodal machine translation. arXiv preprint arXiv:2101.10044 (2021)","DOI":"10.18653\/v1\/2021.eacl-main.112"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Caglayan, O., Madhyastha, P., Specia, L., Barrault, L.: Probing the need for visual context in multimodal machine translation. arXiv preprint arXiv:1903.08678 (2019)","DOI":"10.18653\/v1\/N19-1422"},{"key":"5_CR4","doi-asserted-by":"crossref","unstructured":"Calixto, I., Liu, Q., Campbell, N.: Doubly-attentive decoder for multi-modal neural machine translation. arXiv preprint arXiv:1702.01287 (2017)","DOI":"10.18653\/v1\/P17-1175"},
{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Calixto, I., Liu, Q., Campbell, N.: Incorporating global visual features into attention-based neural machine translation. arXiv preprint arXiv:1701.06521 (2017)","DOI":"10.18653\/v1\/D17-1105"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Calixto, I., Rios, M., Aziz, W.: Latent variable model for multi-modal translation. arXiv preprint arXiv:1811.00357 (2018)","DOI":"10.18653\/v1\/P19-1642"},{"key":"5_CR7","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Elliott, D.: Adversarial evaluation of multimodal machine translation. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 2974\u20132978 (2018)","DOI":"10.18653\/v1\/D18-1329"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Barrault, L., Bougares, F., Specia, L.: Findings of the second shared task on multimodal machine translation and multilingual image description. arXiv preprint arXiv:1710.07177 (2017)","DOI":"10.18653\/v1\/W17-4718"},{"key":"5_CR10","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Sima\u2019an, K., Specia, L.: Multi30K: multilingual English-German image descriptions. arXiv preprint arXiv:1605.00459 (2016)","DOI":"10.18653\/v1\/W16-3210"},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Gr\u00f6nroos, S.A., et\u00a0al.: The MeMAD submission to the WMT18 multimodal translation task. arXiv preprint arXiv:1808.10802 (2018)","DOI":"10.18653\/v1\/W18-6439"},{"key":"5_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},
{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Huang, P.Y., Liu, F., Shiang, S.R., Oh, J., Dyer, C.: Attention-based multimodal neural machine translation. In: Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers, pp. 639\u2013645 (2016)","DOI":"10.18653\/v1\/W16-2360"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Ive, J., Madhyastha, P., Specia, L.: Distilling translations with visual awareness. arXiv preprint arXiv:1906.07701 (2019)","DOI":"10.18653\/v1\/P19-1653"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Ji, B., Zhang, T., Zou, Y., Hu, B., Shen, S.: Increasing visual awareness in multimodal neural machine translation from an information theoretic perspective. arXiv preprint arXiv:2210.08478 (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.453"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Lala, C., Madhyastha, P.S., Scarton, C., Specia, L.: Sheffield submissions for WMT18 multimodal translation shared task. In: Proceedings of the Third Conference on Machine Translation: Shared Task Papers, pp. 624\u2013631 (2018)","DOI":"10.18653\/v1\/W18-6442"},{"key":"5_CR17","unstructured":"Li, B., et al.: On vision features in multimodal machine translation. arXiv preprint arXiv:2203.09173 (2022)"},{"key":"5_CR18","doi-asserted-by":"crossref","unstructured":"Lin, H., et al.: Dynamic context-guided capsule network for multimodal machine translation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1320\u20131329 (2020)","DOI":"10.1145\/3394171.3413715"},{"key":"5_CR19","doi-asserted-by":"crossref","unstructured":"Liu, Z., et\u00a0al.: Swin transformer V2: scaling up capacity and resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12009\u201312019 (2022)","DOI":"10.1109\/CVPR52688.2022.01170"},
{"key":"5_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"5_CR21","unstructured":"Minderer, M., et\u00a0al.: Simple open-vocabulary object detection with vision transformers. arxiv 2022. arXiv preprint arXiv:2205.06230 (2022)"},{"key":"5_CR22","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Song, Y., Chen, S., Jin, Q., Luo, W., Xie, J., Huang, F.: Product-oriented machine translation with cross-modal cross-lingual pre-training. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 2843\u20132852 (2021)","DOI":"10.1145\/3474085.3475303"},{"key":"5_CR24","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"5_CR25","doi-asserted-by":"crossref","unstructured":"Wu, Z., Kong, L., Bi, W., Li, X., Kao, B.: Good for misconceived reasons: an empirical revisiting on the need for visual context in multimodal machine translation. arXiv preprint arXiv:2105.14462 (2021)","DOI":"10.18653\/v1\/2021.acl-long.480"},{"key":"5_CR26","doi-asserted-by":"crossref","unstructured":"Xu, X., Wu, C., Rosenman, S., Lal, V., Che, W., Duan, N.: BridgeTower: building bridges between encoders in vision-language representation learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 10637\u201310647 (2023)","DOI":"10.1609\/aaai.v37i9.26263"},
{"key":"5_CR27","doi-asserted-by":"crossref","unstructured":"Yang, B., Tu, Z., Wong, D.F., Meng, F., Chao, L.S., Zhang, T.: Modeling localness for self-attention networks. arXiv preprint arXiv:1810.10182 (2018)","DOI":"10.18653\/v1\/D18-1475"},{"key":"5_CR28","unstructured":"Ye, J., Guo, J., Tan, K., Xiang, Y., Yu, Z.: Based on semantic guidance of fine-grained alignment of image-text for multi-modal neural machine translation. In: Proceedings of the 21st Chinese National Conference on Computational Linguistics, pp. 281\u2013292 (2022)"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Yin, Y., et al.: A novel graph-based multi-modal fusion encoder for neural machine translation. arXiv preprint arXiv:2007.08742 (2020)","DOI":"10.18653\/v1\/2020.acl-main.273"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Zhou, M., Cheng, R., Lee, Y.J., Yu, Z.: A visual attention grounding neural model for multimodal machine translation. arXiv preprint arXiv:1808.08266 (2018)","DOI":"10.18653\/v1\/D18-1400"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72347-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T13:15:26Z","timestamp":1726492526000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72347-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031723469","9783031723476"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72347-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},
"assertion":[{"value":"17 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lugano","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"33","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}