{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T14:53:21Z","timestamp":1743087201412,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":32,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819601189"},{"type":"electronic","value":"9789819601196"}],"license":[{"start":{"date-parts":[[2024,11,12]],"date-time":"2024-11-12T00:00:00Z","timestamp":1731369600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,12]],"date-time":"2024-11-12T00:00:00Z","timestamp":1731369600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-0119-6_32","type":"book-chapter","created":{"date-parts":[[2024,11,16]],"date-time":"2024-11-16T18:18:45Z","timestamp":1731781125000},"page":"334-346","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["FINE-LMT: Fine-Grained Feature Learning for\u00a0Multi-modal Machine Translation"],"prefix":"10.1007","author":[{"given":"Yusong","family":"Wang","sequence":"first","affiliation":[]},{"given":"Ying","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Dongyuan","family":"Li","sequence":"additional","affiliation":[]},{"given":"Jialun","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Yicheng","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Mingkun","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Kotaro","family":"Funakoshi","sequence":"additional","affiliation":[]},{"given":"Manabu","family":"Okumura","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,12]]},"reference":[{"key":"32_CR1","doi-asserted-by":"crossref","unstructured":"Calixto, I., Liu, Q., Campbell, N.: Doubly-attentive decoder for multi-modal neural machine translation. In: Proceedings of ACL, pp. 1913\u20131924. Vancouver, Canada (2017)","DOI":"10.18653\/v1\/P17-1175"},{"key":"32_CR2","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Proceedings ECCV, pp. 213\u2013229. Cham (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"32_CR3","unstructured":"Chen, T., Kornblith, S., Norouzi, M., Hinton, G.: A simple framework for contrastive learning of visual representations. In: Proceedings of ICML. vol.\u00a0119, pp. 1597\u20131607 (2020)"},{"key":"32_CR4","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Barrault, L., Bougares, F., Specia, L.: Findings of the second shared task on multimodal machine translation and multilingual image description. In: Proceedings of MT, pp. 215\u2013233. Copenhagen, Denmark (2017)","DOI":"10.18653\/v1\/W17-4718"},{"key":"32_CR5","doi-asserted-by":"crossref","unstructured":"Elliott, D., Frank, S., Sima\u2019an, K., Specia, L.: Multi30K: multilingual English-German image descriptions. In: Proceedings of WVL, pp. 70\u201374. 
Berlin, Germany (2016)","DOI":"10.18653\/v1\/W16-3210"},{"key":"32_CR6","unstructured":"Elliott, D., K\u00e1d\u00e1r, \u00c1.: Imagination improves multimodal translation. In: Proceedings of IJCN 2017, pp. 130\u2013141. Taipei, Taiwan (2017)"},{"key":"32_CR7","doi-asserted-by":"crossref","unstructured":"Fang, Q., Feng, Y.: Neural machine translation with phrase-level universal visual representations. In: Proceedings of ACL, pp. 5687\u20135698. Dublin, Ireland (2022)","DOI":"10.18653\/v1\/2022.acl-long.390"},{"key":"32_CR8","doi-asserted-by":"crossref","unstructured":"Futeral, M., Schmid, C., Laptev, I., Sagot, B., Bawden, R.: Tackling ambiguity with images: improved multimodal machine translation and contrastive evaluation. In: Proceedings of ACL 2023, pp. 5394\u20135413. Toronto, Canada (2023)","DOI":"10.18653\/v1\/2023.acl-long.295"},{"key":"32_CR9","doi-asserted-by":"crossref","unstructured":"Gehring, J., Auli, M., Grangier, D., Dauphin, Y.: A convolutional encoder model for neural machine translation. In: Proceedings of ACL, pp. 123\u2013135. Vancouver, Canada (2017)","DOI":"10.18653\/v1\/P17-1012"},{"key":"32_CR10","doi-asserted-by":"crossref","unstructured":"Hazarika, D., Zimmermann, R., Poria, S.: Misa: Modality-invariant and -specific representations for multimodal sentiment analysis. In: Proceedings of MM, pp. 1122-1131. Seattle, Washington (2020)","DOI":"10.1145\/3394171.3413678"},{"key":"32_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of CVPR, pp. 770\u2013778. Las Vegas, Nevada (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"32_CR12","doi-asserted-by":"crossref","unstructured":"Lai, Z., Wu, J., Chen, S., Zhou, Y., Hovakimyan, N.: Residual-based language models are free boosters for biomedical imaging tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5086\u20135096 (2024)","DOI":"10.1109\/CVPRW63382.2024.00515"},{"key":"32_CR13","unstructured":"Li, B., et al.: On vision features in multimodal machine translation. In: Proceedings of ACL 2022, pp. 6327\u20136337. Dublin, Ireland (2022)"},{"key":"32_CR14","doi-asserted-by":"crossref","unstructured":"Li, D., Wang, Y., Funakoshi, K., Okumura, M.: Joyful: joint modality fusion and graph contrastive learning for multimoda emotion recognition. In: Proceedings of EMNLP, pp. 16051\u201316069. Singapore (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.996"},{"key":"32_CR15","unstructured":"Li, D., You, J., Funakoshi, K., Okumura, M.: A-TIP: attribute-aware text infilling via pre-trained language model. In: Proceedings of COLING, pp. 5857\u20135869. Gyeongju, Republic of Korea (2022)"},{"key":"32_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Multilingual denoising pre-training for neural machine translation. Trans. Assoc. Comput. Linguist. 8, 726\u2013742 (2020)","DOI":"10.1162\/tacl_a_00343"},{"key":"32_CR17","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"32_CR18","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of NeurIPS, pp. 5998\u20136008. Long Beach, California (2017)"},{"key":"32_CR19","doi-asserted-by":"crossref","unstructured":"Wang, Y., Li, D., Funakoshi, K., Okumura, M.: Emp: emotion-guided multi-modal fusion and contrastive learning for personality traits recognition. 
In: Proceedings of ICMR, pp. 243\u2013252. ICMR \u201923, New York, NY, USA (2023)","DOI":"10.1145\/3591106.3592243"},{"key":"32_CR20","doi-asserted-by":"crossref","unstructured":"Wang, Z., Li, D., Li, G., Zhang, Z., Jiang, R.: Multimodal low-light image enhancement with depth information. In: Proceedings of MM (2024)","DOI":"10.1145\/3664647.3680741"},{"key":"32_CR21","doi-asserted-by":"crossref","unstructured":"Wang, Z., Li, D., Okumura, M.: Multimodal graph-based audio-visual event localization. In: Proceedings of ICASSP, pp. 7880\u20137884 (2024)","DOI":"10.1109\/ICASSP48485.2024.10448223"},{"key":"32_CR22","doi-asserted-by":"crossref","unstructured":"Wu, Z., Kong, L., Bi, W., Li, X., Kao, B.: Good for misconceived reasons: an empirical revisiting on the need for visual context in multimodal machine translation. In: Proceedings of ACL-IJCNLP, pp. 6153\u20136166. Online (2021)","DOI":"10.18653\/v1\/2021.acl-long.480"},{"key":"32_CR23","doi-asserted-by":"crossref","unstructured":"Yang, D., Huang, S., Kuang, H., Du, Y., Zhang, L.: Disentangled representation learning for multimodal emotion recognition. In: Proceedings of MM, pp. 1642\u20131651. Lisboa, Portugal (2022)","DOI":"10.1145\/3503161.3547754"},{"key":"32_CR24","unstructured":"Ye, J., Guo, J., Xiang, Y., Tan, K., Yu, Z.: Noise-robust cross-modal interactive learning with Text2Image mask for multi-modal neural machine translation. In: Proceedings of COLING, pp. 5098\u20135108. Gyeongju, Republic of Korea (2022)"},{"key":"32_CR25","doi-asserted-by":"crossref","unstructured":"Yin, Y., Meng, F., Su, J., Zhou, C., Yang, Z., Zhou, J., Luo, J.: A novel graph-based multi-modal fusion encoder for neural machine translation. In: Proceedings of ACL, pp. 3025\u20133035. Online (2020)","DOI":"10.18653\/v1\/2020.acl-main.273"},{"key":"32_CR26","doi-asserted-by":"crossref","unstructured":"You, J., Li, D., Kamigaito, H., Funakoshi, K., Okumura, M.: Joint learning-based heterogeneous graph attention network for timeline summarization. In: Proceedings of NAACL, pp. 4091\u20134104. Seattle, United States (2022)","DOI":"10.18653\/v1\/2022.naacl-main.301"},{"key":"32_CR27","doi-asserted-by":"crossref","unstructured":"Zhang, M., Mosbach, M., Adelani, D., Hedderich, M., Klakow, D.: MCSE: multimodal contrastive learning of sentence embeddings. In: Proceedings of NAACL, pp. 5959\u20135969. Seattle, Washington (2022)","DOI":"10.18653\/v1\/2022.naacl-main.436"},{"key":"32_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Kamigaito, H., Okumura, M.: Bidirectional transformer reranker for grammatical error correction. In: Proceedings of ACL, pp. 3801\u20133825. Toronto, Canada (2023)","DOI":"10.18653\/v1\/2023.findings-acl.234"},{"key":"32_CR29","unstructured":"Zhang, Z., et al.: Neural machine translation with universal visual representation. In: Proceedings of ICLR, pp. 1\u201314. Addis Ababa, Ethiopia (2020)"},{"key":"32_CR30","unstructured":"Zhao, Y., Komachi, M., Kajiwara, T., Chu, C.: Double attention-based multimodal neural machine translation with semantic image regions. In: Proceedings of EAMT, pp. 105\u2013114. Lisboa, Portugal (2020)"},{"key":"32_CR31","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, \u00c0., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: Proceedings of CVPR 2016, pp. 2921\u20132929. 
Las Vegas, Nevada (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"32_CR32","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Sun, Z., Cheng, S., Huang, L., Wu, L., Wang, M.: Beyond triplet: leveraging the most data for multimodal machine translation. In: Proceedings of ACL, pp. 2679\u20132697. Toronto, Canada (2023)","DOI":"10.18653\/v1\/2023.findings-acl.168"}],"container-title":["Lecture Notes in Computer Science","PRICAI 2024: Trends in Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-0119-6_32","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,23]],"date-time":"2025-01-23T12:36:45Z","timestamp":1737635805000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-0119-6_32"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,12]]},"ISBN":["9789819601189","9789819601196"],"references-count":32,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-0119-6_32","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,12]]},"assertion":[{"value":"12 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRICAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific Rim International Conference on Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kyoto","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pricai2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.pricai.org\/2024\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
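The record above is the standard envelope returned by the public Crossref REST API's /works/{doi} route: the metadata of interest sits under the "message" key, titles and container titles arrive as lists, and dates are nested "date-parts" arrays. The sketch below is a minimal illustration of fetching and reading this record with only the Python standard library; the DOI and field paths are taken from the record itself, and everything else (variable names, the printed summary) is illustrative rather than part of any official client.

```python
import json
import urllib.request

# DOI copied from the record above; /works/{doi} is the public Crossref
# REST API route that returns the {"status", "message", ...} envelope.
DOI = "10.1007/978-981-96-0119-6_32"
URL = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(URL) as resp:
    work = json.load(resp)["message"]  # payload lives under "message"

title = work["title"][0]                       # "title" is a one-element list
authors = [f'{a.get("given", "")} {a["family"]}'.strip()
           for a in work.get("author", [])]
year = work["issued"]["date-parts"][0][0]      # nested [[year, month, day]]
pages = work.get("page", "n/a")

print(f"{title} ({year}), pp. {pages}")
print("Authors:", ", ".join(authors))
print("Cited references:", work.get("references-count", 0))
```

A production client would add a descriptive User-Agent and retry handling (for example via the requests library); the standard-library version above keeps the sketch dependency-free.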