{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T16:20:55Z","timestamp":1773246055297,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2021YFC2202603"],"award-info":[{"award-number":["2021YFC2202603"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangdong Provincial Key Laboratory of Human Digital Twin","award":["2022B1212010004"],"award-info":[{"award-number":["2022B1212010004"]}]},{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372187"],"award-info":[{"award-number":["62372187"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3652583.3658018","type":"proceedings-article","created":{"date-parts":[[2024,6,7]],"date-time":"2024-06-07T06:30:40Z","timestamp":1717741840000},"page":"860-868","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["RetrievalMMT: Retrieval-Constrained Multi-Modal Prompt Learning for Multi-Modal Machine Translation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9403-2844","authenticated-orcid":false,"given":"Yan","family":"Wang","sequence":"first","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1908-1157","authenticated-orcid":false,"given":"Yawen","family":"Zeng","sequence":"additional","affiliation":[{"name":"ByteDance AI Lab, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7200-0749","authenticated-orcid":false,"given":"Junjie","family":"Liang","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0016-9055","authenticated-orcid":false,"given":"Xiaofen","family":"Xing","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8735-3532","authenticated-orcid":false,"given":"Jin","family":"Xu","sequence":"additional","affiliation":[{"name":"South China University of Technology &amp; Pazhou Lab, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4573-5820","authenticated-orcid":false,"given":"Xiangmin","family":"Xu","sequence":"additional","affiliation":[{"name":"South China University of Technology, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,6,7]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Yunfei Chu Zeyu Cui Kai Dang Xiaodong Deng Yang Fan Wenbin Ge Yu Han Fei Huang Binyuan Hui Luo Ji Mei Li Junyang Lin Runji Lin Dayiheng Liu Gao Liu Chengqiang Lu Keming Lu Jianxin Ma Rui Men Xingzhang Ren Xuancheng Ren Chuanqi Tan Sinan Tan Jianhong Tu Peng Wang Shijie Wang Wei Wang Shengguang Wu Benfeng Xu Jin Xu An 
Yang Hao Yang Jian Yang Shusheng Yang Yang Yao Bowen Yu Hongyi Yuan Zheng Yuan Jianwei Zhang Xingxuan Zhang Yichang Zhang Zhenru Zhang Chang Zhou Jingren Zhou Xiaohuan Zhou and Tianhang Zhu. 2023 a. Qwen Technical Report. arxiv: 2309.16609 [cs.CL]"},{"key":"e_1_3_2_1_2_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023 b. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arxiv: 2308.12966 [cs.CV]"},{"key":"e_1_3_2_1_3_1","volume-title":"NeurIPS. Curran Associates","author":"Brown Tom","year":"1877","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In NeurIPS. Curran Associates, Inc., 1877--1901."},{"key":"e_1_3_2_1_4_1","unstructured":"Ozan Caglayan. 2019. Multimodal machine translation. Ph. D. Dissertation. Universit\u00e9 du Maine."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the Second Conference on Machine Translation.","author":"Caglayan Ozan","unstructured":"Ozan Caglayan, Walid Aransa, Adrien Bardet, Mercedes Garc'ia-Mart'inez, Fethi Bougares, Lo\"ic Barrault, Marc Masana, Luis Herranz, and Joost van de Weijer. 2017. LIUM-CVC Submissions for WMT17 Multimodal Translation Task. In Proceedings of the Second Conference on Machine Translation."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"Iacer Calixto and Qun Liu. 2017. Incorporating Global Visual Features into Attention-based Neural Machine Translation.. In EMNLP. 992--1003.","DOI":"10.18653\/v1\/D17-1105"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1175"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Iacer Calixto Miguel Rios and Wilker Aziz. 2019. Latent Variable Model for Multi-modal Translation. In ACL. 6392--6405.","DOI":"10.18653\/v1\/P19-1642"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Qingkai Fang and Yang Feng. 2022. Neural Machine Translation with Phrase-Level Universal Visual Representations. arxiv: 2203.10299 [cs.CL]","DOI":"10.18653\/v1\/2022.acl-long.390"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Julia Ive Pranava Madhyastha and Lucia Specia. 2019. Distilling Translations with Visual Awareness. In ACL. 6525--6538.","DOI":"10.18653\/v1\/P19-1653"},{"key":"e_1_3_2_1_12_1","unstructured":"Wonjae Kim Bokyung Son and Ildoo Kim. 2021. ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision. In ICML. 5583--5594."},{"key":"e_1_3_2_1_13_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In ICLR, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.3115\/1626355.1626389"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Brian Lester Rami Al-Rfou and Noah Constant. 2021. 
The Power of Scale for Parameter-Efficient Prompt Tuning. In EMNLP. 3045--3059.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"e_1_3_2_1_16_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_1_17_1","unstructured":"Xiang Lisa Li and Percy Liang. 2021. Prefix-Tuning: Optimizing Continuous Prompts for Generation. In ACL. 4582--4597."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Huan Lin Fandong Meng Jinsong Su Yongjing Yin Zhengyuan Yang Yubin Ge Jie Zhou and Jiebo Luo. 2020. Dynamic context-guided capsule network for multimodal machine translation. In MM. 1320--1329.","DOI":"10.1145\/3394171.3413715"},{"key":"e_1_3_2_1_19_1","volume-title":"Gumbel-Attention for Multi-modal Machine Translation. arXiv preprint arXiv:2103.08862","author":"Liu Pengbo","year":"2021","unstructured":"Pengbo Liu, Hailong Cao, and Tiejun Zhao. 2021a. Gumbel-Attention for Multi-modal Machine Translation. arXiv preprint arXiv:2103.08862 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"A Systematic Survey of Prompting Methods in Natural Language Processing. arXiv:2107.13586","author":"Liu Pengfei","year":"2021","unstructured":"Pengfei Liu, Weizhe Yuan, Jinlan Fu, Jiang Zhengbao, Hiroaki Hayashi, and Graham Neubig. 2021b. Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing. arXiv:2107.13586 (2021)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Quanyu Long Mingxuan Wang and Lei Li. 2021. Generative Imagination Elevates Machine Translation. In NAACL. 5738--5748.","DOI":"10.18653\/v1\/2021.naacl-main.457"},{"key":"e_1_3_2_1_22_1","volume-title":"Text2Event: Controllable sequence-to-structure generation for end-to-end event extraction. arXiv preprint arXiv:2106.09232","author":"Lu Yaojie","year":"2021","unstructured":"Yaojie Lu, Hongyu Lin, Jin Xu, Xianpei Han, Jialong Tang, Annan Li, Le Sun, Meng Liao, and Shaoyi Chen. 2021. Text2Event: Controllable sequence-to-structure generation for end-to-end event extraction. arXiv preprint arXiv:2106.09232 (2021)."},{"key":"e_1_3_2_1_23_1","unstructured":"Keyu Pan and Yawen Zeng. 2023. Do LLMs Possess a Personality? Making the MBTI Test an Amazing Evaluation for Large Language Models. arxiv: 2307.16180 [cs.CL]"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Kishore Papineni Salim Roukos Todd Ward and Wei-Jing Zhu. 2002. Bleu: a Method for Automatic Evaluation of Machine Translation. In ACL. ACL 311--318.","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Ru Peng Yawen Zeng and Junbo Zhao. 2022. HybridVocab: Towards Multi-Modal Machine Translation via Multi-Aspect Alignment. In ICMR. 380--388.","DOI":"10.1145\/3512527.3531386"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Ru Peng Yawen Zeng and Junbo Zhao. 2023. Distill the Image to Nowhere: Inversion Knowledge Distillation for Multimodal Machine Translation. 
arxiv: 2210.04468 [cs.CL]","DOI":"10.18653\/v1\/2022.emnlp-main.152"},{"key":"e_1_3_2_1_27_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML. 8748--8763."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3092187"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-2346"},{"key":"e_1_3_2_1_30_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. In Advances in neural information processing systems. 5998--6008."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Dexin Wang and Deyi Xiong. 2021. Efficient Object-Level Visual Context Modeling for Multimodal Machine Translation: Masking Irrelevant Objects Helps Grounding. In AAAI. 2720--2728.","DOI":"10.1609\/aaai.v35i4.16376"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5389"},{"key":"e_1_3_2_1_33_1","volume-title":"Good for Misconceived Reasons: An Empirical Revisiting on the Need for Visual Context in Multimodal Machine Translation. arXiv preprint arXiv:2105.14462","author":"Wu Zhiyong","year":"2021","unstructured":"Zhiyong Wu, Lingpeng Kong, Wei Bi, Xiang Li, and Ben Kao. 2021. Good for Misconceived Reasons: An Empirical Revisiting on the Need for Visual Context in Multimodal Machine Translation. arXiv preprint arXiv:2105.14462 (2021)."},{"key":"e_1_3_2_1_34_1","unstructured":"Shaowei Yao and Xiaojun Wan. 2020. Multimodal Transformer for Multimodal Machine Translation. In ACL Dan Jurafsky Joyce Chai Natalie Schluter and Joel R. Tetreault (Eds.). 4346--4350."},{"key":"e_1_3_2_1_35_1","volume-title":"CPT: Colorful Prompt Tuning for Pre-trained Vision-Language Models. CoRR","author":"Yao Yuan","year":"2021","unstructured":"Yuan Yao, Ao Zhang, Zhengyan Zhang, Zhiyuan Liu, Tat-Seng Chua, and Maosong Sun. 2021. CPT: Colorful Prompt Tuning for Pre-trained Vision-Language Models. CoRR , Vol. abs\/2109.11797 (2021)."},{"key":"e_1_3_2_1_36_1","unstructured":"Yongjing Yin Fandong Meng Jinsong Su Chulun Zhou Zhengyuan Yang Jie Zhou and Jiebo Luo. 2020. A Novel Graph-based Multi-modal Fusion Encoder for Neural Machine Translation. In ACL. 3025--3035."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Yawen Zeng. 2022. Point Prompt Tuning for Temporally Language Grounding. In SIGIR. 2003--2007.","DOI":"10.1145\/3477495.3531795"},{"key":"e_1_3_2_1_38_1","volume-title":"Multi-Modal Relational Graph for Cross-Modal Video Moment Retrieval","author":"Zeng Yawen","unstructured":"Yawen Zeng, Da Cao, Xiaochi Wei, Meng Liu, Zhou Zhao, and Zheng Qin. 2021. Multi-Modal Relational Graph for Cross-Modal Video Moment Retrieval. In CVPR. IEEE, 2215--2224."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2023.121601"},{"key":"e_1_3_2_1_40_1","unstructured":"Zhuosheng Zhang Kehai Chen Rui Wang Masao Utiyama Eiichiro Sumita Zuchao Li and Hai Zhao. 2020. Neural Machine Translation with Universal Visual Representation. In ICLR. 
https:\/\/openreview.net\/forum?id=Byl8hhNYPS"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.12.076"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"e_1_3_2_1_43_1","volume-title":"Yong Jae Lee, and Zhou Yu","author":"Zhou Mingyang","year":"2018","unstructured":"Mingyang Zhou, Runxiang Cheng, Yong Jae Lee, and Zhou Yu. 2018. A Visual Attention Grounding Neural Model for Multimodal Machine Translation. In EMNLP. 3643--3653."}],"event":{"name":"ICMR '24: International Conference on Multimedia Retrieval","location":"Phuket Thailand","acronym":"ICMR '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia","SIGSOFT ACM Special Interest Group on Software Engineering"]},"container-title":["Proceedings of the 2024 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658018","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3652583.3658018","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T08:43:49Z","timestamp":1755765829000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3652583.3658018"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":43,"alternative-id":["10.1145\/3652583.3658018","10.1145\/3652583"],"URL":"https:\/\/doi.org\/10.1145\/3652583.3658018","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}
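The record above has the envelope shape ("status" / "message-type" / "message") returned by the public Crossref REST API for a single work. As a minimal sketch of how such a record can be fetched and unpacked (assuming the standard https://api.crossref.org/works/{DOI} endpoint and the Python requests library; the "mailto" address is a placeholder used for Crossref's polite-pool convention, not part of the record):

import requests

# DOI taken from the record above.
DOI = "10.1145/3652583.3658018"

# Fetch the work record; "mailto" is a placeholder identifying the caller
# per Crossref's polite-pool etiquette.
resp = requests.get(
    f"https://api.crossref.org/works/{DOI}",
    params={"mailto": "you@example.org"},  # hypothetical contact address
    timeout=30,
)
resp.raise_for_status()

# The payload of interest lives under the "message" key, as in the record above.
work = resp.json()["message"]

# Pull out a few of the fields present in this record.
title = work["title"][0]
authors = ", ".join(f'{a["given"]} {a["family"]}' for a in work.get("author", []))
print(title)
print(authors)
print("Reference count:", work.get("references-count"))

# Each entry in "reference" may carry a free-text citation ("unstructured")
# and/or a resolved "DOI", so fall back between them.
for ref in work.get("reference", [])[:3]:
    print("-", ref.get("unstructured") or ref.get("DOI", "(no citation data)"))

Note that "reference-count" and "references-count" are distinct Crossref fields (both appear in this record with value 43); the loop above deliberately tolerates entries that carry only a DOI, such as e_1_3_2_1_7_1.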