{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,4]],"date-time":"2026-07-04T16:38:42Z","timestamp":1783183122972,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["2020AAA0108600"],"award-info":[{"award-number":["2020AAA0108600"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61772535,62072462"],"award-info":[{"award-number":["61772535,62072462"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Beijing Natural Science Foundation","award":["4192028"],"award-info":[{"award-number":["4192028"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475303","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T04:52:26Z","timestamp":1634532746000},"page":"2843-2852","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Product-oriented Machine Translation with Cross-modal Cross-lingual Pre-training"],"prefix":"10.1145","author":[{"given":"Yuqing","family":"Song","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shizhe","family":"Chen","sequence":"additional","affiliation":[{"name":"INRIA, Paris, France"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qin","family":"Jin","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Wei","family":"Luo","sequence":"additional","affiliation":[{"name":"Alibaba Damo Academy, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jun","family":"Xie","sequence":"additional","affiliation":[{"name":"Alibaba Damo Academy, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fei","family":"Huang","sequence":"additional","affiliation":[{"name":"Alibaba Damo Academy, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Ozan Caglayan Lo\"i c Barrault and Fethi Bougares. 2016. Multimodal Attention for Neural Machine Translation. (2016). arxiv: 1609.03976  Ozan Caglayan Lo\"i c Barrault and Fethi Bougares. 2016. Multimodal Attention for Neural Machine Translation. (2016). arxiv: 1609.03976"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1105"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1175"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1642"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-2004"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-2101"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/3367722.3367731"},{"key":"e_1_3_2_1_8_1","volume-title":"The Thirty-Third AAAI Conference on Artificial Intelligence. 8207--8214","author":"Chen Shizhe"},{"key":"e_1_3_2_1_9_1","volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu. 2019 c. UNITER: Learning UNiversal Image-TExt Representations.","author":"Chen Yen-Chun","year":"2019"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.747"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454921"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1095"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the Ninth Workshop on Statistical Machine Translation. 376--380","author":"Michael"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics. 4171--4186","author":"Devlin Jacob","year":"2019"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W17-4718"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_1_17_1","volume-title":"International Joint Conference on Natural Language Processing. 130--141","author":"Desmond"},{"key":"e_1_3_2_1_18_1","volume-title":"British Machine Vision Conference. 12","author":"Faghri Fartash","year":"2018"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123394"},{"key":"e_1_3_2_1_20_1","volume-title":"Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition. 770--778","author":"He Kaiming","year":"2016"},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, ACL 2020, Online, July 5--10","author":"Huang Po-Yao","year":"2020"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1653"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_1_24_1","volume-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language. (2019). arxiv","author":"Li Liunian Harold","year":"1908"},{"key":"e_1_3_2_1_25_1","volume-title":"Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks. In European Conference on Computer Vision. 121--137","author":"Li Xiujun","year":"2020"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2031"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413715"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454289"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00397"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969239.2969250"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"e_1_3_2_1_33_1","volume-title":"Visual Grounding in Video for Unsupervised Word Translation. In IEEE Conference on Computer Vision and Pattern Recognition. 10847--10856","author":"Sigurdsson Gunnar A.","year":"2020"},{"key":"e_1_3_2_1_34_1","volume-title":"Unsupervised Multi-Modal Neural Machine Translation. In IEEE Conference on Computer Vision and Pattern Recognition. 10482--10491","author":"Su Yuanhang","year":"2019"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_38_1","volume-title":"The Fashion IQ Dataset: Retrieving Images by Combining Side Information and Relative Natural Language Feedback. arXiv preprint arXiv:1905.12794","author":"Wu Hui","year":"2019"},{"key":"e_1_3_2_1_39_1","volume-title":"Visual Agreement Regularized Training for Multi-Modal Machine Translation. In The Thirty-Fourth AAAI Conference on Artificial Intelligence. 9418--9425","author":"Yang Pengcheng","year":"2020"},{"key":"e_1_3_2_1_40_1","volume-title":"Fashion Captioning: Towards Generating Accurate Descriptions with Semantic Rewards. In European Conference on Computer Vision. 1--17","author":"Yang Xuewen","year":"2020"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.400"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.273"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413880"},{"key":"e_1_3_2_1_44_1","volume-title":"Comprehensive Information Integration Modeling Framework for Video Titling. In The 26th ACM SIGKDD Conference on Knowledge Discovery and Data Mining. 2744--2754","author":"Zhang Shengyu","year":"2020"},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 8th International Conference on Learning Representations.","author":"Zhang Zhuosheng","year":"2020"},{"key":"e_1_3_2_1_46_1","volume-title":"Unified Vision-Language Pre-Training for Image Captioning and VQA. In The Thirty-Fourth AAAI Conference on Artificial Intelligence. 13041--13049","author":"Zhou Luowei","year":"2020"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1400"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00414"}],"event":{"name":"MM '21: ACM Multimedia Conference","location":"Virtual Event China","acronym":"MM '21","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475303","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475303","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:49:18Z","timestamp":1750193358000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475303"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":48,"alternative-id":["10.1145\/3474085.3475303","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475303","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}