{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T06:09:14Z","timestamp":1775542154687,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":90,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T00:00:00Z","timestamp":1665360000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R&D Program of China","award":["No. 2018YFB1404102"],"award-info":[{"award-number":["No. 2018YFB1404102"]}]},{"name":"the Public Welfare Technology Research Project of Zhejiang Province","award":["No. LGF21F020010"],"award-info":[{"award-number":["No. LGF21F020010"]}]},{"name":"NSFC","award":["No. 61902347, 61976188"],"award-info":[{"award-number":["No. 61902347, 61976188"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,10]]},"DOI":"10.1145\/3503161.3548003","type":"proceedings-article","created":{"date-parts":[[2022,10,10]],"date-time":"2022-10-10T15:42:35Z","timestamp":1665416555000},"page":"422-433","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["Cross-Lingual Cross-Modal Retrieval with Noise-Robust Learning"],"prefix":"10.1145","author":[{"given":"Yabing","family":"Wang","sequence":"first","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"given":"Jianfeng","family":"Dong","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"given":"Tianxiang","family":"Liang","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"given":"Minsong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"given":"Rui","family":"Cai","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]},{"given":"Xun","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2022,10,10]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Towards zero-shot Cross-lingual Image retrieval. arXiv preprint arXiv:2012.05107","author":"Aggarwal Pranav","year":"2020","unstructured":"Pranav Aggarwal and Ajinkya Kale . 2020. Towards zero-shot Cross-lingual Image retrieval. arXiv preprint arXiv:2012.05107 ( 2020 ). Pranav Aggarwal and Ajinkya Kale. 2020. Towards zero-shot Cross-lingual Image retrieval. arXiv preprint arXiv:2012.05107 (2020)."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00055"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00118"},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the Sixth International Conference on Learning Representations.","author":"Artetxe Mikel","year":"2017","unstructured":"Mikel Artetxe , Gorka Labaka , Eneko Agirre , and Kyunghyun Cho . 2017 . Unsupervised neural machine translation . In Proceedings of the Sixth International Conference on Learning Representations. Mikel Artetxe, Gorka Labaka, Eneko Agirre, and Kyunghyun Cho. 2017. Unsupervised neural machine translation. In Proceedings of the Sixth International Conference on Learning Representations."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00288"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_12"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19891"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_2_9_1","unstructured":"Xiaojun Chang Yi Yang Alexander Hauptmann Eric P Xing and Yao-Liang Yu. 2015. Semantic concept discovery for large-scale zero-shot event detection. In Twenty-fourth international joint conference on artificial intelligence.  Xiaojun Chang Yi Yang Alexander Hauptmann Eric P Xing and Yao-Liang Yu. 2015. Semantic concept discovery for large-scale zero-shot event detection. In Twenty-fourth international joint conference on artificial intelligence."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_2_12_1","volume-title":"Word translation without parallel data. arXiv preprint arXiv:1710.04087","author":"Conneau Alexis","year":"2017","unstructured":"Alexis Conneau , Guillaume Lample , Marc'Aurelio Ranzato , Ludovic Denoyer , and Herv\u00e9 J\u00e9gou . 2017. Word translation without parallel data. arXiv preprint arXiv:1710.04087 ( 2017 ). Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer, and Herv\u00e9 J\u00e9gou. 2017. Word translation without parallel data. arXiv preprint arXiv:1710.04087 (2017)."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2940693"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17506"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2507880"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_17_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2019 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding . In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies , Volume 1 (Long and Short Papers). Minneapolis, Minnesota, 4171--4186. Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers). Minneapolis, Minnesota, 4171--4186."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3150959"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_2_23_1","volume-title":"Proceedings of the Eighth International Joint Conference on Natural Language Processing(Volume 1: Long Papers). 130--141","author":"Elliott Desmond","year":"2017","unstructured":"Desmond Elliott and Akos K\u00e1d\u00e1r . 2017 . Imagination improves multimodal translation . In Proceedings of the Eighth International Joint Conference on Natural Language Processing(Volume 1: Long Papers). 130--141 . Desmond Elliott and Akos K\u00e1d\u00e1r. 2017. Imagination improves multimodal translation. In Proceedings of the Eighth International Joint Conference on Natural Language Processing(Volume 1: Long Papers). 130--141."},{"key":"e_1_3_2_2_24_1","volume-title":"Proceedings of the British Machine Vision Conference.","author":"Faghri Fartash","year":"2018","unstructured":"Fartash Faghri , David J Fleet , Jamie Ryan Kiros , and Sanja Fidler . 2018 . Vse: Improving visual-semantic embeddings with hard negatives . Proceedings of the British Machine Vision Conference. Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2018. Vse: Improving visual-semantic embeddings with hard negatives. Proceedings of the British Machine Vision Conference."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.285"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00217"},{"key":"e_1_3_2_2_27_1","volume-title":"European Conference on Computer Vision. 214--229","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur , Chen Sun , Karteek Alahari , and Cordelia Schmid . 2020 . Multimodal transformer for video retrieval . In European Conference on Computer Vision. 214--229 . Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multimodal transformer for video retrieval. In European Conference on Computer Vision. 214--229."},{"key":"e_1_3_2_2_28_1","first-page":"6616","article-title":"Large-scale adversarial training for vision-and-language representation learning","volume":"33","author":"Gan Zhe","year":"2020","unstructured":"Zhe Gan , Yen-Chun Chen , Linjie Li , Chen Zhu , Yu Cheng , and Jingjing Liu . 2020 . Large-scale adversarial training for vision-and-language representation learning . Advances in Neural Information Processing Systems 33 (2020), 6616 -- 6628 . Zhe Gan, Yen-Chun Chen, Linjie Li, Chen Zhu, Yu Cheng, and Jingjing Liu. 2020. Large-scale adversarial training for vision-and-language representation learning. Advances in Neural Information Processing Systems 33 (2020), 6616--6628.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_29_1","volume-title":"Video2vec embeddings recognize events when examples are scarce","author":"Habibian Amirhossein","year":"2016","unstructured":"Amirhossein Habibian , Thomas Mensink , and Cees GM Snoek . 2016. Video2vec embeddings recognize events when examples are scarce . IEEE transactions on pattern analysis and machine intelligence 39, 10 ( 2016 ), 2089--2103. Amirhossein Habibian, Thomas Mensink, and Cees GM Snoek. 2016. Video2vec embeddings recognize events when examples are scarce. IEEE transactions on pattern analysis and machine intelligence 39, 10 (2016), 2089--2103."},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475241"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463031"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462838"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.findings-acl.150"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.767"},{"key":"e_1_3_2_2_37_1","volume-title":"International Conference on Machine Learning. 4904--4916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia , Yinfei Yang , Ye Xia , Yi-Ting Chen , Zarana Parekh , Hieu Pham , Quoc Le , Yun-Hsuan Sung , Zhen Li , and Tom Duerig . 2021 . Scaling up visual and visionlanguage representation learning with noisy text supervision . In International Conference on Machine Learning. 4904--4916 . Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and visionlanguage representation learning with noisy text supervision. In International Conference on Machine Learning. 4904--4916."},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6785"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123366"},{"key":"e_1_3_2_2_40_1","volume-title":"mtvr: Multilingual moment retrieval in videos. arXiv preprint arXiv:2108.00061","author":"Lei Jie","year":"2021","unstructured":"Jie Lei , Tamara L Berg , and Mohit Bansal . 2021. mtvr: Multilingual moment retrieval in videos. arXiv preprint arXiv:2108.00061 ( 2021 ). Jie Lei, Tamara L Berg, and Mohit Bansal. 2021. mtvr: Multilingual moment retrieval in videos. arXiv preprint arXiv:2108.00061 (2021)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"e_1_3_2_2_44_1","volume-title":"Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks","volume":"1","author":"Li Linjie","year":"2021","unstructured":"Linjie Li , Jie Lei , Zhe Gan , Licheng Yu , Yen-Chun Chen , Rohit Pillai , Yu Cheng , Luowei Zhou , Xin EricWang , William YangWang , 2021 . Value: A multi-task benchmark for video-and-language understanding evaluation . In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks , Vol. 1 . Linjie Li, Jie Lei, Zhe Gan, Licheng Yu, Yen-Chun Chen, Rohit Pillai, Yu Cheng, Luowei Zhou, Xin EricWang, William YangWang, et al. 2021. Value: A multi-task benchmark for video-and-language understanding evaluation. In Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks, Vol. 1."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911996.2912049"},{"key":"e_1_3_2_2_46_1","volume-title":"Proceedings of the 27th ACM International Conference on Multimedia. 1786--1794","author":"Li Xirong","year":"2019","unstructured":"Xirong Li , Chaoxi Xu , Gang Yang , Zhineng Chen , and Jianfeng Dong . 2019 . W2vv fully deep learning for ad-hoc video search . In Proceedings of the 27th ACM International Conference on Multimedia. 1786--1794 . Xirong Li, Chaoxi Xu, Gang Yang, Zhineng Chen, and Jianfeng Dong. 2019. W2vv fully deep learning for ad-hoc video search. In Proceedings of the 27th ACM International Conference on Multimedia. 1786--1794."},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3042067"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2020.09.057"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475621"},{"key":"e_1_3_2_2_51_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu , Samuel Albanie , Arsha Nagrani , and Andrew Zisserman . 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 ( 2019 ). Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.442"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01073"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331217"},{"key":"e_1_3_2_2_55_1","volume-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860","author":"Luo Huaishao","year":"2021","unstructured":"Huaishao Luo , Lei Ji , Ming Zhong , Yang Chen , Wen Lei , Nan Duan , and Tianrui Li. 2021. Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 ( 2021 ). Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2021. Clip4clip: An empirical study of clip for end to end video clip retrieval. arXiv preprint arXiv:2104.08860 (2021)."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475703"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W15-1521"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3079041"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377875"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00970"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00397"},{"key":"e_1_3_2_2_63_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-industry.13"},{"key":"e_1_3_2_2_64_1","volume-title":"8th ACM IKDD CODS and 26th COMAD. 178--187.","author":"Philip Jerin","unstructured":"Jerin Philip , Shashank Siripragada , Vinay P Namboodiri , and CV Jawahar . 2021. Revisiting low resource status of indian languages in machine translation . In 8th ACM IKDD CODS and 26th COMAD. 178--187. Jerin Philip, Shashank Siripragada, Vinay P Namboodiri, and CV Jawahar. 2021. Revisiting low resource status of indian languages in machine translation. In 8th ACM IKDD CODS and 26th COMAD. 178--187."},{"key":"e_1_3_2_2_65_1","volume-title":"Image search using multilingual texts: a cross-modal learning approach between image and text. arXiv preprint arXiv:1903.11299","author":"Portaz Maxime","year":"2019","unstructured":"Maxime Portaz , Hicham Randrianarivo , Adrien Nivaggioli , Estelle Maudet , Christophe Servan , and Sylvain Peyronnet . 2019. Image search using multilingual texts: a cross-modal learning approach between image and text. arXiv preprint arXiv:1903.11299 ( 2019 ). Maxime Portaz, Hicham Randrianarivo, Adrien Nivaggioli, Estelle Maudet, Christophe Servan, and Sylvain Peyronnet. 2019. Image search using multilingual texts: a cross-modal learning approach between image and text. arXiv preprint arXiv:1903.11299 (2019)."},{"key":"e_1_3_2_2_66_1","volume-title":"International Conference on Machine Learning. 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford , Jong Wook Kim , Chris Hallacy , Aditya Ramesh , Gabriel Goh , Sandhini Agarwal , Girish Sastry , Amanda Askell , Pamela Mishkin , Jack Clark , 2021 . Learning transferable visual models from natural language supervision . In International Conference on Machine Learning. 8748--8763 . Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3090595"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475303"},{"key":"e_1_3_2_2_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"e_1_3_2_2_72_1","volume-title":"Many Hands Make Light Work: Transferring Knowledge from Auxiliary Tasks for Video-Text Retrieval","author":"Wang Wei","year":"2022","unstructured":"Wei Wang , Junyu Gao , Xiaoshan Yang , and Changsheng Xu. 2022. Many Hands Make Light Work: Transferring Knowledge from Auxiliary Tasks for Video-Text Retrieval . IEEE Transactions on Multimedia ( 2022 ). Wei Wang, Junyu Gao, Xiaoshan Yang, and Changsheng Xu. 2022. Many Hands Make Light Work: Transferring Knowledge from Auxiliary Tasks for Video-Text Retrieval. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3168424"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00590"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475451"},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2923608"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475515"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_2_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_2_84_1","volume-title":"Steve Yuan, Chris Tar, Yun-Hsuan Sung, et al.","author":"Yang Yinfei","year":"2019","unstructured":"Yinfei Yang , Daniel Cer , Amin Ahmad , Mandy Guo , Jax Law , Noah Constant , Gustavo Hernandez Abrego , Steve Yuan, Chris Tar, Yun-Hsuan Sung, et al. 2019 . Multilingual universal sentence encoder for semantic retrieval. arXiv preprint arXiv:1907.04307 (2019). Yinfei Yang, Daniel Cer, Amin Ahmad, Mandy Guo, Jax Law, Noah Constant, Gustavo Hernandez Abrego, Steve Yuan, Chris Tar, Yun-Hsuan Sung, et al. 2019. Multilingual universal sentence encoder for semantic retrieval. arXiv preprint arXiv:1907.04307 (2019)."},{"key":"e_1_3_2_2_85_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"e_1_3_2_2_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482233"},{"key":"e_1_3_2_2_87_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_2_88_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475380"},{"key":"e_1_3_2_2_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00414"},{"key":"e_1_3_2_2_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"MM '22: The 30th ACM International Conference on Multimedia","location":"Lisboa Portugal","acronym":"MM '22","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 30th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548003","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503161.3548003","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:29Z","timestamp":1750186949000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503161.3548003"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,10]]},"references-count":90,"alternative-id":["10.1145\/3503161.3548003","10.1145\/3503161"],"URL":"https:\/\/doi.org\/10.1145\/3503161.3548003","relation":{},"subject":[],"published":{"date-parts":[[2022,10,10]]},"assertion":[{"value":"2022-10-10","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}