{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:13:08Z","timestamp":1750219988157,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":24,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,11,17]],"date-time":"2022-11-17T00:00:00Z","timestamp":1668643200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Development Foundation of the 54th Research Institute of China Electronics Technology Group Corporation","award":["SKX212010053"],"award-info":[{"award-number":["SKX212010053"]}]},{"name":"Natural Science Foundation of Guangxi","award":["2019GXNSFDA185006, 2019GXNSFDA185007"],"award-info":[{"award-number":["2019GXNSFDA185006, 2019GXNSFDA185007"]}]},{"name":"National Natural Science Foundation of China","award":["62262006"],"award-info":[{"award-number":["62262006"]}]},{"name":"Guilin Science and Technology Development Program","award":["20190211-17, 20210104-1"],"award-info":[{"award-number":["20190211-17, 20210104-1"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,11,17]]},"DOI":"10.1145\/3581807.3581857","type":"proceedings-article","created":{"date-parts":[[2023,5,23]],"date-time":"2023-05-23T00:02:28Z","timestamp":1684800148000},"page":"347-354","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Semantic Maximum Relevance and Modal Alignment for Cross-Modal Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7626-8167","authenticated-orcid":false,"given":"Pingping","family":"Sun","sequence":"first","affiliation":[{"name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing, Guilin University of Electronic Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3469-6590","authenticated-orcid":false,"given":"Baohua","family":"Qiang","sequence":"additional","affiliation":[{"name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing, Guilin University of Electronic Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5454-9435","authenticated-orcid":false,"given":"Zhiguang","family":"Liu","sequence":"additional","affiliation":[{"name":"Hebei Key Laboratory of Intelligent Information Perception and Processing, The 54th Research Institute of CETC, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9026-7934","authenticated-orcid":false,"given":"Xianyi","family":"Yang","sequence":"additional","affiliation":[{"name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing, Guilin University of Electronic Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1163-4565","authenticated-orcid":false,"given":"Guangyong","family":"Xi","sequence":"additional","affiliation":[{"name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing, Guilin University of Electronic Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6916-9950","authenticated-orcid":false,"given":"Weigang","family":"Liu","sequence":"additional","affiliation":[{"name":"Hebei Key Laboratory of Intelligent Information Perception and Processing, The 54th Research Institute of CETC, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3083-014X","authenticated-orcid":false,"given":"Ruidong","family":"Chen","sequence":"additional","affiliation":[{"name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing, Guilin University of Electronic Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6489-3694","authenticated-orcid":false,"given":"Shihao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Guangxi Key Laboratory of Image and Graphic Intelligent Processing, Guilin University of Electronic Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,5,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"A kernel method for canonical correlation analysis. arXiv preprint cs\/0609071","author":"Akaho Shotaro","year":"2006","unstructured":"Shotaro Akaho . 2006. A kernel method for canonical correlation analysis. arXiv preprint cs\/0609071 ( 2006 ). Shotaro Akaho. 2006. A kernel method for canonical correlation analysis. arXiv preprint cs\/0609071 (2006)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_2_1","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_3_1","volume-title":"Arcface: Additive angular margin loss for deep face recognition.","author":"Deng Jiankang","year":"2019","unstructured":"Jiankang Deng , Jia Guo , Niannan Xue , and Stefanos Zafeiriou . 2019 . Arcface: Additive angular margin loss for deep face recognition. (2019), 4690\u20134699. Jiankang Deng, Jia Guo, Niannan Xue, and Stefanos Zafeiriou. 2019. Arcface: Additive angular margin loss for deep face recognition. (2019), 4690\u20134699."},{"key":"e_1_3_2_1_4_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy , Lucas Beyer , Alexander Kolesnikov , Dirk Weissenborn , Xiaohua Zhai , Thomas Unterthiner , Mostafa Dehghani , Matthias Minderer , Georg Heigold , Sylvain Gelly , 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 ( 2020 ). Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_5_1","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_1_6_1","volume-title":"X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. ArXiv abs\/2203.15086","author":"Gorti Satya\u00a0Krishna","year":"2022","unstructured":"Satya\u00a0Krishna Gorti , Noel Vouitsis , Junwei Ma , Keyvan Golestan , Maksims Volkovs , Animesh Garg , and Guangwei Yu. 2022. X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. ArXiv abs\/2203.15086 ( 2022 ). Satya\u00a0Krishna Gorti, Noel Vouitsis, Junwei Ma, Keyvan Golestan, Maksims Volkovs, Animesh Garg, and Guangwei Yu. 2022. X-Pool: Cross-Modal Language-Video Attention for Text-Video Retrieval. ArXiv abs\/2203.15086 (2022)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","first-page":"39","DOI":"10.18178\/joig.6.1.39-43","article-title":"Multimodal Sentiment Analysis of Arabic Videos","volume":"6","author":"Hassan\u00a0Najadat Ftoon\u00a0Abushaqra","year":"2018","unstructured":"Ftoon\u00a0Abushaqra Hassan\u00a0Najadat . 2018 . Multimodal Sentiment Analysis of Arabic Videos . Journal of Image and Graphics 6 , 1 (2018), 39 \u2013 43 . Ftoon\u00a0Abushaqra Hassan\u00a0Najadat. 2018. Multimodal Sentiment Analysis of Arabic Videos. Journal of Image and Graphics 6, 1 (2018), 39\u201343.","journal-title":"Journal of Image and Graphics"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_8_1","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_9_1","volume-title":"Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel . 2016. Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 ( 2016 ). Dan Hendrycks and Kevin Gimpel. 2016. Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)."},{"volume-title":"Breakthroughs in statistics","author":"Hotelling Harold","unstructured":"Harold Hotelling . 1992. Relations between two sets of variates . In Breakthroughs in statistics . Springer , 162\u2013190. Harold Hotelling. 1992. Relations between two sets of variates. In Breakthroughs in statistics. Springer, 162\u2013190.","key":"e_1_3_2_1_10_1"},{"key":"e_1_3_2_1_11_1","volume-title":"A decomposable attention model for natural language inference. arXiv preprint arXiv:1606.01933","author":"Parikh P","year":"2016","unstructured":"Ankur\u00a0 P Parikh , Oscar T\u00e4ckstr\u00f6m , Dipanjan Das , and Jakob Uszkoreit . 2016. A decomposable attention model for natural language inference. arXiv preprint arXiv:1606.01933 ( 2016 ). Ankur\u00a0P Parikh, Oscar T\u00e4ckstr\u00f6m, Dipanjan Das, and Jakob Uszkoreit. 2016. A decomposable attention model for natural language inference. arXiv preprint arXiv:1606.01933 (2016)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_12_1","DOI":"10.1109\/TMM.2017.2742704"},{"key":"e_1_3_2_1_13_1","volume-title":"On the role of correlation and abstraction in cross-modal multimedia retrieval","author":"Pereira Jose\u00a0Costa","year":"2013","unstructured":"Jose\u00a0Costa Pereira , Emanuele Coviello , Gabriel Doyle , Nikhil Rasiwasia , Gert\u00a0 RG Lanckriet , Roger Levy , and Nuno Vasconcelos . 2013. On the role of correlation and abstraction in cross-modal multimedia retrieval . IEEE transactions on pattern analysis and machine intelligence 36, 3 ( 2013 ), 521\u2013535. Jose\u00a0Costa Pereira, Emanuele Coviello, Gabriel Doyle, Nikhil Rasiwasia, Gert\u00a0RG Lanckriet, Roger Levy, and Nuno Vasconcelos. 2013. On the role of correlation and abstraction in cross-modal multimedia retrieval. IEEE transactions on pattern analysis and machine intelligence 36, 3 (2013), 521\u2013535."},{"key":"e_1_3_2_1_14_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford , Jong\u00a0Wook Kim , Chris Hallacy , Aditya Ramesh , Gabriel Goh , Sandhini Agarwal , Girish Sastry , Amanda Askell , Pamela Mishkin , Jack Clark , 2021 . Learning transferable visual models from natural language supervision . In International conference on machine learning. PMLR, 8748\u20138763 . Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_15_1","DOI":"10.5555\/1866696.1866717"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_16_1","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_1_17_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan\u00a0 N Gomez , \u0141ukasz Kaiser , and Illia Polosukhin . 2017. Attention is all you need. Advances in neural information processing systems 30 ( 2017 ). Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_18_1","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_1_19_1","volume-title":"Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904","author":"Wang Zirui","year":"2021","unstructured":"Zirui Wang , Jiahui Yu , Adams\u00a0Wei Yu , Zihang Dai , Yulia Tsvetkov , and Yuan Cao . 2021 . Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021). Zirui Wang, Jiahui Yu, Adams\u00a0Wei Yu, Zihang Dai, Yulia Tsvetkov, and Yuan Cao. 2021. Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"A Comprehensive Empirical Study of Vision-Language Pre-trained Model for Supervised Cross-Modal Retrieval. arXiv preprint arXiv:2201.02772","author":"Zeng Zhixiong","year":"2022","unstructured":"Zhixiong Zeng and Wenji Mao . 2022. A Comprehensive Empirical Study of Vision-Language Pre-trained Model for Supervised Cross-Modal Retrieval. arXiv preprint arXiv:2201.02772 ( 2022 ). Zhixiong Zeng and Wenji Mao. 2022. A Comprehensive Empirical Study of Vision-Language Pre-trained Model for Supervised Cross-Modal Retrieval. arXiv preprint arXiv:2201.02772 (2022)."},{"key":"e_1_3_2_1_21_1","volume-title":"Proceedings of the 29th ACM International Conference on Multimedia. 5427\u20135435","author":"Zeng Zhixiong","year":"2021","unstructured":"Zhixiong Zeng , Ying Sun , and Wenji Mao . 2021 . MCCN: Multimodal Coordinated Clustering Network for Large-Scale Cross-modal Retrieval . In Proceedings of the 29th ACM International Conference on Multimedia. 5427\u20135435 . Zhixiong Zeng, Ying Sun, and Wenji Mao. 2021. MCCN: Multimodal Coordinated Clustering Network for Large-Scale Cross-modal Retrieval. In Proceedings of the 29th ACM International Conference on Multimedia. 5427\u20135435."},{"key":"e_1_3_2_1_22_1","volume-title":"Pan: Prototype-based adaptive network for robust cross-modal retrieval.","author":"Zeng Zhixiong","year":"2021","unstructured":"Zhixiong Zeng , Shuai Wang , Nan Xu , and Wenji Mao . 2021 . Pan: Prototype-based adaptive network for robust cross-modal retrieval. (2021), 1125\u20131134. Zhixiong Zeng, Shuai Wang, Nan Xu, and Wenji Mao. 2021. Pan: Prototype-based adaptive network for robust cross-modal retrieval. (2021), 1125\u20131134."},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_23_1","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"e_1_3_2_1_24_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10394\u201310403","author":"Zhen Liangli","year":"2019","unstructured":"Liangli Zhen , Peng Hu , Xu Wang , and Dezhong Peng . 2019 . Deep supervised cross-modal retrieval . In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10394\u201310403 . Liangli Zhen, Peng Hu, Xu Wang, and Dezhong Peng. 2019. Deep supervised cross-modal retrieval. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 10394\u201310403."}],"event":{"acronym":"ICCPR 2022","name":"ICCPR 2022: 2022 11th International Conference on Computing and Pattern Recognition","location":"Beijing China"},"container-title":["Proceedings of the 2022 11th International Conference on Computing and Pattern Recognition"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581807.3581857","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581807.3581857","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:30Z","timestamp":1750182570000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581807.3581857"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,11,17]]},"references-count":24,"alternative-id":["10.1145\/3581807.3581857","10.1145\/3581807"],"URL":"https:\/\/doi.org\/10.1145\/3581807.3581857","relation":{},"subject":[],"published":{"date-parts":[[2022,11,17]]},"assertion":[{"value":"2023-05-22","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}