{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T05:05:00Z","timestamp":1750309500170,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681577","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"701-709","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Partially Aligned Cross-modal Retrieval via Optimal Transport-based Prototype Alignment Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1605-1219","authenticated-orcid":false,"given":"Junsheng","family":"Wang","sequence":"first","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8142-3782","authenticated-orcid":false,"given":"Tiantian","family":"Gong","sequence":"additional","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7618-119X","authenticated-orcid":false,"given":"Yan","family":"Yan","sequence":"additional","affiliation":[{"name":"Illinois Institute of Technology, Chicago, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"23270","article-title":"Unbalanced optimal transport through non-negative penalized linear regression","volume":"34","author":"Chapel 
Laetitia","year":"2021","unstructured":"Laetitia Chapel, R\u00e9mi Flamary, Haoran Wu, C\u00e9dric F\u00e9votte, and Gilles Gasso. 2021. Unbalanced optimal transport through non-negative penalized linear regression. Advances in Neural Information Processing Systems 34 (2021), 23270--23282.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Machine Learning. PMLR, 1542--1553","author":"Chen Liqun","year":"2020","unstructured":"Liqun Chen, Zhe Gan, Yu Cheng, Linjie Li, Lawrence Carin, and Jingjing Liu. 2020. Graph optimal transport for cross-domain alignment. In International Conference on Machine Learning. PMLR, 1542--1553."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01953"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/1646396.1646452"},{"key":"e_1_3_2_1_5_1","volume-title":"Optimal transport for domain adaptation","author":"Courty Nicolas","year":"2016","unstructured":"Nicolas Courty, R\u00e9mi Flamary, Devis Tuia, and Alain Rakotomamonjy. 2016. Optimal transport for domain adaptation. IEEE transactions on pattern analysis and machine intelligence 39, 9 (2016), 1853--1865."},{"key":"e_1_3_2_1_6_1","volume-title":"Lightspeed Computation of Optimal Transport. Advances in neural information processing systems 26","author":"Sinkhorn Distances Cuturi M","year":"2013","unstructured":"Cuturi M Sinkhorn Distances. 2013. Lightspeed Computation of Optimal Transport. Advances in neural information processing systems 26 (2013), 2292--2300."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3262832"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613802"},{"key":"e_1_3_2_1_10_1","volume-title":"Canonical correlation analysis: An overview with application to learning methods. 
Neural computation 16, 12","author":"Hardoon David R","year":"2004","unstructured":"David R Hardoon, Sandor Szedmak, and John Shawe-Taylor. 2004. Canonical correlation analysis: An overview with application to learning methods. Neural computation 16, 12 (2004), 2639--2664."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_12_1","volume-title":"Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3331184.3331213"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i01.5339"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00921"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413676"},{"key":"e_1_3_2_1_17_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_18_1","volume-title":"Fine-grained late-interaction multi-modal retrieval for retrieval augmented visual question answering. Advances in Neural Information Processing Systems 36","author":"Lin Weizhe","year":"2024","unstructured":"Weizhe Lin, Jinghong Chen, Jingbiao Mei, Alexandru Coca, and Bill Byrne. 2024. Fine-grained late-interaction multi-modal retrieval for retrieval augmented visual question answering. 
Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Self-supervised Correlation Learning for Cross-Modal Retrieval","author":"Liu Yaxin","year":"2022","unstructured":"Yaxin Liu, Jianlong Wu, Leigang Qu, Tian Gan, Jianhua Yin, and Liqiang Nie. 2022. Self-supervised Correlation Learning for Cross-Modal Retrieval. IEEE Transactions on Multimedia (2022)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP40778.2020.9190722"},{"key":"e_1_3_2_1_21_1","volume-title":"International Joint Conference on Artificial Intelligence. 3846--3853","author":"Peng Yuxin","year":"2016","unstructured":"Yuxin Peng, Xin Huang, and Jinwei Qi. 2016. Cross-media shared representation by hierarchical learning with multiple deep networks. In International Joint Conference on Artificial Intelligence. 3846--3853."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3284750"},{"key":"e_1_3_2_1_23_1","volume-title":"CCL: Cross-modal correlation learning with multigrained fusion by hierarchical network","author":"Peng Yuxin","year":"2017","unstructured":"Yuxin Peng, Jinwei Qi, Xin Huang, and Yuxin Yuan. 2017. CCL: Cross-modal correlation learning with multigrained fusion by hierarchical network. IEEE Transactions on Multimedia (2017), 405--420."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2852503"},{"key":"e_1_3_2_1_25_1","volume-title":"Locate before answering: Answer guided question localization for video question answering","author":"Qian Tianwen","year":"2023","unstructured":"Tianwen Qian, Ran Cui, Jingjing Chen, Pai Peng, Xiaowei Guo, and Yu-Gang Jiang. 2023. Locate before answering: Answer guided question localization for video question answering. IEEE Transactions on Multimedia (2023)."},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Machine Learning. 
8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. 8748--8763."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/1866696.1866717"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_1_29_1","volume-title":"Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909","author":"Sennrich Rico","year":"2015","unstructured":"Rico Sennrich, Barry Haddow, and Alexandra Birch. 2015. Neural machine translation of rare words with subword units. arXiv preprint arXiv:1508.07909 (2015)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995350"},{"key":"e_1_3_2_1_31_1","volume-title":"Incomplete Cross-Modal Retrieval with Deep Correlation Transfer. ACM Transactions on Multimedia Computing, Communications and Applications","author":"Shi Dan","year":"2023","unstructured":"Dan Shi, Lei Zhu, Jingjing Li, Guohua Dong, and Huaxiang Zhang. 2023. Incomplete Cross-Modal Retrieval with Deep Correlation Transfer. ACM Transactions on Multimedia Computing, Communications and Applications (2023)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3238308"},{"key":"e_1_3_2_1_34_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. 
Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 30th ACM International Conference on Multimedia. 4300--4308","author":"Wang Junsheng","year":"2022","unstructured":"Junsheng Wang, Tiantian Gong, Zhixiong Zeng, Changchang Sun, and Yan Yan. 2022. C3CMR: Cross-Modality Cross-Instance Contrastive Learning for Cross-Media Retrieval. In Proceedings of the 30th ACM International Conference on Multimedia. 4300--4308."},{"key":"e_1_3_2_1_38_1","volume-title":"International conference on machine learning. PMLR, 1083--1092","author":"Arora Raman","year":"2015","unstructured":"Weiran Wang, Raman Arora, Karen Livescu, and Jeff Bilmes. 2015. On deep multiview representation learning. In International conference on machine learning. PMLR, 1083--1092."},{"key":"e_1_3_2_1_39_1","volume-title":"Large-scale approximate kernel canonical correlation analysis. arXiv preprint arXiv:1511.04773","author":"Karen Livescu WeiranWang","year":"2015","unstructured":"Weiran Wang and Karen Livescu. 2015. Large-scale approximate kernel canonical correlation analysis. arXiv preprint arXiv:1511.04773 (2015)."},{"key":"e_1_3_2_1_40_1","volume-title":"Masked Contrastive Reconstruction for Cross-modal Medical Image-Report Retrieval. arXiv preprint arXiv:2312.15840","author":"Wei Zeqiang","year":"2023","unstructured":"Zeqiang Wei, Kai Jin, and Xiuzhuang Zhou. 2023. 
Masked Contrastive Reconstruction for Cross-modal Medical Image-Report Retrieval. arXiv preprint arXiv:2312.15840 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548022"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298966"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00366"},{"key":"e_1_3_2_1_44_1","volume-title":"A comprehensive empirical study of vision-language pre-trained model for supervised cross-modal retrieval. arXiv preprint arXiv:2201.02772","author":"Zeng Zhixiong","year":"2022","unstructured":"Zhixiong Zeng and Wenji Mao. 2022. A comprehensive empirical study of vision-language pre-trained model for supervised cross-modal retrieval. arXiv preprint arXiv:2201.02772 (2022)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462867"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2723841"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.21"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on 
Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681577","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681577","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:48Z","timestamp":1750295868000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681577"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":49,"alternative-id":["10.1145\/3664647.3681577","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681577","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}