{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T05:49:32Z","timestamp":1777873772460,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","funder":[{"name":"Pioneer and Leading Goose R\\&D Program of Zhejiang","award":["Grant No. 2025C02110"],"award-info":[{"award-number":["Grant No. 2025C02110"]}]},{"name":"Public Welfare Research Program of Ningbo","award":["Grant No. 2024S062"],"award-info":[{"award-number":["Grant No. 2024S062"]}]},{"name":"Yongjiang Talent Project of Ningbo","award":["Grant No. 2024A-161-G"],"award-info":[{"award-number":["Grant No. 2024A-161-G"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,3]]},"DOI":"10.1145\/3711896.3737062","type":"proceedings-article","created":{"date-parts":[[2025,8,3]],"date-time":"2025-08-03T21:05:41Z","timestamp":1754255141000},"page":"3577-3585","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Multimodal Conditional Retrieval with High Controllability"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-7297-4536","authenticated-orcid":false,"given":"Xiaoda","family":"Yang","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9708-3225","authenticated-orcid":false,"given":"Xize","family":"Cheng","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6488-9695","authenticated-orcid":false,"given":"Minghui","family":"Fang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3344-3813","authenticated-orcid":false,"given":"Hongshun","family":"Qiu","sequence":"additional","affiliation":[{"name":"Beijing University of Technology, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4646-6896","authenticated-orcid":false,"given":"Yuhang","family":"Ma","sequence":"additional","affiliation":[{"name":"ByteDance Inc., Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6239-0660","authenticated-orcid":false,"given":"JunYu","family":"Lu","sequence":"additional","affiliation":[{"name":"China Merchants Research Institute of Advance Technology, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5525-2143","authenticated-orcid":false,"given":"Jiaqi","family":"Duan","sequence":"additional","affiliation":[{"name":"Suzhou University of Technology, Suzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8693-7142","authenticated-orcid":false,"given":"Sihang","family":"Cai","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7509-7563","authenticated-orcid":false,"given":"Zehan","family":"Wang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4618-1140","authenticated-orcid":false,"given":"Ruofan","family":"Hu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7682-7678","authenticated-orcid":false,"given":"Dongjie","family":"Fu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6121-0384","authenticated-orcid":false,"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3564-1628","authenticated-orcid":false,"given":"Tao","family":"Jin","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,8,3]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2015.84"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Zhe Chen Weiyun Wang Hao Tian Shenglong Ye Zhangwei Gao Erfei Cui Wenwen Tong Kongzhi Hu Jiapeng Luo Zheng Ma et al. 2024. How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open- Source Suites. arXiv preprint arXiv:2404.16821 (2024).","DOI":"10.1007\/s11432-024-4231-5"},{"key":"e_1_3_2_2_4_1","volume-title":"InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238","author":"Chen Zhe","year":"2023","unstructured":"Zhe Chen, Jiannan Wu, Wenhai Wang, Weijie Su, Guo Chen, Sen Xing, Muyan Zhong, Qinglong Zhang, Xizhou Zhu, Lewei Lu, Bin Li, Ping Luo, Tong Lu, Yu Qiao, and Jifeng Dai. 2023. InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks. arXiv preprint arXiv:2312.14238 (2023)."},{"key":"e_1_3_2_2_5_1","volume-title":"Introduction to modern information retrieval","author":"Chowdhury Gobinda G","unstructured":"Gobinda G Chowdhury. 2010. Introduction to modern information retrieval. Facet publishing."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_2_7_1","volume-title":"A learned representation for artistic style. arXiv preprint arXiv:1610.07629","author":"Dumoulin Vincent","year":"2016","unstructured":"Vincent Dumoulin, Jonathon Shlens, and Manjunath Kudlur. 2016. A learned representation for artistic style. arXiv preprint arXiv:1610.07629 (2016)."},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.265"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.265"},{"key":"e_1_3_2_2_10_1","unstructured":"James Jerome Gibson. 1966. The senses considered as perceptual systems. (1966)."},{"key":"e_1_3_2_2_11_1","volume-title":"Armand Joulin, and Ishan Misra.","author":"Girdhar Rohit","year":"2023","unstructured":"Rohit Girdhar, Alaaeldin El-Nouby, Zhuang Liu, Mannat Singh, Kalyan Vasudev Alwala, Armand Joulin, and Ishan Misra. 2023. ImageBind: One Embedding Space To Bind Them All. In CVPR."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1002\/wics.197"},{"key":"e_1_3_2_2_13_1","first-page":"133","article-title":"MosAIc: Finding Artistic Connections across Culture with Conditional Image Retrieval. In NeurIPS 2020 Competition and Demonstration Track","author":"Hamilton Mark","year":"2021","unstructured":"Mark Hamilton, Stephanie Fu, Mindren Lu, Johnny Bui, Darius Bopp, Zhenbang Chen, Felix Tran, MargaretWang, Marina Rogers, Lei Zhang, et al. 2021. MosAIc: Finding Artistic Connections across Culture with Conditional Image Retrieval. In NeurIPS 2020 Competition and Demonstration Track. PMLR, 133-155.","journal-title":"PMLR"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00576"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1117\/12.838849"},{"key":"e_1_3_2_2_16_1","volume-title":"Adaptive mixtures of local experts. Neural computation 3, 1","author":"Jacobs Robert A","year":"1991","unstructured":"Robert A Jacobs, Michael I Jordan, Steven J Nowlan, and Geoffrey E Hinton. 1991. Adaptive mixtures of local experts. Neural computation 3, 1 (1991), 79-87."},{"key":"e_1_3_2_2_17_1","volume-title":"Unifying visual semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539","author":"Kiros Ryan","year":"2014","unstructured":"Ryan Kiros, Ruslan Salakhutdinov, and Richard S Zemel. 2014. Unifying visual semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)."},{"key":"e_1_3_2_2_18_1","unstructured":"Alex Krizhevsky and Geoffrey Hinton. 2009. Learning Multiple Layers of Features from Tiny Images. Technical Report. University of Toronto."},{"key":"e_1_3_2_2_19_1","first-page":"740","volume-title":"Zurich","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision-ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, 740-755."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.740"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_2_2_23_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_2_24_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research 21, 140 (2020), 1-67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_2_26_1","volume-title":"Measuring Style Similarity in Diffusion Models. arXiv preprint arXiv:2404.01292","author":"Somepalli Gowthami","year":"2024","unstructured":"Gowthami Somepalli, Anubhav Gupta, Kamal Gupta, Shramay Palta, Micah Goldblum, Jonas Geiping, Abhinav Shrivastava, and Tom Goldstein. 2024. Measuring Style Similarity in Diffusion Models. arXiv preprint arXiv:2404.01292 (2024)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2017.8296985"},{"key":"e_1_3_2_2_28_1","volume-title":"TREC: Experiment and evaluation in information retrieval.","author":"Harman Donna K","year":"2005","unstructured":"EllenMVoorhees, Donna K Harman, et al. 2005. TREC: Experiment and evaluation in information retrieval. Vol. 63. Citeseer."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Sheng-Yu Wang Alexei A. Efros Jun-Yan Zhu and Richard Zhang. 2023. Evaluating Data Attribution for Text-to-Image Models. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00661"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00706"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.136"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02091"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2013.2265675"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00978"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.244"}],"event":{"name":"KDD '25: The 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Toronto ON Canada","acronym":"KDD '25","sponsor":["SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 31st ACM SIGKDD Conference on Knowledge Discovery and Data Mining V.2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3711896.3737062","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T18:19:44Z","timestamp":1777573184000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3711896.3737062"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,3]]},"references-count":35,"alternative-id":["10.1145\/3711896.3737062","10.1145\/3711896"],"URL":"https:\/\/doi.org\/10.1145\/3711896.3737062","relation":{},"subject":[],"published":{"date-parts":[[2025,8,3]]},"assertion":[{"value":"2025-08-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}