{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:24:47Z","timestamp":1778171087696,"version":"3.51.4"},"reference-count":80,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172398"],"award-info":[{"award-number":["62172398"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guangzhou Basic and Applied Basic Research Foundation","award":["2024A04J6462"],"award-info":[{"award-number":["2024A04J6462"]}]},{"name":"Guangzhou Basic and Applied Basic Research Foundation","award":["2023A03J0142"],"award-info":[{"award-number":["2023A03J0142"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Visual. Comput. Graphics"],"published-print":{"date-parts":[[2025,1]]},"DOI":"10.1109\/tvcg.2024.3456387","type":"journal-article","created":{"date-parts":[[2024,9,9]],"date-time":"2024-09-09T17:44:36Z","timestamp":1725903876000},"page":"294-304","source":"Crossref","is-referenced-by-count":7,"title":["ModalChorus: Visual Probing and Alignment of Multi-Modal Embeddings via Modal Fusion Map"],"prefix":"10.1109","volume":"31","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8874-5928","authenticated-orcid":false,"given":"Yilin","family":"Ye","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0262-5289","authenticated-orcid":false,"given":"Shishi","family":"Xiao","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingchen","family":"Zeng","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Zeng","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"CLIP-Interrogator"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw56347.2022.00543"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.02080"},{"key":"ref5","volume-title":"Modern multidimensional scaling: Theory and applications","author":"Borg","year":"2005"},{"key":"ref6","first-page":"543","article-title":"TopicRank: Graph-based topic ranking for keyphrase extraction","volume-title":"International Joint Conference on Natural Language Processing (IJCNLP)","author":"Bougouin"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/tvcg.2021.3138933"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1369-5"},{"key":"ref9","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"Proc. ICML","author":"Chen"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/tvcg.2015.2467552"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.3390\/pr5040075"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.23919\/FUSION52260.2023.10224157"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611972825.16"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3586183.3606777"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3288356"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/vl\/N19-142"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/VAST.2011.6102449"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/BF00038687"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3327168"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3114807"},{"key":"ref21","article-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion","volume-title":"Proceedings of International Conference on Learning Representations","author":"Gal"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27955"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref25","article-title":"Euclidean embedding of co-occurrence data","volume-title":"Proc. NIPS","volume":"17","author":"Globerson"},{"key":"ref26","first-page":"381","article-title":"Embedding heterogeneous data by preserving multiple kernels","volume-title":"Proc. ECAI","author":"G\u00f6nen"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/tvcg.2020.3030350"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2024.3408255"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3114855"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.13417"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2020.3045918"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/VISUAL.1997.663916"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1002\/wcs.1203"},{"key":"ref35","article-title":"LoRA: Low-rank adaptation of large language models","volume-title":"Proceedings of International Conference on Learning Representations","author":"Hu"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02244"},{"key":"ref37","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. ICML","author":"Jia"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1186\/1471-2105-4-48"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref40","author":"Krizhevsky","year":"2009","journal-title":"Learning multiple layers of features from tiny images"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3031549"},{"key":"ref42","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. ICML","author":"Li"},{"key":"ref43","first-page":"17612","article-title":"Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning","volume-title":"Proc. NIPS","volume":"35","author":"Liang"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2017.2745141"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.13672"},{"key":"ref47","article-title":"VilBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. NIPS","volume":"32","author":"Lu"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1080\/10867651.2002.10487551"},{"key":"ref49","first-page":"1","article-title":"Distributed representations of words and phrases and their compositionality","volume-title":"Proc. NIPS","volume":"26","author":"Mikolov"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01424"},{"key":"ref51","first-page":"399","article-title":"Distributional scaling: An algorithm for structure-preserving embedding of metric and nonmetric spaces","volume":"5","author":"Quist","year":"2004","journal-title":"The Journal of Machine Learning Research"},{"key":"ref52","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. ICML","author":"Radford"},{"key":"ref53","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01042"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.11640"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00624"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2024.3364841"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3209479"},{"issue":"11","key":"ref61","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van","year":"2008","journal-title":"Journal of Machine Learning Research"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/3544548.3581127"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1016\/0169-7439(87)80084-9"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00189"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1145\/3491101.3519729"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.12.004"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3209423"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2023.3326515"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2018.2835485"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0393-x"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/j.visinf.2024.04.003"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3229023"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3641019"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3223399"},{"key":"ref76","first-page":"1","article-title":"IntentTuner: An interactive framework for integrating human intents in fine-tuning text-to-image generative models","volume-title":"Proc. ACM CHI","volume":"182","author":"Zeng"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2024.3456159"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2022.3170531"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591532"}],"container-title":["IEEE Transactions on Visualization and Computer Graphics"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/2945\/10766346\/10669771.pdf?arnumber=10669771","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:35:03Z","timestamp":1732667703000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10669771\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1]]},"references-count":80,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tvcg.2024.3456387","relation":{},"ISSN":["1077-2626","1941-0506","2160-9306"],"issn-type":[{"value":"1077-2626","type":"print"},{"value":"1941-0506","type":"electronic"},{"value":"2160-9306","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1]]}}}