{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T22:05:27Z","timestamp":1770933927305,"version":"3.50.1"},"reference-count":63,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,2,1]],"date-time":"2026-02-01T00:00:00Z","timestamp":1769904000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100012543","name":"Centrally Guided Local Science and Technology Development Special Fund of Anhui Province","doi-asserted-by":"publisher","award":["202407a12020009"],"award-info":[{"award-number":["202407a12020009"]}],"id":[{"id":"10.13039\/100012543","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62273319"],"award-info":[{"award-number":["62273319"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2026,2]]},"DOI":"10.1109\/tcsvt.2025.3613997","type":"journal-article","created":{"date-parts":[[2025,9,24]],"date-time":"2025-09-24T17:35:21Z","timestamp":1758735321000},"page":"2164-2176","source":"Crossref","is-referenced-by-count":0,"title":["Modeling Cross-Modal Semantic Transformations From Coarse to Fine in CLIP"],"prefix":"10.1109","volume":"36","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-0123-7023","authenticated-orcid":false,"given":"Ziqi","family":"Peng","sequence":"first","affiliation":[{"name":"Department of Automation, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1850-2380","authenticated-orcid":false,"given":"Zhenyu","family":"Qi","sequence":"additional","affiliation":[{"name":"Institute of Advanced Technology, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2891-4379","authenticated-orcid":false,"given":"Yang","family":"Cao","sequence":"additional","affiliation":[{"name":"Department of Automation, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8706-3252","authenticated-orcid":false,"given":"Yu","family":"Kang","sequence":"additional","affiliation":[{"name":"Department of Automation, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7583-0944","authenticated-orcid":false,"given":"Wenjun","family":"Lv","sequence":"additional","affiliation":[{"name":"Department of Automation, University of Science and Technology of China, Hefei, China"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn. 
(ICML)","author":"Radford"},{"key":"ref2","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","volume":"139","author":"Jia"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.5555\/3524938.3525087"},{"key":"ref4","article-title":"Learning to decompose visual features with latent textual prompts","volume-title":"arXiv:2210.04287","author":"Wang","year":"2022"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72390-2_60"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3282777"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3386252"},{"key":"ref9","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Houlsby"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00514"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01891-x"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_29"},{"key":"ref15","first-page":"3032","article-title":"Improving zero-shot generalization for CLIP with synthesized prompts","volume-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis. (ICCV)","author":"Wang"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3188462"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1111\/j.2517-6161.1975.tb01550.x"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1017\/cbo9780511623059"},{"key":"ref19","first-page":"1782","article-title":"Efficient gradient-based inference through transformations between Bayes nets and neural nets","volume-title":"Proc. Int. Conf. Mach. Learn.","volume":"32","author":"Kingma"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00844"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1312.6114"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.2307\/2347441"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01222"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2025.3529923"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3275382"},{"key":"ref26","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ramesh"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref28","first-page":"1530","article-title":"Variational inference with normalizing flows","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rezende"},{"key":"ref29","article-title":"Deep unsupervised clustering with Gaussian mixture variational autoencoders","author":"Dilokthanakul","year":"2016","journal-title":"arXiv:1611.02648"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00030"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ijcnn.2018.8489068"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref33","first-page":"2121","article-title":"DeViSE: A deep visual-semantic embedding model","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"26","author":"Frome"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.449"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3315133"},{"key":"ref37","article-title":"ClipCap: CLIP prefix for image captioning","author":"Mokady","year":"2021","journal-title":"arXiv:2111.09734"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_41"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01441"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3432753"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref42","article-title":"Importance weighted autoencoders","volume-title":"arXiv:1509.00519","author":"Burda","year":"2015"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2013.77"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2004.383"},{"key":"ref47","first-page":"2292","article-title":"Sinkhorn distances: Lightspeed computation of optimal transport","volume-title":"Proc. 27th Annu. Conf. Neural Inf. Process. Syst.","volume":"26","author":"Cuturi"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.461"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2918242"},{"key":"ref51","article-title":"Fine-grained visual classification of aircraft","author":"Maji","year":"2013","journal-title":"arXiv:1306.5151"},{"key":"ref52","article-title":"The caltech-UCSD birds-200\u20132011 dataset","author":"Wah","year":"2011"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27950"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1212.0402"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref57","first-page":"5389","article-title":"Do ImageNet classifiers generalize to ImageNet","volume-title":"Proc. 36th Int. Conf. Mach. Learn.","author":"Recht"},{"key":"ref58","first-page":"10506","article-title":"Learning robust global representations by penalizing local predictive power","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00823"},{"key":"ref61","first-page":"856","article-title":"Hyperspherical variational auto-encoders","volume-title":"Proc. 34th Conf. Uncertainty Artif. Intell.","author":"Davidson"},{"key":"ref62","first-page":"27896","article-title":"FineCLIP: Self-distilled region-based CLIP for better fine-grained understanding","volume-title":"Proc. Adv. Neural Inf. Process. 
Syst.","volume":"37","author":"Jing"},{"key":"ref63","article-title":"EVA-CLIP: Improved training techniques for CLIP at scale","author":"Sun","year":"2023","journal-title":"arXiv:2303.15389"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/76\/11392768\/11178074.pdf?arnumber=11178074","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T21:02:46Z","timestamp":1770930166000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11178074\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,2]]},"references-count":63,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2025.3613997","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,2]]}}}