{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:27:53Z","timestamp":1761895673720,"version":"build-2065373602"},"reference-count":24,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/icme59968.2025.11210028","type":"proceedings-article","created":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T17:57:42Z","timestamp":1761847062000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["EPIC: Efficient Prompt Interaction for Text-Image Classification"],"prefix":"10.1109","author":[{"given":"Xinyao","family":"Yu","sequence":"first","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]},{"given":"Hao","family":"Sun","sequence":"additional","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]},{"given":"Zeyu","family":"Ling","sequence":"additional","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]},{"given":"Ziwei","family":"Niu","sequence":"additional","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]},{"given":"Zhenjia","family":"Bai","sequence":"additional","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]},{"given":"Rui","family":"Qin","sequence":"additional","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]},{"given":"Yen-Wei","family":"Chen","sequence":"additional","affiliation":[{"name":"Ritsumeikan University,College of Information Science,Japan"}]},{"given":"Lanfen","family":"Lin","sequence":"additional","affiliation":[{"name":"Zhejiang University,College of Computer Science and Technology,Hangzhou,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP46576.2022.9897323"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3449791"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428321"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1115"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413678"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-11024-6_44"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-acl.234"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00256"},{"article-title":"Supervised multimodal bitransformers for classifying images and text","year":"2019","author":"Kiela","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01832"},{"article-title":"Huse: Hierarchical universal semantic embeddings","year":"2019","author":"Narayana","key":"ref15"},{"article-title":"Multibench: Multiscale benchmarks for multimodal representation learning","year":"2021","author":"Liang","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00256"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"ref19","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning","volume":"139","author":"Kim"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICMEW.2015.7169757"},{"article-title":"Gated multimodal units for information fusion","year":"2017","author":"Arevalo","key":"ref21"},{"article-title":"Visual entailment task for visually-grounded language learning","year":"2018","author":"Xie","key":"ref22"},{"key":"ref23","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"author":"Li","key":"ref24","article-title":"Visualbert: A simple and performant baseline for vision and language"}],"event":{"name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2025,6,30]]},"location":"Nantes, France","end":{"date-parts":[[2025,7,4]]}},"container-title":["2025 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11208895\/11208897\/11210028.pdf?arnumber=11210028","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:55:36Z","timestamp":1761890136000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11210028\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":24,"URL":"https:\/\/doi.org\/10.1109\/icme59968.2025.11210028","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}