{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:19:03Z","timestamp":1761895143298,"version":"build-2065373602"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/icme59968.2025.11209197","type":"proceedings-article","created":{"date-parts":[[2025,10,30]],"date-time":"2025-10-30T17:57:42Z","timestamp":1761847062000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["TEVLA: Text-oriented Enhancement for Vision-Language Alignment in Relation Extraction"],"prefix":"10.1109","author":[{"given":"Junlin","family":"Chen","sequence":"first","affiliation":[{"name":"HKUST"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiushan","family":"Guo","sequence":"additional","affiliation":[{"name":"ByteDance"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ka Chun","family":"Cheung","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingrui","family":"Liang","sequence":"additional","affiliation":[{"name":"Rice University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dezhi","family":"Chen","sequence":"additional","affiliation":[{"name":"HKMU"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02519"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72946-1_7"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i17.29873"},{"key":"ref5","article-title":"Visualbert: A simple and performant baseline for vision and language","volume":"abs\/1908.03557","author":"Li","year":"2019"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17687"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.findings-naacl.121"},{"article-title":"Filip: Fine-grained interactive language-image pre-training","year":"2021","author":"Yao","key":"ref8"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3595916.3626399"},{"article-title":"Coca: Contrastive captioners are image-text foundation models","year":"2022","author":"Yu","key":"ref10"},{"key":"ref11","first-page":"27412","article-title":"Detecting corrupted labels without training a model to predict","volume-title":"International conference on machine learning","author":"Zhu"},{"article-title":"T-MARS: Improving visual representations by circumventing text feature learning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Maini","key":"ref12"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.376"},{"article-title":"Improving CLIP training with language rewrites","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Fan","key":"ref14"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01354"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICME51207.2021.9428274"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11962"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1185"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.230"},{"key":"ref21","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International conference on machine learning","author":"Jia"},{"key":"ref22","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3476968"},{"article-title":"Visual instruction tuning","year":"2023","author":"Liu","key":"ref24"},{"article-title":"Language quantized autoencoders: Towards unsupervised text-image alignment","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Liu","key":"ref25"},{"article-title":"SPAE: Semantic pyramid autoencoder for multimodal generation with frozen LLMs","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Yu","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531992"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.12125"},{"key":"ref29","first-page":"40290","article-title":"Delving into noisy label detection with clean data","volume-title":"International Conference on Machine Learning","author":"Yu"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00292"},{"article-title":"Vdc: Versatile data cleanser for detecting dirty samples via visual- linguistic inconsistency","year":"2023","author":"Zhu","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01904"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.823"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.26"},{"article-title":"Complexity-based prompting for multi-step reasoning","volume-title":"The Eleventh International Conference on Learning Representations","author":"Fu","key":"ref35"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"article-title":"Visual instruction tuning","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems","author":"Liu","key":"ref37"},{"article-title":"CogVLM: Visual expert for large language models","year":"2024","author":"Wang","key":"ref38"}],"event":{"name":"2025 IEEE International Conference on Multimedia and Expo (ICME)","start":{"date-parts":[[2025,6,30]]},"location":"Nantes, France","end":{"date-parts":[[2025,7,4]]}},"container-title":["2025 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11208895\/11208897\/11209197.pdf?arnumber=11209197","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:44:25Z","timestamp":1761889465000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11209197\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/icme59968.2025.11209197","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}