{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,6]],"date-time":"2026-02-06T04:10:00Z","timestamp":1770351000533,"version":"3.49.0"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,7,15]],"date-time":"2024-07-15T00:00:00Z","timestamp":1721001600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,7,15]]},"DOI":"10.1109\/icme57554.2024.10688227","type":"proceedings-article","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T17:24:16Z","timestamp":1727717056000},"page":"1-6","source":"Crossref","is-referenced-by-count":1,"title":["VG-Annotator: Vision-Language Models as Query Annotators for Unsupervised Visual Grounding"],"prefix":"10.1109","author":[{"given":"Jiabo","family":"Ye","sequence":"first","affiliation":[{"name":"East China Normal University"}]},{"given":"Junfeng","family":"Tian","sequence":"additional","affiliation":[{"name":"Nyonic.ai"}]},{"given":"Xiaoshan","family":"Yang","sequence":"additional","affiliation":[{"name":"CASIA"}]},{"given":"Zhenru","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Anwen","family":"Hu","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Ming","family":"Yan","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Ji","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group"}]},{"given":"Liang","family":"He","sequence":"additional","affiliation":[{"name":"East China Normal University"}]},{"given":"Xin","family":"Lin","sequence":"additional","affiliation":[{"name":"East China Normal University"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.540"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01014"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00387"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3039522"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01499"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16452"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475313"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00928"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01387"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00269"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00556"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3058684"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00641"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00476"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01507"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"ref21","article-title":"Ernie-vilg: Unified generative pre-training for bidirectional vision-language generation","author":"Zhang","year":"2021"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"ref23","article-title":"Coca: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2024.01.004"},{"key":"ref25","article-title":"Coca: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"key":"ref26","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. of NeurIPS","author":"Li"},{"key":"ref27","article-title":"Unifying vision-and-language tasks via text generation","volume-title":"Proc. of ICML","author":"Cho"},{"key":"ref28","article-title":"Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. of ICML","author":"Wang"},{"key":"ref29","article-title":"Pali: A jointly-scaled multilingual language-image model","author":"Chen","year":"2022"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2010.11929"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.41"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.33540\/2168"},{"key":"ref33","article-title":"mplug-owl: Modularization empowers large language models with multimodality","author":"Ye","year":"2023"},{"key":"ref34","article-title":"Visual prompting: Modifying pixel space to adapt pre-trained models","author":"Bahng","year":"2022"},{"key":"ref35","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. of ICML","author":"Radford"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00270"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351074"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1086"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"}],"event":{"name":"2024 IEEE International Conference on Multimedia and Expo (ICME)","location":"Niagara Falls, ON, Canada","start":{"date-parts":[[2024,7,15]]},"end":{"date-parts":[[2024,7,19]]}},"container-title":["2024 IEEE International Conference on Multimedia and Expo (ICME)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10685847\/10687354\/10688227.pdf?arnumber=10688227","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T06:40:49Z","timestamp":1727764849000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10688227\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7,15]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/icme57554.2024.10688227","relation":{},"subject":[],"published":{"date-parts":[[2024,7,15]]}}}