{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T14:24:39Z","timestamp":1766067879165,"version":"build-2065373602"},"reference-count":67,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T00:00:00Z","timestamp":1714521600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T00:00:00Z","timestamp":1714521600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,1]],"date-time":"2024-05-01T00:00:00Z","timestamp":1714521600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001381","name":"National Research Foundation, Singapore","doi-asserted-by":"crossref","award":["NRF-NRFF13-2021-0008"],"award-info":[{"award-number":["NRF-NRFF13-2021-0008"]}],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Mike Zheng Shou&#x0027;s Start-Up"},{"DOI":"10.13039\/501100001459","name":"Singapore Ministry of Education","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100001459","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2024,5]]},"DOI":"10.1109\/tpami.2023.3343736","type":"journal-article","created":{"date-parts":[[2023,12,18]],"date-time":"2023-12-18T19:36:04Z","timestamp":1702928164000},"page":"3406-3421","source":"Crossref","is-referenced-by-count":7,"title":["Enhancing Visual Grounding in Vision-Language Pre-Training With Position-Guided Text Prompts"],"prefix":"10.1109","volume":"46","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6127-9146","authenticated-orcid":false,"given":"Alex Jinpeng","family":"Wang","sequence":"first","affiliation":[{"name":"Show Lab, National University of Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3400-8943","authenticated-orcid":false,"given":"Pan","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Computing and Information Systems, Singapore Management University, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7681-2166","authenticated-orcid":false,"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[{"name":"Show Lab, National University of Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8906-3777","authenticated-orcid":false,"given":"Shuicheng","family":"Yan","sequence":"additional","affiliation":[{"name":"Sea AI Lab, Singapore"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"article-title":"Flamingo: A visual language model for few-shot learning","year":"2022","author":"Alayrac","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"article-title":"OpenFlamingo: An open-source framework for training large autoregressive vision-language models","year":"2023","author":"Awadalla","key":"ref5"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref7","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"article-title":"Microsoft COCO captions: Data collection and evaluation server","year":"2015","author":"Chen","key":"ref9"},{"key":"ref10","first-page":"104","article-title":"Uniter: Learning universal image-text representations","author":"Chen","year":"2020","journal-title":"Proc. Eur. Conf. Comput. Vis."},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00359"},{"article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref12"},{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","year":"2020","author":"Dosovitskiy","key":"ref13"},{"article-title":"DataComp: In search of the next generation of multimodal datasets","year":"2023","author":"Gadre","key":"ref14"},{"key":"ref15","first-page":"6616","article-title":"Large-scale adversarial training for vision-and-language representation learning","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Gan"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01745"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01278"},{"article-title":"Openclip","year":"2021","author":"Ilharco","key":"ref18"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.149"},{"key":"ref20","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01832"},{"key":"ref22","first-page":"5583","article-title":"Vilt: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.730"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref27","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","journal-title":"Proc. Int. Conf. Mach. Learn."},{"key":"ref28","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00217"},{"key":"ref35","first-page":"13","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Lu"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref37","first-page":"1143","article-title":"Im2Text: Describing images using 1 million captioned photographs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ordonez"},{"key":"ref38","first-page":"8026","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Paszke"},{"article-title":"ImageBERT: Cross-modal pre-training with large-scale weak-supervised image-text data","year":"2020","author":"Qi","key":"ref39"},{"key":"ref40","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"issue":"140","key":"ref41","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"J. Mach. Learn. Res."},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2577031"},{"key":"ref43","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Schuhmann"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"article-title":"VL-BERT: Pre-training of generic visual-linguistic representations","year":"2019","author":"Su","key":"ref45"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/p19-1644"},{"article-title":"LLaMA: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref47"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00638"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02226"},{"article-title":"GIT: A generative image-to-text transformer for vision and language","year":"2022","author":"Wang","key":"ref51"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52729.2023.01838"},{"article-title":"SimVLM: Simple visual language model pretraining with weak supervision","year":"2021","author":"Wang","key":"ref54"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123427"},{"article-title":"FILIP: Fine-grained interactive language-image pre-training","year":"2021","author":"Yao","key":"ref56"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2024.01.004"},{"article-title":"COCA: Contrastive captioners are image-text foundation models","year":"2022","author":"Yu","key":"ref58"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46475-6_5"},{"article-title":"Florence: A new foundation model for computer vision","year":"2021","author":"Yuan","key":"ref60"},{"article-title":"Multi-grained vision language pre-training: Aligning texts with visual concepts","year":"2021","author":"Zeng","key":"ref61"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr46437.2021.00553"},{"article-title":"OPT: Open pre-trained transformer language models","year":"2022","author":"Zhang","key":"ref63"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"},{"article-title":"Multimodal C4: An open, billion-scale corpus of images interleaved with text","year":"2023","author":"Zhu","key":"ref67"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/10490207\/10363674.pdf?arnumber=10363674","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,9]],"date-time":"2024-04-09T19:36:04Z","timestamp":1712691364000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10363674\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5]]},"references-count":67,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2023.3343736","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"type":"print","value":"0162-8828"},{"type":"electronic","value":"2160-9292"},{"type":"electronic","value":"1939-3539"}],"subject":[],"published":{"date-parts":[[2024,5]]}}}