{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T18:27:14Z","timestamp":1780511234264,"version":"3.54.1"},"reference-count":205,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,1]],"date-time":"2024-09-01T00:00:00Z","timestamp":1725148800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62101451"],"award-info":[{"award-number":["62101451"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100015401","name":"Key Research and Development Projects of Shaanxi Province","doi-asserted-by":"publisher","award":["2024GX-YBXM-117"],"award-info":[{"award-number":["2024GX-YBXM-117"]}],"id":[{"id":"10.13039\/501100015401","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100021171","name":"Basic and Applied Basic Research Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2024A1515011394"],"award-info":[{"award-number":["2024A1515011394"]}],"id":[{"id":"10.13039\/501100021171","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Post-Doctoral Innovation Talent Support Program","award":["BX20230498"],"award-info":[{"award-number":["BX20230498"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2024,9]]},"DOI":"10.1109\/tcsvt.2024.3376373","type":"journal-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T20:24:07Z","timestamp":1710793447000},"page":"7803-7819","source":"Crossref","is-referenced-by-count":14,"title":["An Overview of Text-Based Person Search: Recent Advances and Future Directions"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7997-9930","authenticated-orcid":false,"given":"Kai","family":"Niu","sequence":"first","affiliation":[{"name":"National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, School of Computer Science, Northwestern Polytechnical University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-5502-7848","authenticated-orcid":false,"given":"Yanyi","family":"Liu","sequence":"additional","affiliation":[{"name":"National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, School of Computer Science, Northwestern Polytechnical University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuzhou","family":"Long","sequence":"additional","affiliation":[{"name":"National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, School of Computer Science, Northwestern Polytechnical University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8239-7229","authenticated-orcid":false,"given":"Yan","family":"Huang","sequence":"additional","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5224-8647","authenticated-orcid":false,"given":"Liang","family":"Wang","sequence":"additional","affiliation":[{"name":"National Laboratory of Pattern Recognition, Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2977-8057","authenticated-orcid":false,"given":"Yanning","family":"Zhang","sequence":"additional","affiliation":[{"name":"National Engineering Laboratory for Integrated Aero-Space-Ground-Ocean Big Data Application Technology, School of Computer Science, Northwestern Polytechnical University, Xi&#x2019;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.551"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_42"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2984883"},{"key":"ref4","article-title":"Semantically self-aligned network for text-to-image part-aware person re-identification","author":"Ding","year":"2021","journal-title":"arXiv:2107.12666"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.081"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547753"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3073718"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2006.891352"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2019.2900907"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3013269"},{"key":"ref11","article-title":"A comprehensive survey on cross-modal retrieval","author":"Wang","year":"2016","journal-title":"arXiv:1607.06215"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/759"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01064"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00586"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2705068"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3235523"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3172716"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3203247"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3186714"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3263054"},{"key":"ref22","article-title":"Person re-identification: Past, present and future","author":"Zheng","year":"2016","journal-title":"arXiv:1610.02984"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2898940"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/692"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3054775"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00924"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00719"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3128214"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3002956"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3043026"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3205216"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475369"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3627631.3627648"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350991"},{"key":"ref36","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. NeurIPS","author":"Ren"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/s11280-021-00953-9"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240509"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.204"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.214"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00363"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.26599\/TST.2018.9010100"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2021.102039"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00763"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413864"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.3115\/1225403.1225421"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-93046-2_53"},{"key":"ref50","first-page":"3","article-title":"Fast exact inference with a factored model for natural language parsing","volume-title":"Proc. NeurIPS","author":"Klein"},{"key":"ref51","volume-title":"Textblob Documentation","author":"Loria","year":"2018"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3225754"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413895"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/p14-5010"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1162"},{"key":"ref56","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. ICLR","author":"Simonyan"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.209"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/j.neunet.2005.06.042"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1145\/3383184"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104168"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i4.20370"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_3"},{"key":"ref66","article-title":"Graph attention networks","author":"Veli\u010d kovi\u0107","year":"2017","journal-title":"arXiv:1710.10903"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TNN.2008.2005605"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00710"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2978386"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3465055"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.03.091"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2972168"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TNN.2008.2010350"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-93417-4_38"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICNLP55136.2022.00078"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3048627"},{"key":"ref77","first-page":"11592","article-title":"Optimization of graph neural networks: Implicit acceleration by skip connections and more depth","volume-title":"Proc. ICML","author":"Xu"},{"key":"ref78","article-title":"Person text-image matching via text-feature interpretability embedding and external attack node implantation","author":"Li","year":"2022","journal-title":"arXiv:2211.08657"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/148"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-021-06734-9"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-19-2266-4_21"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-18907-4_36"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref85","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","volume-title":"Proc. NAACL","author":"Devlin"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1145\/3591106.3592253"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611768"},{"key":"ref88","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. ICML","author":"Radford"},{"key":"ref89","article-title":"Text-based person search with limited data","volume-title":"Proc. BMVC","author":"Han"},{"key":"ref90","article-title":"CLIP-based synergistic knowledge transfer for text-based person retrieval","author":"Liu","year":"2023","journal-title":"arXiv:2309.09496"},{"key":"ref91","article-title":"PLIP: Language-image pre-training for person representation learning","author":"Zuo","year":"2023","journal-title":"arXiv:2305.08386"},{"key":"ref92","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. ICLR","author":"Dosovitskiy"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9746846"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/LSP.2022.3217682"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548028"},{"key":"ref97","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. ICML","author":"Touvron"},{"key":"ref98","article-title":"Calibrating cross-modal features for text-based person searching","author":"Wei","year":"2023","journal-title":"arXiv:2304.02278"},{"key":"ref99","article-title":"CLIP-driven fine-grained text-image person re-identification","author":"Yan","year":"2022","journal-title":"arXiv:2210.10276"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00273"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref102","article-title":"VL-BERT: Pre-training of generic visual-linguistic representations","volume-title":"Proc. ICLR","author":"Su"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-25072-9_42"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2022.108891"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3273719"},{"key":"ref109","article-title":"Contextual non-local alignment over full-scale representation for text-based person search","author":"Gao","year":"2021","journal-title":"arXiv:2101.03036"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_4"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3337653"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/IJCB48548.2020.9304940"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_42"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3285426"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"key":"ref116","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58610-2_24"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/WACV45572.2020.9093640"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP49359.2023.10222660"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2023.3329220"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/62"},{"key":"ref123","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized statistical models","volume-title":"Proc. Int. Conf. Artif. Intell. Statist.","author":"Gutmann"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i1.27801"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2019.8682456"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1145\/3444685.3446314"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969125"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00591"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.105419"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1312.6114"},{"key":"ref131","article-title":"NICE: Non-linear independent components estimation","volume-title":"Proc. ICLR","author":"Dinh"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref133","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. NeurIPS","author":"Ho"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00251"},{"issue":"8","key":"ref135","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref136","first-page":"1","article-title":"Evaluating appearance models for recognition, reacquisition, and tracking","volume-title":"Proc. PETS","author":"Gray"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-37331-2_3"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.27"},{"key":"ref139","article-title":"Person re-identification meets image search","author":"Zheng","year":"2015","journal-title":"arXiv:1502.02171"},{"key":"ref140","article-title":"Joint detection and identification feature learning for person search","author":"Xiao","year":"2016","journal-title":"arXiv:1604.01850"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00016"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1145\/3460426.3463652"},{"key":"ref143","article-title":"MobileNets: Efficient convolutional neural networks for mobile vision applications","author":"Howard","year":"2017","journal-title":"arXiv:1704.04861"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548057"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_30"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/d14-1181"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.10.020"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00208"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01120"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP49359.2023.10222570"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548166"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01452"},{"key":"ref155","article-title":"Exploiting the textual potential from vision-language pre-training for text-based person search","author":"Wang","year":"2023","journal-title":"arXiv:2303.04497"},{"key":"ref156","article-title":"Unleashing the imagination of text: A novel framework for text-to-image person retrieval via exploring the power of words","author":"Liu","year":"2023","journal-title":"arXiv:2307.09059"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611709"},{"key":"ref158","first-page":"9694","article-title":"Align before fuse: Vision and language representation learning with momentum distillation","volume-title":"Proc. NeurIPS","author":"Li"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref160","first-page":"2414","article-title":"Modular networks: Learning to decompose neural computation","volume-title":"Proc. NeurIPS","author":"Kirsch"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00915"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3185487"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612285"},{"key":"ref164","article-title":"A survey on multimodal large language models","author":"Yin","year":"2023","journal-title":"arXiv:2306.13549"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/BigData59044.2023.10386743"},{"key":"ref166","article-title":"LLM4Drive: A survey of large language models for autonomous driving","author":"Yang","year":"2023","journal-title":"arXiv:2311.01043"},{"key":"ref167","article-title":"MiniGPT-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv:2304.10592"},{"key":"ref168","article-title":"Gemini: A family of highly capable multimodal models","author":"Team","year":"2023","journal-title":"arXiv:2312.11805"},{"key":"ref169","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023","journal-title":"arXiv:2301.12597"},{"key":"ref170","article-title":"ChatGPT asks, BLIP-2 answers: Automatic questioning towards enriched visual descriptions","author":"Zhu","year":"2023","journal-title":"arXiv:2303.06594"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3085907"},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.1145\/3606041.3618058"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1016\/j.imavis.2021.104332"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00712"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00173"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654965"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.357"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01197"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00225"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1410-8"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2022.01.001"},{"key":"ref183","article-title":"Learning transferable pedestrian representation from multimodal information supervision","author":"Bao","year":"2023","journal-title":"arXiv:2304.05554"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.61969\/jai.1337500"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1503.02531"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00489"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01453-z"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3178485"},{"key":"ref191","article-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and Huffman coding","volume-title":"Proc. ICLR","author":"Han"},{"key":"ref192","article-title":"A survey of model compression and acceleration for deep neural networks","author":"Cheng","year":"2017","journal-title":"arXiv:1710.09282"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-020-09816-7"},{"key":"ref194","first-page":"5583","article-title":"ViLT: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. ICML","author":"Kim"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19945"},{"key":"ref196","first-page":"31292","article-title":"UPop: Unified and progressive pruning for compressing vision-language transformers","volume-title":"Proc. ICML","author":"Shi"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611732"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/TIFS.2022.3224853"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00214"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.575"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1016\/j.inffus.2023.101988"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1145\/1459352.1459355"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1006\/jmla.2001.2810"},{"key":"ref204","first-page":"32897","article-title":"VLMo: Unified vision-language pre-training with mixture-of-modality-experts","volume-title":"Proc. NeurIPS","author":"Bao"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00700"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/10700029\/10466731.pdf?arnumber=10466731","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T17:33:12Z","timestamp":1727803992000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10466731\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9]]},"references-count":205,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2024.3376373","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9]]}}}