{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:40:16Z","timestamp":1781584816421,"version":"3.54.5"},"reference-count":98,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"7","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000923","name":"Australian Research Council","doi-asserted-by":"publisher","award":["DP200103223"],"award-info":[{"award-number":["DP200103223"]}],"id":[{"id":"10.13039\/501100000923","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CRC-P Smart Material Recovery Facility"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1109\/tpami.2024.3410329","type":"journal-article","created":{"date-parts":[[2024,9,9]],"date-time":"2024-09-09T17:46:40Z","timestamp":1725904000000},"page":"5223-5237","source":"Crossref","is-referenced-by-count":4,"title":["Cap4Video++: Enhancing Video Understanding With Auxiliary Captions"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8511-743X","authenticated-orcid":false,"given":"Wenhao","family":"Wu","sequence":"first","affiliation":[{"name":"School of Computer Science, The University of Sydney, Camperdown, NSW, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6206-7911","authenticated-orcid":false,"given":"Xiaohan","family":"Wang","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9625-2588","authenticated-orcid":false,"given":"Haipeng","family":"Luo","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4888-4445","authenticated-orcid":false,"given":"Jingdong","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Vision Technology, Baidu Inc., Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0512-880X","authenticated-orcid":false,"given":"Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"College of Computer Science and Technology, Zhejiang University, Zhejiang, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9163-2761","authenticated-orcid":false,"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref2","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Jia"},{"key":"ref3","article-title":"CoCa: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022"},{"key":"ref4","article-title":"Florence: A new foundation model for computer vision","author":"Yuan","year":"2021"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00504"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref9","article-title":"CLIP2TV: An empirical study on transformer-based methods for video-text retrieval","author":"Gao","year":"2021"},{"key":"ref10","article-title":"Disentangled representation learning for text-video retrieval","author":"Wang","year":"2022"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"ref12","article-title":"CLIP2video: Mastering video-text retrieval via image CLIP","author":"Fang","year":"2021"},{"key":"ref13","article-title":"HunYuan_tvr for text-video retrivial","author":"Min","year":"2022"},{"key":"ref14","article-title":"ST-Adapter: Parameter-efficient image-to-video transfer learning for action recognition","author":"Pan","year":"2022"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_23"},{"key":"ref16","article-title":"ActionCLIP: A new paradigm for video action recognition","author":"Wang","year":"2021"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25386"},{"key":"ref20","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI blog"},{"key":"ref21","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3122865.3122867"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"ref27","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"ref28","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref31","article-title":"A short note about kinetics-600","author":"Carreira","year":"2018"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01739"},{"key":"ref34","article-title":"ChatGPT","year":"2023"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref36","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref37","article-title":"Use what you have: Video retrieval using representations from collaborative experts","author":"Liu","year":"2019"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/154"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00513"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01262"},{"key":"ref47","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Simonyan"},{"key":"ref48","first-page":"3468","article-title":"Spatiotemporal residual networks for video action recognition","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Feichtenhofer"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16401"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475344"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_40"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_42"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW50498.2020.00346"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00632"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01594"},{"key":"ref65","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref66","first-page":"15908","article-title":"Transformer in transformer","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Han"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01261"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01876-w"},{"key":"ref74","article-title":"FILIP: Fine-grained interactive language-image pre-training","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yao"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401075"},{"key":"ref76","first-page":"13","article-title":"VilBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"ref77","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Kingma"},{"key":"ref78","article-title":"Accurate, large minibatch SGD: Training imagenet in 1 hour","author":"Goyal","year":"2017"},{"key":"ref79","article-title":"Improving video-text retrieval by multi-stream corpus alignment and dual softmax loss","author":"Cheng","year":"2021"},{"key":"ref80","article-title":"Support-set bottlenecks for video-text representation learning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Patrick"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01569"},{"key":"ref82","article-title":"Text-adaptive multiple visual prototype matching for video-text retrieval","author":"Lin","year":"2022"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00813"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref87","article-title":"TokenLearner: What can 8 learned tokens do for images and videos?","author":"Ryoo","year":"2021"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00333"},{"key":"ref89","article-title":"Co-training transformer with videos and images improves action recognition","author":"Zhang","year":"2021"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01338"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00047"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018303"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00467"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16276"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01935"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00640"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.97"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01179"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11026037\/10670217.pdf?arnumber=10670217","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T04:14:38Z","timestamp":1749183278000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10670217\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":98,"journal-issue":{"issue":"7"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2024.3410329","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7]]}}}