{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T14:40:57Z","timestamp":1777128057561,"version":"3.51.4"},"reference-count":103,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"Key Program of the National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U20B2069"],"award-info":[{"award-number":["U20B2069"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2024,10]]},"DOI":"10.1109\/tcsvt.2024.3399933","type":"journal-article","created":{"date-parts":[[2024,5,13]],"date-time":"2024-05-13T17:41:40Z","timestamp":1715622100000},"page":"9718-9731","source":"Crossref","is-referenced-by-count":12,"title":["EvCap: Element-Aware Video Captioning"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9608-0524","authenticated-orcid":false,"given":"Sheng","family":"Liu","sequence":"first","affiliation":[{"name":"State Key Laboratory of Virual Reality Technology and Systems, School of Computer Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3497-5052","authenticated-orcid":false,"given":"Annan","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Virual Reality Technology and Systems, School of Computer Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuwei","family":"Zhao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Virual Reality Technology and Systems, School of Computer Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiahao","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies Inc., Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8001-2703","authenticated-orcid":false,"given":"Yunhong","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Virual Reality Technology and Systems, School of Computer Science and Engineering, Beihang University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01741"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01743"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3165934"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2936526"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1023\/A:1020346032608"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1173"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1038\/323533a0"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v27i1.8679"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16353"},{"key":"ref11","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"28","author":"Ren"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref13","first-page":"102","article-title":"Video in sentences out","volume-title":"Proc. 28th Conf. Uncertainty Artif. Intell.","author":"Barbu"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/2324796.2324799"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540235"},{"key":"ref16","first-page":"27","article-title":"Describing video contents in natural language","volume-title":"Proc. Workshop Innov. Hybrid Approaches Process. Textual Data","author":"Khan"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.340"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16421"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-26316-3_37"},{"key":"ref21","first-page":"2579","article-title":"Visualizing data using t-SNE","volume":"9","author":"van der Maaten","year":"2008","journal-title":"J. Mach. Learn. Res."},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.23915\/distill.00007"},{"key":"ref24","first-page":"12","article-title":"Learning distributed representations of concepts","volume-title":"Proc. 8th Conf. Cognit. Sci. Soc.","author":"Hinton"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33863-2_37"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2011.6130425"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3177320"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3058626"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3045735"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/381"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00693"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3232634"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2921655"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984066"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123354"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00852"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2988435"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01109"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3131721"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107702"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00971"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3158546"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2956593"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298188"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.4324\/9781410605337-29"},{"key":"ref51","article-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Dosovitskiy"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref53","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. 38th Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2017\/239"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2010.127"},{"key":"ref62","article-title":"Gaussian error linear units (GELUs)","author":"Hendrycks","year":"2016","journal-title":"arXiv:1606.08415"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3079209"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1038\/s41592-018-0019-x"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1162\/089120103321337458"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN.1992.227257"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984064"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01277"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00273"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2940007"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01311"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_20"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551581"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109202"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01816"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25484"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2022.109204"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"ref80","article-title":"Support-set bottlenecks for video-text representation learning","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Patrick"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_27"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref85","first-page":"937","article-title":"Value: A multi-task benchmark for video-and-language understanding evaluation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Li"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3159472"},{"key":"ref87","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proc. 49th Annu. Meeting Assoc. Comput. Linguistics, Hum. Lang. Technol.","author":"Chen"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref92","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proc. ACL Workshop Intrinsic Extrinsic Eval. Measures Mach. Transl. Summarization","author":"Banerjee"},{"key":"ref93","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Proc. Text Summarization Branches Out","author":"Lin"},{"key":"ref94","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"Chen","year":"2015","journal-title":"arXiv:1504.00325"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.2307\/j.ctvcm4g18.8"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref97","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017","journal-title":"arXiv:1705.06950"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref100","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Kingma"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01740"},{"key":"ref102","first-page":"3320","article-title":"How transferable are features in deep neural networks?","volume-title":"Proc. 27th Int. Conf. Neural Inf. Process. Syst.","author":"Yosinski"},{"key":"ref103","article-title":"ChatGPT asks, BLIP-2 answers: Automatic questioning towards enriched visual descriptions","author":"Zhu","year":"2023","journal-title":"arXiv:2303.06594"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/76\/10739868\/10529278.pdf?arnumber=10529278","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,9]],"date-time":"2025-01-09T20:13:40Z","timestamp":1736453620000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10529278\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10]]},"references-count":103,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2024.3399933","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"value":"1051-8215","type":"print"},{"value":"1558-2205","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10]]}}}