{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,7]],"date-time":"2024-09-07T18:50:06Z","timestamp":1725735006402},"reference-count":27,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T00:00:00Z","timestamp":1698710400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100020950","name":"National Science and Technology Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100020950","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,10,31]]},"DOI":"10.1109\/apsipaasc58517.2023.10317231","type":"proceedings-article","created":{"date-parts":[[2023,11,20]],"date-time":"2023-11-20T19:07:46Z","timestamp":1700507266000},"page":"1506-1511","source":"Crossref","is-referenced-by-count":0,"title":["Exploring a CLIP-Enhanced Automated Approach for Video Description Generation"],"prefix":"10.1109","author":[{"given":"Siang-Ling","family":"Zhang","sequence":"first","affiliation":[{"name":"National Taiwan Normal University,Department of Computer Science and Information Engineering,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huai-Hsun","family":"Cheng","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University,Department of Computer Science and Information Engineering,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yen-Hsin","family":"Chen","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University,Department of Computer Science and Information Engineering,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mei-Chen","family":"Yeh","sequence":"additional","affiliation":[{"name":"National Taiwan Normal University,Department of Computer Science and Information Engineering,Taiwan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"volume-title":"Proc. of ICML","author":"Alec","article-title":"Learning transferable visual models from natural language supervision","key":"ref1"},{"year":"2021","author":"Radford","article-title":"Learning transferable visual models from natural language supervision","key":"ref2"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/ICIP46576.2022.9897766"},{"volume-title":"Proc. of ICCV","author":"Ranjay","article-title":"Dense-captioning events in videos","key":"ref4"},{"volume-title":"Proc. of CVPR","author":"Steven J","article-title":"Self-critical sequence training for image captioning","key":"ref5"},{"volume-title":"Proc. of IJCAI","author":"Gancho","article-title":"Learning to discretely compose reasoning module networks for video captioning","key":"ref6"},{"volume-title":"Reinforcement learning: An introduction","year":"2018","author":"Richard S","key":"ref7"},{"volume-title":"Proc. of ICML","author":"Junnan","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","key":"ref8"},{"volume-title":"Proc. of CVPR","author":"Marcella","article-title":"Meshed-memory transformer for image captioning","key":"ref9"},{"volume-title":"Proc. of BMVC","author":"Vladimir","article-title":"A better use of audio-visual cues: Dense video captioning with bi-modal transformer","key":"ref10"},{"volume-title":"Proc. of ICCV","author":"Max","article-title":"Frozen in time: A joint video and image encoder for end-to-end retrieval","key":"ref11"},{"volume-title":"TMLR","author":"Jianfeng","article-title":"Git: A generative image-to-text transformer for vision and language","key":"ref12"},{"volume-title":"EMNLP","author":"Chenliang","article-title":"Mplug: Effective and efficient vision-language learning by cross-modal skip-connections","key":"ref13"},{"volume-title":"Proc. of BMVC","author":"Vo","article-title":"Aei: Actors- environment interaction with adaptive attention for temporal action proposals generation","key":"ref14"},{"volume-title":"Proc. of ICLR","author":"Alexey","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"ref15"},{"volume-title":"Proc. of ICML","author":"Alec","article-title":"Learning transferable visual models from natural language supervision","key":"ref16"},{"author":"Vladimir","article-title":"Video features","key":"ref17"},{"volume-title":"Proc. of ACL","author":"Kishore","article-title":"Bleu: A method for automatic evaluation of machine translation","key":"ref18"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.3115\/v1\/W14-3348"},{"volume-title":"Proc. of IEEE CVPR","author":"Ramakrishna","article-title":"Cider: Consensus-based image description evaluation","key":"ref20"},{"volume-title":"Proc. of ECCV","author":"Yilei","article-title":"Move forward and tell: A progressive generator of video descriptions","key":"ref21"},{"volume-title":"Proc. of CVPR","author":"Luowei","article-title":"End-to-end dense video captioning with masked transformer","key":"ref22"},{"volume-title":"Proc. of CVPR","author":"Jae Sung","article-title":"Adversarial inference for multi-sentence video description","key":"ref23"},{"volume-title":"Proc. of CVPR","author":"Luowei","article-title":"Grounded video description","key":"ref24"},{"doi-asserted-by":"publisher","key":"ref25","DOI":"10.18653\/v1\/P19-1285"},{"volume-title":"Proc. of ACL","author":"Jie","article-title":"Mart: Memory-augmented recurrent transformer for coherent video paragraph captioning","key":"ref26"},{"volume-title":"Proc. of CVPR","author":"Teng","article-title":"End-to-end dense video captioning with parallel decoding","key":"ref27"}],"event":{"name":"2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)","start":{"date-parts":[[2023,10,31]]},"location":"Taipei, Taiwan","end":{"date-parts":[[2023,11,3]]}},"container-title":["2023 Asia Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10317071\/10317095\/10317231.pdf?arnumber=10317231","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T20:26:27Z","timestamp":1710361587000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10317231\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,31]]},"references-count":27,"URL":"https:\/\/doi.org\/10.1109\/apsipaasc58517.2023.10317231","relation":{},"subject":[],"published":{"date-parts":[[2023,10,31]]}}}