{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T11:45:42Z","timestamp":1762429542150,"version":"3.37.3"},"reference-count":34,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,10,16]],"date-time":"2022-10-16T00:00:00Z","timestamp":1665878400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,10,16]],"date-time":"2022-10-16T00:00:00Z","timestamp":1665878400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,10,16]]},"DOI":"10.1109\/icip46576.2022.9897766","type":"proceedings-article","created":{"date-parts":[[2022,11,3]],"date-time":"2022-11-03T21:27:24Z","timestamp":1667510844000},"page":"3656-3661","source":"Crossref","is-referenced-by-count":23,"title":["VLCAP: Vision-Language with Contrastive Learning for Coherent Video Paragraph Captioning"],"prefix":"10.1109","author":[{"given":"Kashu","family":"Yamazaki","sequence":"first","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sang","family":"Truong","sequence":"additional","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Khoa","family":"Vo","sequence":"additional","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"Kidd","sequence":"additional","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chase","family":"Rainwater","sequence":"additional","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Khoa","family":"Luu","sequence":"additional","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ngan","family":"Le","sequence":"additional","affiliation":[{"name":"University of Arkansas,Fayetteville,AR,USA,72701"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.18653\/v1\/P19-1285"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"ref3","article-title":"AEI: Actors-Environment Interaction with Adaptive Attention for Temporal Action Proposals Generation","author":"Vo","year":"2021","journal-title":"BMVC"},{"doi-asserted-by":"publisher","key":"ref4","DOI":"10.1109\/CVPR.2018.00911"},{"key":"ref5","first-page":"5998","article-title":"Attention is all you need","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017","author":"Vaswani"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1007\/978-3-030-01252-6_29"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref8","first-page":"1310","article-title":"On the difficulty of training recurrent neural networks","volume-title":"Proceedings of the 30th International Conference on Machine Learning, ICML 2013, Atlanta, GA, USA, 16-21 June 2013","volume":"28","author":"Pascanu"},{"year":"2001","author":"Hochreiter","article-title":"Gradient flow in recurrent nets: the difficulty of learning long-term dependencies","key":"ref9"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1016\/j.tics.2020.08.005"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.1109\/CVPR.2019.00676"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1109\/CVPR.2019.00674"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/ICCV48922.2021.00677"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1109\/TPAMI.2012.59"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref16","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","volume-title":"Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, December 8-13 2014","author":"Simonyan"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/CVPR.2016.213"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.1109\/ICCV.2019.00630"},{"volume-title":"7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May 6-9, 2019","author":"Hjelm","article-title":"Learning deep representations by mutual information estimation and maximization","key":"ref19"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1007\/978-3-030-58621-8_45"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref22","first-page":"7590","article-title":"Towards automatic learning of procedures from web instructional videos","volume-title":"Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence, (AAAI-18), the 30th innovative Applications of Artificial Intelligence (IAAI-18), and the 8th AAAI Symposium on Educational Advances in Artificial Intelligence (EAAI-18), New Orleans, Louisiana, USA, February 2-7, 2018","author":"Zhou"},{"volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual","author":"Ging","article-title":"COOT: cooperative hierarchical transformer for video-text representation learning","key":"ref23"},{"doi-asserted-by":"publisher","key":"ref24","DOI":"10.1109\/ICCV48922.2021.00209"},{"year":"2021","author":"Yang","article-title":"CLIP Meets Video Captioners: Attribute-Aware Representation Learning Promotes Accurate Captioning","key":"ref25"},{"key":"ref26","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of Machine Learning Research","volume":"139","author":"Radford"},{"volume-title":"9th International Conference on Learning Representations, ICLR 2021, Virtual Event","author":"Dosovitskiy","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"ref27"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/TMM.2020.3003592"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1007\/978-3-030-58577-8_7"},{"doi-asserted-by":"publisher","key":"ref30","DOI":"10.3115\/1073083.1073135"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.3115\/v1\/W14-3348"},{"doi-asserted-by":"publisher","key":"ref32","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref33","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","volume-title":"Text Summarization Branches Out","author":"Lin"},{"doi-asserted-by":"publisher","key":"ref34","DOI":"10.1109\/ICCV.2017.445"}],"event":{"name":"2022 IEEE International Conference on Image Processing (ICIP)","start":{"date-parts":[[2022,10,16]]},"location":"Bordeaux, France","end":{"date-parts":[[2022,10,19]]}},"container-title":["2022 IEEE International Conference on Image Processing (ICIP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9897158\/9897159\/09897766.pdf?arnumber=9897766","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,22]],"date-time":"2024-01-22T21:16:41Z","timestamp":1705958201000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9897766\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,10,16]]},"references-count":34,"URL":"https:\/\/doi.org\/10.1109\/icip46576.2022.9897766","relation":{},"subject":[],"published":{"date-parts":[[2022,10,16]]}}}