{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:40:25Z","timestamp":1730302825534,"version":"3.28.0"},"reference-count":42,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,12,13]],"date-time":"2022-12-13T00:00:00Z","timestamp":1670889600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,12,13]],"date-time":"2022-12-13T00:00:00Z","timestamp":1670889600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,12,13]]},"DOI":"10.1109\/vcip56404.2022.10008808","type":"proceedings-article","created":{"date-parts":[[2023,1,16]],"date-time":"2023-01-16T19:26:56Z","timestamp":1673897216000},"page":"1-5","source":"Crossref","is-referenced-by-count":0,"title":["STSI: Efficiently Mine Spatio- Temporal Semantic Information between Different Multimodal for Video Captioning"],"prefix":"10.1109","author":[{"given":"Huiyu","family":"Xiong","sequence":"first","affiliation":[{"name":"School of Information and Communication Engineering, University of Electronic Science and Technology of China,Chengdu,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lanxiao","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Information and Communication Engineering, University of Electronic Science and Technology of China,Chengdu,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.337"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11231"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref6","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","volume":"28","author":"Ren","year":"2015","journal-title":"Advances in neural information processing systems"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.117"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00901"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00852"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/104"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01088"},{"journal-title":"Categorical reparameterization with gumbel-softmax","year":"2016","author":"Jang","key":"ref14"},{"key":"ref15","first-page":"190","article-title":"Collecting highly parallel data for paraphrase evaluation","volume-title":"Proceedings of the 49th annual meeting of the association for computational linguistics: human language technologies","author":"Chen"},{"issue":"2","key":"ref16","doi-asserted-by":"crossref","first-page":"171","DOI":"10.1023\/A:1020346032608","article-title":"Natural language description of human activities from video images based on concept hierarchy of actions","volume":"50","author":"Kojima","year":"2002","journal-title":"International Journal of Computer Vision"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/n15-1173"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/2964284.2984065"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref21","first-page":"358","article-title":"Less is more: Picking in-formative frames for video captioning","volume-title":"Proceedings of the European conference on computer vision (ECCV)","author":"Chen"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123448"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00854"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/VCIP53242.2021.9675348"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.12.137"},{"journal-title":"Neural machine translation by jointly learning to align and translate","year":"2014","author":"Bahdanau","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018167"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2956593"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2894139"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2920899"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107702"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3132229"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2021.108332"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"journal-title":"The kinetics human action video dataset","year":"2017","author":"Kay","key":"ref35"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"journal-title":"Adam: A method for stochastic optimization","year":"2014","author":"Kingma","key":"ref37"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref39","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004","journal-title":"Text summarization branches out"},{"key":"ref40","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","author":"Banerjee"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"journal-title":"Microsoft coco captions: Data collection and evaluation server","year":"2015","author":"Chen","key":"ref42"}],"event":{"name":"2022 IEEE International Conference on Visual Communications and Image Processing (VCIP)","start":{"date-parts":[[2022,12,13]]},"location":"Suzhou, China","end":{"date-parts":[[2022,12,16]]}},"container-title":["2022 IEEE International Conference on Visual Communications and Image Processing (VCIP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10008391\/10008793\/10008808.pdf?arnumber=10008808","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,2]],"date-time":"2024-03-02T14:30:31Z","timestamp":1709389831000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10008808\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,12,13]]},"references-count":42,"URL":"https:\/\/doi.org\/10.1109\/vcip56404.2022.10008808","relation":{},"subject":[],"published":{"date-parts":[[2022,12,13]]}}}