{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:45:53Z","timestamp":1777657553544,"version":"3.51.4"},"reference-count":73,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"DOI":"10.13039\/100011512","name":"National Research Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100011512","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2020,2]]},"DOI":"10.1109\/tmm.2019.2930041","type":"journal-article","created":{"date-parts":[[2019,7,22]],"date-time":"2019-07-22T23:16:16Z","timestamp":1563837376000},"page":"554-565","source":"Crossref","is-referenced-by-count":34,"title":["Video Storytelling: Textual Summaries for Events"],"prefix":"10.1109","volume":"22","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5217-9204","authenticated-orcid":false,"given":"Junnan","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1239-4428","authenticated-orcid":false,"given":"Yongkang","family":"Wong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4846-2015","authenticated-orcid":false,"given":"Mohan 
S.","family":"Kankanhalli","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"0","journal-title":"Proc North Amer Ch Ass Comput Lingu Human Langu Techno"},{"key":"ref72","first-page":"766","article-title":"Video summarization with long short-term memory","volume":"9911","author":"zhang","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref71","first-page":"2069","article-title":"Diverse sequential subset selection for supervised video summarization","author":"gong","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-1019"},{"key":"ref39","first-page":"1-13","article-title":"VSE++: Improving visual-semantic embeddings with hard negatives","author":"faghri","year":"0","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.386"},{"key":"ref33","first-page":"1143","article-title":"Im2text: Describing images using 1 million captioned photographs","author":"ordonez","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"ref30","first-page":"1260","article-title":"Unsupervised learning of view-invariant action representations","author":"li","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.11"},{"key":"ref35","first-page":"2346","article-title":"Jointly modeling deep video and compositional text to bridge vision and language in a unified 
framework","author":"xu","year":"0","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00177"},{"key":"ref60","first-page":"1","article-title":"Empirical evaluation of gated recurrent neural networks on sequence modeling","author":"chung","year":"0","journal-title":"NIPS Deep Learning Workshop"},{"key":"ref62","first-page":"1","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref64","first-page":"311","article-title":"BLEU: A method for automatic evaluation of machine translation","author":"papineni","year":"0","journal-title":"Proc Assoc Comput Linguistics"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/N15-1173"},{"key":"ref65","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","author":"banerjee","year":"0","journal-title":"Proc ACL Workshop Intrinsic Extrinsic Eval Measures Mach Transl Summarization"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2599174"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540112"},{"key":"ref68","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"lin","year":"0","journal-title":"Proc Assoc Comput Linguistics Workshop"},{"key":"ref69","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"chen","year":"2015","journal-title":"arXiv 1504 
00325"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.364"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.356"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1014"},{"key":"ref22","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref21","first-page":"1","article-title":"Deep captioning with multimodal recurrent neural networks (m-RNN)","author":"mao","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123432"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.289"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.512"},{"key":"ref50","first-page":"1","article-title":"Multiple object recognition with visual attention","author":"ba","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref51","first-page":"2204","article-title":"Recurrent models of visual attention","author":"mnih","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref59","first-page":"1724","article-title":"Learning phrase representations using RNN encoder-decoder for statistical machine translation","author":"cho","year":"0","journal-title":"Proc Empirical Methods Natural Lang Process"},{"key":"ref58","first-page":"3111","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref57","first-page":"1","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"0","journal-title":"NIPS Deep Learning 
Workshop"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00708"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.293"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref53","first-page":"1","article-title":"Sequence level training with recurrent neural networks","author":"ranzato","year":"0","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref10","first-page":"1445","article-title":"Let your photos talk: Generating narrative paragraph for photo stream via bidirectional attention recurrent neural networks","author":"liu","year":"0","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref11","first-page":"275","article-title":"Key frame selection to represent a video","author":"dufaux","year":"0","journal-title":"Proc Int Conf Image Process"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2014.2319778"},{"key":"ref12","doi-asserted-by":"crossref","first-page":"862","DOI":"10.1145\/1141911.1141967","article-title":"Schematic storyboarding for video visualization and editing","volume":"25","author":"goldman","year":"2006","journal-title":"ACM Trans Graph"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298928"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.350"},{"key":"ref15","first-page":"540","article-title":"Category-specific video summarization","volume":"8694","author":"potapov","year":"0","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.120"},{"key":"ref17","first-page":"5179","article-title":"Tvsum: Summarizing web videos using titles","author":"song","year":"0","journal-title":"Proc Comput Vis Pattern 
Recognit"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N16-1147"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.496"},{"key":"ref3","first-page":"184","article-title":"Coherent multi-sentence video description with variable level of detail","author":"rohrbach","year":"0","journal-title":"Proc German Conf Pattern Recognit"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2867286"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.04.013"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2879642"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref9","first-page":"73","article-title":"Expressing an image stream with a sequence of natural sentences","author":"park","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref46","first-page":"1","article-title":"VideoSET: Video summary evaluation through text","author":"yeung","year":"0","journal-title":"Proc Comput Vis Pattern Recognit Workshop"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123297"},{"key":"ref48","first-page":"1","article-title":"Video to text summary: Joint video summarization and captioning with recurrent neural networks","author":"chen","year":"0","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2017.115"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2705915"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.118"},{"key":"ref44","first-page":"2019","article-title":"Learning to detect human-object interactions with knowledge","author":"xu","year":"0","journal-title":"Proc Comput Vis Pattern 
Recognit"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2794265"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/8968789\/08768045.pdf?arnumber=8768045","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T10:26:13Z","timestamp":1643279173000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8768045\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,2]]},"references-count":73,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tmm.2019.2930041","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,2]]}}}