{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T16:07:46Z","timestamp":1781021266611,"version":"3.54.1"},"reference-count":64,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,8,1]],"date-time":"2026-08-01T00:00:00Z","timestamp":1785542400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100014188","name":"Korea Ministry of Science and ICT","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Applied Soft Computing"],"published-print":{"date-parts":[[2026,8]]},"DOI":"10.1016\/j.asoc.2026.115420","type":"journal-article","created":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T13:10:47Z","timestamp":1778591447000},"page":"115420","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Multi-modality based future predictive representation learning for video anticipation captioning"],"prefix":"10.1016","volume":"200","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-0473-216X","authenticated-orcid":false,"given":"Jiyeon","family":"Hwang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Suyoung","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0398-831X","authenticated-orcid":false,"given":"Ho-Young","family":"Jung","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.asoc.2026.115420_bib0005","series-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","first-page":"980","article-title":"Video captioning of future frames","author":"Hosseinzadeh","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0010","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10714","article-title":"Vid2seq: large-scale pretraining of a visual language model for dense video captioning","author":"Yang","year":"2023"},{"key":"10.1016\/j.asoc.2026.115420_bib0015","series-title":"Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations","first-page":"543","article-title":"Video-llama: an instruction-tuned audio-visual language model for video understanding","author":"Zhang","year":"2023"},{"key":"10.1016\/j.asoc.2026.115420_bib0020","author":"Luo"},{"key":"10.1016\/j.asoc.2026.115420_bib0025","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence, 37","first-page":"3081","article-title":"VLTinT: visual-linguistic transformer-in-transformer for coherent video paragraph captioning","author":"Yamazaki","year":"2023"},{"key":"10.1016\/j.asoc.2026.115420_bib0030","author":"Chen"},{"key":"10.1016\/j.asoc.2026.115420_bib0035","doi-asserted-by":"crossref","DOI":"10.3389\/frobt.2022.929267","article-title":"Two ways to make your robot proactive: reasoning about human intentions or reasoning about possible futures","volume":"9","author":"Buyukgoz","year":"2022","journal-title":"Front. Robot. AI"},{"key":"10.1016\/j.asoc.2026.115420_bib0040","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0045","author":"Lei"},{"key":"10.1016\/j.asoc.2026.115420_bib0050","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"7365","article-title":"Predictive feature learning for future segmentation prediction","author":"Lin","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0055","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"648","article-title":"Predicting deeper into the future of semantic segmentation","author":"Luc","year":"2017"},{"issue":"7","key":"10.1016\/j.asoc.2026.115420_bib0060","first-page":"3386","article-title":"Apanet: auto-path aggregation for future instance segmentation prediction","volume":"44","author":"Hu","year":"2021","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.asoc.2026.115420_bib0065","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"584","article-title":"Predicting future instance segmentation by forecasting convolutional features","author":"Luc","year":"2018"},{"key":"10.1016\/j.asoc.2026.115420_bib0070","series-title":"Proceedings of the 27th ACM International Conference on Multimedia","first-page":"2043","article-title":"Predicting future instance segmentation with contextual pyramid convlstms","author":"Sun","year":"2019"},{"key":"10.1016\/j.asoc.2026.115420_bib0075","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5343","article-title":"When will you do what?-Anticipating temporal occurrences of activities","author":"Abu Farha","year":"2018"},{"key":"10.1016\/j.asoc.2026.115420_bib0080","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops","first-page":"0","article-title":"Uncertainty-aware anticipation of activities","author":"Abu Farha","year":"2019"},{"key":"10.1016\/j.asoc.2026.115420_bib0085","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9925","article-title":"Time-conditioned action anticipation in one shot","author":"Ke","year":"2019"},{"key":"10.1016\/j.asoc.2026.115420_bib0090","series-title":"Pattern Recognition: 42nd DAGM German Conference, DAGM GCPR 2020, T\u00fcbingen, Germany, September 28\u2013October 1, 2020, Proceedings 42","first-page":"159","article-title":"Long-term anticipation of activities with cycle consistency","author":"Abu Farha","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0095","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XVI 16","first-page":"154","article-title":"Temporal aggregate representations for long-range video understanding","author":"Sener","year":"2020"},{"key":"10.1016\/j.asoc.2026.115420_bib0100","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3052","article-title":"Future transformer for long-term action anticipation","author":"Gong","year":"2022"},{"key":"10.1016\/j.asoc.2026.115420_bib0105","series-title":"Frontiers of Multimedia Research","first-page":"3","article-title":"Deep learning for video classification and captioning","author":"Wu","year":"2017"},{"key":"10.1016\/j.asoc.2026.115420_bib0110","series-title":"2022 IEEE International Conference on Image Processing (ICIP)","first-page":"2601","article-title":"Relational future captioning model for explaining likely collisions in daily tasks","author":"Kambara","year":"2022"},{"issue":"3","key":"10.1016\/j.asoc.2026.115420_bib0115","doi-asserted-by":"crossref","first-page":"4373","DOI":"10.1109\/TIV.2024.3353172","article-title":"Anticipation video captioning of aerial refueling based on combined attention masking mechanism","volume":"9","author":"Wu","year":"2024","journal-title":"IEEE Trans. Intell. Veh."},{"key":"10.1016\/j.asoc.2026.115420_bib0120","series-title":"International Conference on Machine Learning","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020"},{"key":"10.1016\/j.asoc.2026.115420_bib0125","series-title":"International Conference on Machine Learning","first-page":"9929","article-title":"Understanding contrastive representation learning through alignment and uniformity on the hypersphere","author":"Wang","year":"2020"},{"key":"10.1016\/j.asoc.2026.115420_bib0130","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"706","article-title":"Dense-captioning events in videos","author":"Krishna","year":"2017"},{"key":"10.1016\/j.asoc.2026.115420_bib0135","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence, 32","article-title":"Towards automatic learning of procedures from web instructional videos","author":"Zhou","year":"2018"},{"key":"10.1016\/j.asoc.2026.115420_bib0140","author":"Huang"},{"key":"10.1016\/j.asoc.2026.115420_bib0145","author":"Rochan"},{"key":"10.1016\/j.asoc.2026.115420_bib0150","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10648","article-title":"Warp to the future: joint forecasting of features and feature motion","author":"Saric","year":"2020"},{"key":"10.1016\/j.asoc.2026.115420_bib0155","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"5562","article-title":"Predicting the future: a jointly learnt model for action anticipation","author":"Gammulle","year":"2019"},{"key":"10.1016\/j.asoc.2026.115420_bib0160","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"13505","article-title":"Anticipative video transformer","author":"Girdhar","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0165","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13224","article-title":"Anticipating human actions by correlating past with the future with jaccard similarity measures","author":"Fernando","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0170","doi-asserted-by":"crossref","first-page":"3456","DOI":"10.1109\/TMM.2022.3161189","article-title":"Moving towards centers: re-ranking with attention and memory for re-identification","volume":"25","author":"Zhou","year":"2022","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.asoc.2026.115420_bib0175","series-title":"ICASSP 2019-2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"2422","article-title":"Object counting in video surveillance using multi-scale density map regression","author":"Wang","year":"2019"},{"key":"10.1016\/j.asoc.2026.115420_bib0180","series-title":"International Conference on Machine Learning","first-page":"17782","article-title":"Geometric multimodal contrastive representation learning","author":"Poklukar","year":"2022"},{"key":"10.1016\/j.asoc.2026.115420_bib0185","author":"Lin"},{"key":"10.1016\/j.asoc.2026.115420_bib0190","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6914","article-title":"Positive-augmented contrastive learning for image and video captioning evaluation","author":"Sarto","year":"2023"},{"key":"10.1016\/j.asoc.2026.115420_bib0195","series-title":"Findings of the Association for Computational Linguistics ACL 2024","first-page":"6561","article-title":"Learning multimodal contrast with cross-modal memory and reinforced contrast recognition","author":"Tian","year":"2024"},{"key":"10.1016\/j.asoc.2026.115420_bib0200","series-title":"ICASSP 2024-2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"3830","article-title":"Semantic-guided network with contrastive learning for video caption","author":"Chen","year":"2024"},{"key":"10.1016\/j.asoc.2026.115420_bib0205","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2023.102216","article-title":"Adapt and explore: multimodal mixup for representation learning","volume":"105","author":"Lin","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.asoc.2026.115420_bib0210","author":"Dufumier"},{"key":"10.1016\/j.asoc.2026.115420_bib0215","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7190","article-title":"Bidirectional attentive fusion with context gating for dense video captioning","author":"Wang","year":"2018"},{"key":"10.1016\/j.asoc.2026.115420_bib0220","series-title":"The 31st British Machine Vision Virtual Conference","article-title":"A better use of audio-visual cues: dense video captioning with bi-modal transformer","author":"Iashin","year":"2020"},{"key":"10.1016\/j.asoc.2026.115420_bib0225","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"8739","article-title":"End-to-end dense video captioning with masked transformer","author":"Zhou","year":"2018"},{"key":"10.1016\/j.asoc.2026.115420_bib0230","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"6847","article-title":"End-to-end dense video captioning with parallel decoding","author":"Wang","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0235","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.asoc.2026.115420_bib0240","author":"Devlin"},{"key":"10.1016\/j.asoc.2026.115420_bib0245","series-title":"International Conference on Learning Representations","article-title":"An image is worth 16x16 words: transformers for image recognition at scale","author":"Dosovitskiy","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0250","series-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing","first-page":"6894","article-title":"Simcse: simple contrastive learning of sentence embeddings","author":"Gao","year":"2021"},{"key":"10.1016\/j.asoc.2026.115420_bib0255","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19968","article-title":"Alignment-uniformity aware representation learning for zero-shot video classification","author":"Pu","year":"2022"},{"key":"10.1016\/j.asoc.2026.115420_bib0260","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.asoc.2026.115420_bib0265","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"9879","article-title":"End-to-end learning of visual representations from uncurated instructional videos","author":"Miech","year":"2020"},{"key":"10.1016\/j.asoc.2026.115420_bib0270","first-page":"24206","article-title":"Vatt: transformers for multimodal self-supervised learning from raw video, audio and text","volume":"34","author":"Akbari","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.asoc.2026.115420_bib0275","author":"Zellers"},{"key":"10.1016\/j.asoc.2026.115420_bib0280","author":"Loshchilov"},{"key":"10.1016\/j.asoc.2026.115420_bib0285","series-title":"Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics","first-page":"311","article-title":"BLEU: a method for automatic evaluation of machine translation","author":"Papineni","year":"2002"},{"key":"10.1016\/j.asoc.2026.115420_bib0290","series-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","first-page":"65","article-title":"METEOR: an automatic metric for MT evaluation with improved correlation with human judgments","author":"Banerjee","year":"2005"},{"key":"10.1016\/j.asoc.2026.115420_bib0295","series-title":"Proceedings of the 42nd Annual Meeting of the Association for Computational Linguistics (ACL-04)","first-page":"605","article-title":"Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics","author":"Lin","year":"2004"},{"key":"10.1016\/j.asoc.2026.115420_bib0300","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"4566","article-title":"Cider: consensus-based image description evaluation","author":"Vedantam","year":"2015"},{"key":"10.1016\/j.asoc.2026.115420_bib0305","article-title":"Evaluation metrics for video captioning: a survey","volume":"13","author":"de Souza In\u00e1cio","year":"2023","journal-title":"Mach. Learn. Appl."},{"key":"10.1016\/j.asoc.2026.115420_bib0310","author":"Caglayan"},{"key":"10.1016\/j.asoc.2026.115420_bib0315","author":"Bai"},{"key":"10.1016\/j.asoc.2026.115420_bib0320","author":"Wang"}],"container-title":["Applied Soft Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1568494626008689?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1568494626008689?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T15:54:06Z","timestamp":1781020446000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1568494626008689"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,8]]},"references-count":64,"alternative-id":["S1568494626008689"],"URL":"https:\/\/doi.org\/10.1016\/j.asoc.2026.115420","relation":{},"ISSN":["1568-4946"],"issn-type":[{"value":"1568-4946","type":"print"}],"subject":[],"published":{"date-parts":[[2026,8]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Multi-modality based future predictive representation learning for video anticipation captioning","name":"articletitle","label":"Article Title"},{"value":"Applied Soft Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.asoc.2026.115420","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"115420"}}