{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T15:14:43Z","timestamp":1778512483502,"version":"3.51.4"},"reference-count":51,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,1]],"date-time":"2024-04-01T00:00:00Z","timestamp":1711929600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100005046","name":"Natural Science Foundation of Heilongjiang Province","doi-asserted-by":"publisher","award":["LH2021F023"],"award-info":[{"award-number":["LH2021F023"]}],"id":[{"id":"10.13039\/501100005046","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["51935005"],"award-info":[{"award-number":["51935005"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2024,4]]},"DOI":"10.1016\/j.imavis.2024.104962","type":"journal-article","created":{"date-parts":[[2024,2,27]],"date-time":"2024-02-27T20:51:50Z","timestamp":1709067110000},"page":"104962","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":20,"special_numbering":"C","title":["C2F: An effective coarse-to-fine network for video summarization"],"prefix":"10.1016","volume":"144","author":[{"given":"Ye","family":"Jin","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyan","family":"Tian","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhao","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Peng","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xianglong","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2024.104962_bb0005","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2022.104467","article-title":"E2e-vsdl: end-to-end video surveillance-based deep learning model to detect and prevent criminal activities","volume":"123","author":"Gandapur","year":"2022","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2024.104962_bb0010","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2019.10.002","article-title":"Visual appearance based person retrieval in unconstrained environment videos","volume":"92","author":"Galiyawala","year":"2019","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2024.104962_bb0015","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2022.104567","article-title":"Multistage temporal convolution transformer for action segmentation","volume":"128","author":"Aziere","year":"2022","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2024.104962_bb0020","first-page":"1","article-title":"Tsrn: two-stage refinement network for temporal action segmentation","author":"Tian","year":"2023","journal-title":"Pattern. Anal. Applic."},{"issue":"2","key":"10.1016\/j.imavis.2024.104962_bb0025","doi-asserted-by":"crossref","first-page":"615","DOI":"10.1007\/s00530-022-00998-4","article-title":"Local\u2013global transformer neural network for temporal action segmentation","volume":"29","author":"Tian","year":"2023","journal-title":"Multimedia Systems"},{"issue":"11","key":"10.1016\/j.imavis.2024.104962_bb0030","doi-asserted-by":"crossref","first-page":"1838","DOI":"10.1109\/JPROC.2021.3117472","article-title":"Video summarization using deep neural networks: a survey","volume":"109","author":"Apostolidis","year":"2021","journal-title":"Proc. IEEE"},{"issue":"1","key":"10.1016\/j.imavis.2024.104962_bb0035","doi-asserted-by":"crossref","first-page":"70","DOI":"10.1016\/j.imavis.2006.01.003","article-title":"Evaluating the mid-secretory endometrium appearance using hysteroscopic digital video summarization","volume":"25","author":"Gavi\u00e3o","year":"2007","journal-title":"Image Vis. Comput."},{"issue":"7","key":"10.1016\/j.imavis.2024.104962_bb0040","doi-asserted-by":"crossref","first-page":"1212","DOI":"10.1016\/j.jvcir.2013.08.003","article-title":"Video key frame extraction through dynamic delaunay clustering with a structural constraint","volume":"24","author":"Kuanar","year":"2013","journal-title":"J. Vis. Commun. Image Represent."},{"issue":"1","key":"10.1016\/j.imavis.2024.104962_bb0045","doi-asserted-by":"crossref","first-page":"66","DOI":"10.1109\/TMM.2011.2166951","article-title":"Towards scalable summarization of consumer videos via sparse dictionary selection","volume":"14","author":"Cong","year":"2011","journal-title":"IEEE Trans. Multimed."},{"issue":"6","key":"10.1016\/j.imavis.2024.104962_bb0050","doi-asserted-by":"crossref","first-page":"1923","DOI":"10.1109\/TCYB.2017.2718579","article-title":"Key frame extraction in the summary space","volume":"48","author":"Li","year":"2017","journal-title":"IEEE Trans. Cybernet."},{"key":"10.1016\/j.imavis.2024.104962_bb0055","series-title":"Proceedings of the 25th ACM international conference on Multimedia","first-page":"863","article-title":"Hierarchical recurrent neural network for video summarization","author":"Zhao","year":"2017"},{"key":"10.1016\/j.imavis.2024.104962_bb0060","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part VII 14","first-page":"766","article-title":"Video summarization with long short-term memory","author":"Zhang","year":"2016"},{"key":"10.1016\/j.imavis.2024.104962_bb0065","doi-asserted-by":"crossref","DOI":"10.1016\/j.compeleceng.2021.107618","article-title":"Deep hierarchical lstm networks with attention for video summarization","volume":"97","author":"Lin","year":"2022","journal-title":"Comput. Electr. Eng."},{"issue":"6","key":"10.1016\/j.imavis.2024.104962_bb0070","doi-asserted-by":"crossref","first-page":"1709","DOI":"10.1109\/TCSVT.2019.2904996","article-title":"Video summarization with attention-based encoder\u2013decoder networks","volume":"30","author":"Ji","year":"2019","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2024.104962_bb0075","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1016\/j.neucom.2021.09.015","article-title":"Video summarization with a dual-path attentive network","volume":"467","author":"Liang","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.imavis.2024.104962_bb0080","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Deep reinforcement learning for unsupervised video summarization with diversity-representativeness reward","volume":"32","author":"Zhou","year":"2018"},{"key":"10.1016\/j.imavis.2024.104962_bb0085","series-title":"European Conference on Computer Vision","first-page":"167","article-title":"Global-and-local relative position embedding for unsupervised video summarization","author":"Jung","year":"2020"},{"key":"10.1016\/j.imavis.2024.104962_bb0090","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part VI 13","first-page":"540","article-title":"Category-specific video summarization","author":"Potapov","year":"2014"},{"key":"10.1016\/j.imavis.2024.104962_bb0095","series-title":"Proceedings of the 27th ACM International Conference on Multimedia","first-page":"2296","article-title":"Unsupervised video summarization with attentive conditional generative adversarial networks","author":"He","year":"2019"},{"key":"10.1016\/j.imavis.2024.104962_bb0100","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","article-title":"Video summarization via semantic attended networks","volume":"32","author":"Wei","year":"2018"},{"key":"10.1016\/j.imavis.2024.104962_bb0105","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7902","article-title":"Video summarization by learning from unpaired data","author":"Rochan","year":"2019"},{"key":"10.1016\/j.imavis.2024.104962_bb0110","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.imavis.2024.104962_bb0115","doi-asserted-by":"crossref","DOI":"10.2352\/J.ImagingSci.Technol.2020.64.2.020508","article-title":"Medical image segmentation based on u-net: a review","volume":"64","author":"Du","year":"2020","journal-title":"J. Imag. Sci. Technol."},{"key":"10.1016\/j.imavis.2024.104962_bb0120","series-title":"Proceedings of the European conference on computer vision (ECCV)","first-page":"3","article-title":"Bsn: Boundary sensitive network for temporal action proposal generation","author":"Lin","year":"2018"},{"key":"10.1016\/j.imavis.2024.104962_bb0125","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part VII 13","first-page":"505","article-title":"Creating summaries from user videos","author":"Gygli","year":"2014"},{"key":"10.1016\/j.imavis.2024.104962_bb0130","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5179","article-title":"Tvsum: Summarizing web videos using titles","author":"Song","year":"2015"},{"issue":"4","key":"10.1016\/j.imavis.2024.104962_bb0135","doi-asserted-by":"crossref","first-page":"1765","DOI":"10.1109\/TNNLS.2020.2991083","article-title":"Deep attentive video summarization with distribution consistency learning","volume":"32","author":"Ji","year":"2020","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.imavis.2024.104962_bb0140","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"347","article-title":"Video summarization using fully convolutional sequence networks","author":"Rochan","year":"2018"},{"key":"10.1016\/j.imavis.2024.104962_bb0145","doi-asserted-by":"crossref","first-page":"200","DOI":"10.1016\/j.neucom.2020.04.132","article-title":"Deep attentive and semantic preserving video summarization","volume":"405","author":"Ji","year":"2020","journal-title":"Neurocomputing"},{"key":"10.1016\/j.imavis.2024.104962_bb0150","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops","article-title":"Video summarization by learning relationships between action and scene","author":"Park","year":"2019"},{"key":"10.1016\/j.imavis.2024.104962_bb0155","doi-asserted-by":"crossref","first-page":"3017","DOI":"10.1109\/TIP.2022.3163855","article-title":"Relational reasoning over spatial-temporal graphs for video summarization","volume":"31","author":"Zhu","year":"2022","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2024.104962_bb0160","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3042","article-title":"Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection","author":"Liu","year":"2022"},{"key":"10.1016\/j.imavis.2024.104962_bb0165","first-page":"13988","article-title":"Clip-it! Language-guided video summarization","volume":"34","author":"Narasimhan","year":"2021","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.imavis.2024.104962_bb0170","series-title":"IEEE Transactions on Neural Networks and Learning Systems","article-title":"Audiovisual video summarization","author":"Zhao","year":"2021"},{"key":"10.1016\/j.imavis.2024.104962_bb0175","series-title":"ICASSP 2023\u20132023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"Mhscnet: A multimodal hierarchical shot-aware convolutional network for video summarization","author":"Xu","year":"2023"},{"key":"10.1016\/j.imavis.2024.104962_bb0180","first-page":"1","article-title":"Semantic segmentation of metal surface defects and corresponding strategies","volume":"72","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"10.1016\/j.imavis.2024.104962_bb0185","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-030-96530-3","article-title":"Mcanet: hierarchical cross-fusion lightweight transformer based on multi-convhead attention for object detection","volume":"104715","author":"Zhao","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2024.104962_bb0190","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104809","article-title":"Aespnet: attention enhanced stacked parallel network to improve automatic diabetic foot ulcer identification","volume":"138","author":"Das","year":"2023","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2024.104962_bb0195","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2020.107677","article-title":"Exploring global diverse attention via pairwise temporal relation for video summarization","volume":"111","author":"Li","year":"2021","journal-title":"Pattern Recogn."},{"key":"10.1016\/j.imavis.2024.104962_bb0200","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1","article-title":"Going deeper with convolutions","author":"Szegedy","year":"2015"},{"key":"10.1016\/j.imavis.2024.104962_bb0205","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2022.108840","article-title":"Video summarization with a convolutional attentive adversarial network","volume":"131","author":"Liang","year":"2022","journal-title":"Pattern Recogn."},{"key":"10.1016\/j.imavis.2024.104962_bb0210","doi-asserted-by":"crossref","first-page":"5889","DOI":"10.1109\/TIP.2020.2985868","article-title":"Query-biased self-attentive network for query-focused video summarization","volume":"29","author":"Xiao","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2024.104962_bb0215","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2021.108312","article-title":"Learning multiscale hierarchical attention for video summarization","volume":"122","author":"Zhu","year":"2022","journal-title":"Pattern Recogn."},{"key":"10.1016\/j.imavis.2024.104962_bb0220","doi-asserted-by":"crossref","first-page":"948","DOI":"10.1109\/TIP.2020.3039886","article-title":"Dsnet: a flexible detect-to-summarize network for video summarization","volume":"30","author":"Zhu","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2024.104962_bb0225","article-title":"Faster r-cnn: towards real-time object detection with region proposal networks","volume":"28","author":"Ren","year":"2015","journal-title":"Adv. Neural Inf. Proces. Syst."},{"key":"10.1016\/j.imavis.2024.104962_bb0230","series-title":"Proceedings of the 24th International Conference on Machine Learning","first-page":"129","article-title":"Learning to rank: from pairwise approach to listwise approach","author":"Cao","year":"2007"},{"issue":"1","key":"10.1016\/j.imavis.2024.104962_bb0235","doi-asserted-by":"crossref","first-page":"56","DOI":"10.1016\/j.patrec.2010.08.004","article-title":"Vsumm: a mechanism designed to produce static video summaries and a novel evaluation method","volume":"32","author":"De Avila","year":"2011","journal-title":"Pattern Recogn. Lett."},{"issue":"15","key":"10.1016\/j.imavis.2024.104962_bb0240","doi-asserted-by":"crossref","first-page":"17864","DOI":"10.1007\/s10489-022-03451-1","article-title":"Video summarization with u-shaped transformer","volume":"52","author":"Chen","year":"2022","journal-title":"Appl. Intell."},{"key":"10.1016\/j.imavis.2024.104962_bb0245","doi-asserted-by":"crossref","first-page":"3013","DOI":"10.1109\/TIP.2023.3275069","article-title":"Video summarization with spatiotemporal vision transformer","volume":"32","author":"Hsu","year":"2023","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2024.104962_bb0250","article-title":"Vss-net: visual semantic self-mining network for video summarization","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2024.104962_bb0255","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"202","article-title":"Unsupervised video summarization with adversarial LSTM networks","author":"Mahasseni","year":"2017"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885624000660?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885624000660?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,11]],"date-time":"2026-05-11T14:53:03Z","timestamp":1778511183000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885624000660"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4]]},"references-count":51,"alternative-id":["S0262885624000660"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2024.104962","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2024,4]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"C2F: An effective coarse-to-fine network for video summarization","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2024.104962","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2024 Elsevier B.V. All rights reserved.","name":"copyright","label":"Copyright"}],"article-number":"104962"}}