{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T11:54:07Z","timestamp":1775562847486,"version":"3.50.1"},"reference-count":60,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1016\/j.imavis.2026.105946","type":"journal-article","created":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T08:00:51Z","timestamp":1773129651000},"page":"105946","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["MSENet: High efficiency video compression via Multivariate Spatiotemporal Entropy Network"],"prefix":"10.1016","volume":"169","author":[{"given":"Huimin","family":"Lu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8615-7517","authenticated-orcid":false,"given":"Liangfan","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuchao","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yujie","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2026.105946_b1","series-title":"Enhancing selective encryption for H. 264\/AVC using advanced encryption standard","author":"Abomhara","year":"2022"},{"key":"10.1016\/j.imavis.2026.105946_b2","doi-asserted-by":"crossref","first-page":"55506","DOI":"10.1109\/ACCESS.2021.3059654","article-title":"A video steganography method based on transform block decision for H. 265\/HEVC","volume":"9","author":"Zhao","year":"2021","journal-title":"IEEE Access"},{"key":"10.1016\/j.imavis.2026.105946_b3","doi-asserted-by":"crossref","first-page":"6318","DOI":"10.1109\/TIP.2023.3330607","article-title":"Adaptive chroma prediction based on luma difference for H. 266\/VVC","volume":"32","author":"Huo","year":"2023","journal-title":"IEEE Trans. Image Process."},{"issue":"10","key":"10.1016\/j.imavis.2026.105946_b4","doi-asserted-by":"crossref","first-page":"3736","DOI":"10.1109\/TCSVT.2021.3101953","article-title":"Overview of the versatile video coding (VVC) standard and its applications","volume":"31","author":"Bross","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105946_b5","article-title":"Gacnet: Generate adversarial-driven cross-aware network for hyperspectral wheat variety identification","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.imavis.2026.105946_b6","first-page":"5118","article-title":"Offline and online optical flow enhancement for deep video compression","volume":"vol. 38","author":"Tang","year":"2024"},{"key":"10.1016\/j.imavis.2026.105946_b7","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105678","article-title":"Combining spatio-temporal attention and multi-level feature fusion for video saliency prediction","author":"Luo","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105946_b8","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2025.105413","article-title":"Hierarchical spatiotemporal feature interaction network for video saliency prediction","volume":"154","author":"Jin","year":"2025","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105946_b9","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2023.104674","article-title":"Human activity recognition from UAV videos using a novel DMLC-CNN model","volume":"134","author":"Sinha","year":"2023","journal-title":"Image Vis. Comput."},{"issue":"12","key":"10.1016\/j.imavis.2026.105946_b10","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3571727","article-title":"A survey on perceptually optimized video coding","volume":"55","author":"Zhang","year":"2023","journal-title":"ACM Comput. Surv."},{"issue":"6","key":"10.1016\/j.imavis.2026.105946_b11","doi-asserted-by":"crossref","first-page":"2401","DOI":"10.1109\/TCSVT.2020.3019919","article-title":"A robust quality enhancement method based on joint spatial-temporal priors for video coding","volume":"31","author":"Meng","year":"2020","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105946_b12","series-title":"2019 IEEE International Conference on Image Processing","first-page":"2661","article-title":"Machine learning accelerated partition search for video encoding","author":"Su","year":"2019"},{"key":"10.1016\/j.imavis.2026.105946_b13","series-title":"Computer Vision-ECCV 2004: 8th European Conference on Computer Vision, Prague, Czech Republic, May 11-14, 2004. Proceedings, Part IV 8","first-page":"25","article-title":"High accuracy optical flow estimation based on a theory for warping","author":"Brox","year":"2004"},{"key":"10.1016\/j.imavis.2026.105946_b14","doi-asserted-by":"crossref","unstructured":"D. Sun, X. Yang, M.Y. Liu, J. Kautz, Pwc-net: Cnns for optical flow using pyramid, warping, and cost volume, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 8934\u20138943.","DOI":"10.1109\/CVPR.2018.00931"},{"key":"10.1016\/j.imavis.2026.105946_b15","doi-asserted-by":"crossref","unstructured":"O. Rippel, A.G. Anderson, K. Tatwawadi, S. Nair, C. Lytle, L. Bourdev, Elf-vc: Efficient learned flexible-rate video coding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 14479\u201314488.","DOI":"10.1109\/ICCV48922.2021.01421"},{"key":"10.1016\/j.imavis.2026.105946_b16","doi-asserted-by":"crossref","unstructured":"A. Djelouah, J. Campos, S. Schaub-Meyer, C. Schroers, Neural inter-frame compression for video coding, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 6421\u20136429.","DOI":"10.1109\/ICCV.2019.00652"},{"key":"10.1016\/j.imavis.2026.105946_b17","doi-asserted-by":"crossref","DOI":"10.1109\/TMM.2024.3453033","article-title":"Multi-prior driven resolution rescaling blocks for intra frame coding","author":"Wu","year":"2024","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.imavis.2026.105946_b18","doi-asserted-by":"crossref","DOI":"10.1109\/TMM.2024.3414549","article-title":"Toward efficient video compression artifact detection and removal: A benchmark dataset","author":"Lin","year":"2024","journal-title":"IEEE Trans. Multimed."},{"issue":"8","key":"10.1016\/j.imavis.2026.105946_b19","doi-asserted-by":"crossref","first-page":"1443","DOI":"10.1109\/TIP.2008.925381","article-title":"Occlusion-aware optical flow estimation","volume":"17","author":"Ince","year":"2008","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.imavis.2026.105946_b20","first-page":"18114","article-title":"Deep contextual video compression","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105946_b21","article-title":"Temporal context mining for learned video compression","author":"Sheng","year":"2022","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.imavis.2026.105946_b22","doi-asserted-by":"crossref","unstructured":"J. Li, B. Li, Y. Lu, Hybrid spatial-temporal entropy modelling for neural video compression, in: Proceedings of the 30th ACM International Conference on Multimedia, 2022, pp. 1503\u20131511.","DOI":"10.1145\/3503161.3547845"},{"key":"10.1016\/j.imavis.2026.105946_b23","series-title":"Neural video compression with feature modulation","author":"Li","year":"2024"},{"key":"10.1016\/j.imavis.2026.105946_b24","unstructured":"J. Xiang, K. Tian, J. Zhang, Mimt: Masked image modeling transformer for video compression, in: The Eleventh International Conference on Learning Representations, 2022."},{"key":"10.1016\/j.imavis.2026.105946_b25","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2024.3360248","article-title":"Spatial decomposition and temporal fusion based inter prediction for learned video compression","author":"Sheng","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105946_b26","series-title":"Variational image compression with a scale hyperprior","author":"Ball\u00e9","year":"2018"},{"key":"10.1016\/j.imavis.2026.105946_b27","article-title":"Joint autoregressive and hierarchical priors for learned image compression","volume":"31","author":"Minnen","year":"2018","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"1","key":"10.1016\/j.imavis.2026.105946_b28","first-page":"1","article-title":"Objective Bayesian model selection for spatial hierarchical models with intrinsic conditional autoregressive priors","volume":"1","author":"Porter","year":"2023","journal-title":"Bayesian Anal."},{"key":"10.1016\/j.imavis.2026.105946_b29","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.105037","article-title":"Image recognition based on lightweight convolutional neural network: Recent advances","volume":"146","author":"Liu","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105946_b30","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.105281","article-title":"A dictionary learning based unsupervised neural network for single image compressed sensing","volume":"151","author":"Luo","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105946_b31","series-title":"End-to-end optimized image compression","author":"Ball\u00e9","year":"2016"},{"key":"10.1016\/j.imavis.2026.105946_b32","doi-asserted-by":"crossref","unstructured":"Z. Cheng, H. Sun, M. Takeuchi, J. Katto, Learned image compression with discretized gaussian mixture likelihoods and attention modules, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 7939\u20137948.","DOI":"10.1109\/CVPR42600.2020.00796"},{"issue":"1","key":"10.1016\/j.imavis.2026.105946_b33","doi-asserted-by":"crossref","first-page":"421","DOI":"10.1109\/TCSVT.2022.3199472","article-title":"Joint graph attention and asymmetric convolutional neural network for deep image compression","volume":"33","author":"Tang","year":"2022","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105946_b34","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2024.103280","article-title":"TransUNet: Rethinking the U-net architecture design for medical image segmentation through the lens of transformers","volume":"97","author":"Chen","year":"2024","journal-title":"Med. Image Anal."},{"key":"10.1016\/j.imavis.2026.105946_b35","doi-asserted-by":"crossref","unstructured":"R. Zou, C. Song, Z. Zhang, The devil is in the details: Window-based attention for image compression, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 17492\u201317501.","DOI":"10.1109\/CVPR52688.2022.01697"},{"key":"10.1016\/j.imavis.2026.105946_b36","doi-asserted-by":"crossref","unstructured":"J. Liu, H. Sun, J. Katto, Learned image compression with mixed transformer-cnn architectures, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14388\u201314397.","DOI":"10.1109\/CVPR52729.2023.01383"},{"key":"10.1016\/j.imavis.2026.105946_b37","doi-asserted-by":"crossref","unstructured":"R. Gong, X. Liu, S. Jiang, T. Li, P. Hu, J. Lin, F. Yu, J. Yan, Differentiable soft quantization: Bridging full-precision and low-bit neural networks, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4852\u20134861.","DOI":"10.1109\/ICCV.2019.00495"},{"key":"10.1016\/j.imavis.2026.105946_b38","article-title":"Soft-to-hard vector quantization for end-to-end learning compressible representations","volume":"30","author":"Agustsson","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105946_b39","doi-asserted-by":"crossref","unstructured":"J. Yang, X. Shen, J. Xing, X. Tian, H. Li, B. Deng, J. Huang, X.s. Hua, Quantization networks, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019, pp. 7308\u20137316.","DOI":"10.1109\/CVPR.2019.00748"},{"key":"10.1016\/j.imavis.2026.105946_b40","first-page":"6","article-title":"Multiple frames prediction for learned video compression. 2020 IEEE","volume":"vol. 2","author":"Lin","year":"2020"},{"key":"10.1016\/j.imavis.2026.105946_b41","doi-asserted-by":"crossref","unstructured":"R. Yang, F. Mentzer, L.V. Gool, R. Timofte, Learning for video compression with hierarchical quality and recurrent enhancement, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 6628\u20136637.","DOI":"10.1109\/CVPR42600.2020.00666"},{"key":"10.1016\/j.imavis.2026.105946_b42","series-title":"Vct: A video compression transformer","author":"Mentzer","year":"2022"},{"key":"10.1016\/j.imavis.2026.105946_b43","doi-asserted-by":"crossref","unstructured":"J. Li, B. Li, Y. Lu, Neural video compression with diverse contexts, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 22616\u201322626.","DOI":"10.1109\/CVPR52729.2023.02166"},{"key":"10.1016\/j.imavis.2026.105946_b44","doi-asserted-by":"crossref","unstructured":"Z. Liu, J. Ning, Y. Cao, Y. Wei, Z. Zhang, S. Lin, H. Hu, Video swin transformer, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 3202\u20133211.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"10.1016\/j.imavis.2026.105946_b45","doi-asserted-by":"crossref","unstructured":"Z. Liu, Y. Lin, Y. Cao, H. Hu, Y. Wei, Z. Zhang, S. Lin, B. Guo, Swin transformer: Hierarchical vision transformer using shifted windows, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2021, pp. 10012\u201310022.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10.1016\/j.imavis.2026.105946_b46","article-title":"Tensorformer: Normalized matrix attention transformer for high-quality point cloud reconstruction","author":"Tian","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.imavis.2026.105946_b47","series-title":"Extreme video compression with pre-trained diffusion models","author":"Li","year":"2024"},{"key":"10.1016\/j.imavis.2026.105946_b48","doi-asserted-by":"crossref","unstructured":"C. Zhang, H. Sun, J. Katto, FLAVC: Learned Video Compression with Feature Level Attention, in: Proceedings of the Computer Vision and Pattern Recognition Conference, 2025, pp. 28019\u201328028.","DOI":"10.1109\/CVPR52734.2025.02609"},{"key":"10.1016\/j.imavis.2026.105946_b49","series-title":"GIViC: Generative implicit video compression","author":"Gao","year":"2025"},{"key":"10.1016\/j.imavis.2026.105946_b50","doi-asserted-by":"crossref","unstructured":"A. Ranjan, M.J. Black, Optical flow estimation using a spatial pyramid network, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 4161\u20134170.","DOI":"10.1109\/CVPR.2017.291"},{"key":"10.1016\/j.imavis.2026.105946_b51","doi-asserted-by":"crossref","unstructured":"D. He, Y. Zheng, B. Sun, Y. Wang, H. Qin, Checkerboard context model for efficient learned image compression, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2021, pp. 14771\u201314780.","DOI":"10.1109\/CVPR46437.2021.01453"},{"key":"10.1016\/j.imavis.2026.105946_b52","doi-asserted-by":"crossref","DOI":"10.1016\/j.media.2024.103280","article-title":"TransUNet: Rethinking the U-net architecture design for medical image segmentation through the lens of transformers","volume":"97","author":"Chen","year":"2024","journal-title":"Med. Image Anal."},{"key":"10.1016\/j.imavis.2026.105946_b53","doi-asserted-by":"crossref","first-page":"1106","DOI":"10.1007\/s11263-018-01144-2","article-title":"Video enhancement with task-oriented flow","volume":"127","author":"Xue","year":"2019","journal-title":"Int. J. Comput. Vis."},{"key":"10.1016\/j.imavis.2026.105946_b54","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2024.110465","article-title":"IBVC: Interpolation-driven B-frame video compression","volume":"153","author":"Xu","year":"2024","journal-title":"Pattern Recognit."},{"key":"10.1016\/j.imavis.2026.105946_b55","doi-asserted-by":"crossref","unstructured":"A. Mercat, M. Viitanen, J. Vanne, UVG dataset: 50\/120fps 4K sequences for video codec analysis and development, in: Proceedings of the 11th ACM Multimedia Systems Conference, 2020, pp. 297\u2013302.","DOI":"10.1145\/3339825.3394937"},{"key":"10.1016\/j.imavis.2026.105946_b56","series-title":"2016 IEEE International Conference on Image Processing","first-page":"1509","article-title":"MCL-JCV: a JND-based H. 264\/AVC video quality assessment dataset","author":"Wang","year":"2016"},{"key":"10.1016\/j.imavis.2026.105946_b57","series-title":"European Conference on Computer Vision","first-page":"207","article-title":"Canf-vc: Conditional augmented normalizing flows for video compression","author":"Ho","year":"2022"},{"key":"10.1016\/j.imavis.2026.105946_b58","doi-asserted-by":"crossref","DOI":"10.1109\/TCSVT.2024.3360248","article-title":"Spatial decomposition and temporal fusion based inter prediction for learned video compression","author":"Sheng","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.imavis.2026.105946_b59","doi-asserted-by":"crossref","DOI":"10.1016\/j.eswa.2024.124932","article-title":"CATNet: Cascaded attention transformer network for marine species image classification","volume":"256","author":"Zhang","year":"2024","journal-title":"Expert Syst. Appl."},{"key":"10.1016\/j.imavis.2026.105946_b60","first-page":"1398","article-title":"Multiscale structural similarity for image quality assessment","volume":"vol. 2","author":"Wang","year":"2003"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000521?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000521?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T11:07:52Z","timestamp":1775560072000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885626000521"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":60,"alternative-id":["S0262885626000521"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105946","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,5]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"MSENet: High efficiency video compression via Multivariate Spatiotemporal Entropy Network","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105946","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Published by Elsevier B.V.","name":"copyright","label":"Copyright"}],"article-number":"105946"}}