{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,2]],"date-time":"2026-05-02T15:21:55Z","timestamp":1777735315109,"version":"3.51.4"},"publisher-location":"Cham","reference-count":117,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729669","type":"print"},{"value":"9783031729676","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72967-6_10","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:09:25Z","timestamp":1730574565000},"page":"163-183","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Free-VSC: Free Semantics from\u00a0Visual Foundation Models for\u00a0Unsupervised Video Semantic Compression"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6073-8582","authenticated-orcid":false,"given":"Yuan","family":"Tian","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6951-0090","authenticated-orcid":false,"given":"Guo","family":"Lu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8165-9322","authenticated-orcid":false,"given":"Guangtao","family":"Zhai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"key":"10_CR1","unstructured":"Mmtracking: Openmmlab video perception toolbox and benchmark (2020). https:\/\/github.com\/open-mmlab\/mmtracking"},{"key":"10_CR2","unstructured":"Openmmlab\u2019s next generation video understanding toolbox and benchmark (2020). https:\/\/github.com\/open-mmlab\/mmaction2"},{"key":"10_CR3","doi-asserted-by":"crossref","unstructured":"Akbari, M., Liang, J., Han, J.: Dsslic: deep semantic segmentation-based layered image compression. In: International Conference on Acoustics, Speech and Signal Processing (2019)","DOI":"10.1109\/ICASSP.2019.8683541"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Bai, Y., et al.: Towards end-to-end image compression and analysis with transformers. In: Annual AAAI Conference on Artificial Intelligence (2022)","DOI":"10.1609\/aaai.v36i1.19884"},{"key":"10_CR5","unstructured":"Ball\u00e9, J., Minnen, D., Singh, S., Hwang, S.J., Johnston, N.: Variational image compression with a scale hyperprior. In: International Conference on Learning Representations (2018)"},{"key":"10_CR6","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? In: International Conference on Machine Learning (2021)"},{"key":"10_CR7","doi-asserted-by":"publisher","first-page":"3736","DOI":"10.1109\/TCSVT.2021.3101953","volume":"31","author":"B Bross","year":"2021","unstructured":"Bross, B., et al.: Overview of the versatile video coding (vvc) standard and its applications. IEEE Trans. Circuits Syst. Video Technol. 31, 3736\u20133764 (2021)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10_CR8","doi-asserted-by":"publisher","first-page":"4924","DOI":"10.1109\/TCSVT.2021.3056134","volume":"31","author":"Q Cai","year":"2021","unstructured":"Cai, Q., Chen, Z., Wu, D.O., Liu, S., Li, X.: A novel video coding strategy in hevc for object detection. IEEE Trans. Circ. Syst. Video Technol. 31, 4924\u20134937 (2021)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"10_CR9","doi-asserted-by":"crossref","unstructured":"Cao, J., Pang, J., Weng, X., Khirodkar, R., Kitani, K.: Observation-centric sort: rethinking sort for robust multi-object tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00934"},{"key":"10_CR10","unstructured":"Caron, M., Misra, I., Mairal, J., Goyal, P., Bojanowski, P., Joulin, A.: Unsupervised learning of visual features by contrasting cluster assignments. Adv. Neural Inf. Process. Syst. (2020)"},{"key":"10_CR11","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"10_CR12","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"10_CR13","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1109\/TMM.2015.2502552","volume":"18","author":"J Chao","year":"2015","unstructured":"Chao, J., Steinbach, E.: Keypoint encoding for improved feature extraction from compressed video at low bitrates. IEEE Trans. Multimedia 18, 25\u201339 (2015)","journal-title":"IEEE Trans. Multimedia"},{"key":"10_CR14","unstructured":"Chen, X., Fan, H., Girshick, R., He, K.: Improved baselines with momentum contrastive learning. arXiv (2020)"},{"key":"10_CR15","doi-asserted-by":"crossref","unstructured":"Chen, Y.H., Weng, Y.C., Kao, C.H., Chien, C., Chiu, W.C., Peng, W.H.: Transtic: Transferring transformer-based image compression from human perception to machine perception. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.02129"},{"key":"10_CR16","doi-asserted-by":"crossref","unstructured":"Chen, Y., Dai, X., Liu, M., Chen, D., Yuan, L., Liu, Z.: Dynamic convolution: attention over convolution kernels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01104"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Chen, Z., Fan, K., Wang, S., Duan, L.Y., Lin, W., Kot, A.: Lossy intermediate deep learning feature compression and evaluation. In: ACM International Conference on Multimedia (2019)","DOI":"10.1145\/3343031.3350849"},{"key":"10_CR18","doi-asserted-by":"publisher","first-page":"2230","DOI":"10.1109\/TIP.2019.2941660","volume":"29","author":"Z Chen","year":"2019","unstructured":"Chen, Z., Fan, K., Wang, S., Duan, L., Lin, W., Kot, A.C.: Toward intelligent sensing: intermediate deep feature compression. IEEE Trans. Image Process. 29, 2230\u20132243 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"10_CR19","doi-asserted-by":"crossref","unstructured":"Cheng, H.K., Schwing, A.G.: Xmem: long-term video object segmentation with an atkinson-shiffrin memory model. arXiv (2022)","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Choi, H., Bajic, I.V.: High efficiency compression for object detection. In: International Conference on Acoustics, Speech and Signal Processing (2018)","DOI":"10.1109\/ICASSP.2018.8462653"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Choi, H., Baji\u0107, I.V.: Near-lossless deep feature compression for collaborative intelligence. In: International Workshop on Multimedia Signal Processing (2018)","DOI":"10.1109\/MMSP.2018.8547134"},{"key":"10_CR22","doi-asserted-by":"publisher","first-page":"2739","DOI":"10.1109\/TIP.2022.3160602","volume":"31","author":"H Choi","year":"2022","unstructured":"Choi, H., Bajic, I.V.: Scalable image coding for humans and machines. IEEE Trans. Image Process. 31, 2739\u20132754 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Choi, J., Han, B.: Task-aware quantization network for jpeg image compression. In: European Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-58565-5_19"},{"key":"10_CR24","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"10_CR25","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1109\/TIP.2015.2500034","volume":"25","author":"LY Duan","year":"2015","unstructured":"Duan, L.Y., et al.: Overview of the mpeg-cdvs standard. IEEE Trans. Image Process. 25, 179\u2013194 (2015)","journal-title":"IEEE Trans. Image Process."},{"key":"10_CR26","unstructured":"Duan, L.Y., Gao, F., Chen, J., Lin, J., Huang, T.: Compact descriptors for mobile visual search and mpeg cdvs standardization. In: IEEE International Symposium on Circuits and Systems (2013)"},{"key":"10_CR27","doi-asserted-by":"publisher","first-page":"44","DOI":"10.1109\/MMUL.2018.2873844","volume":"26","author":"LY Duan","year":"2018","unstructured":"Duan, L.Y., et al.: Compact descriptors for video analysis: the emerging mpeg standard. IEEE Trans. Multimedia 26, 44\u201354 (2018)","journal-title":"IEEE Trans. Multimedia"},{"key":"10_CR28","doi-asserted-by":"publisher","first-page":"8680","DOI":"10.1109\/TIP.2020.3016485","volume":"29","author":"L Duan","year":"2020","unstructured":"Duan, L., Liu, J., Yang, W., Huang, T., Gao, W.: Video coding for machines: a paradigm of collaborative compression and intelligent analytics. IEEE Trans. Image Process. 29, 8680\u20138695 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"10_CR29","doi-asserted-by":"publisher","first-page":"4405","DOI":"10.1109\/TIP.2022.3180208","volume":"31","author":"S Duan","year":"2022","unstructured":"Duan, S., Chen, H., Gu, J.: Jpd-se: high-level semantics for joint perception-distortion enhancement in image compression. IEEE Trans. Image Process. 31, 4405\u20134416 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"10_CR30","unstructured":"Dubois, Y., Bloem-Reddy, B., Ullrich, K., Maddison, C.J.: Lossy compression for lossless prediction. Adv. Neural Inf. Process. Syst. (2021)"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"10_CR32","doi-asserted-by":"publisher","first-page":"2950","DOI":"10.1109\/TCSVT.2022.3229296","volume":"33","author":"Z Fang","year":"2022","unstructured":"Fang, Z., Shen, L., Li, M., Wang, Z., Jin, Y.: Prior-guided contrastive image compression for underwater machine vision. IEEE Trans. Circ. Syst. Video Technol. 33, 2950\u20132961 (2022)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"10_CR33","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"10_CR34","doi-asserted-by":"crossref","unstructured":"Feng, R., et al.: Image coding for machines with omnipotent feature learning. arXiv (2022)","DOI":"10.1007\/978-3-031-19836-6_29"},{"key":"10_CR35","doi-asserted-by":"crossref","unstructured":"Galteri, L., Bertini, M., Seidenari, L., Del\u00a0Bimbo, A.: Video compression for object detection algorithms. In: International Conference on Pattern Recognition (2018)","DOI":"10.1109\/ICPR.2018.8546064"},{"key":"10_CR36","doi-asserted-by":"crossref","unstructured":"Ge, X., et al.: Task-aware encoder control for deep video compression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.02460"},{"key":"10_CR37","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63, 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"10_CR38","unstructured":"Grill, J.B., et\u00a0al.: Bootstrap your own latent-a new approach to self-supervised learning (2020)"},{"key":"10_CR39","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"10_CR40","doi-asserted-by":"crossref","unstructured":"He, K., Fan, H., Wu, Y., Xie, S., Girshick, R.: Momentum contrast for unsupervised visual representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"10_CR41","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. arXiv (2021)"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Hu, Y., Yang, S., Yang, W., Duan, L.Y., Liu, J.: Towards coding for human and machine vision: a scalable image coding approach. In: International Conference on Multimedia and Expo (2020)","DOI":"10.1109\/ICME46284.2020.9102750"},{"key":"10_CR43","doi-asserted-by":"crossref","unstructured":"Hu, Z., Lu, G., Guo, J., Liu, S., Jiang, W., Xu, D.: Coarse-to-fine deep video coding with hyperprior-guided mode prediction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.00583"},{"key":"10_CR44","doi-asserted-by":"crossref","unstructured":"Hu, Z., Lu, G., Xu, D.: Fvc: a new framework towards deep video compression in feature space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00155"},{"key":"10_CR45","doi-asserted-by":"crossref","unstructured":"Huang, Z., et al.: Contrastive masked autoencoders are stronger vision learners. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3336525"},{"key":"10_CR46","doi-asserted-by":"crossref","unstructured":"Huang, Z., Jia, C., Wang, S., Ma, S.: Visual analysis motivated rate-distortion model for image coding. In: International Conference on Multimedia and Expo (2021)","DOI":"10.1109\/ICME51207.2021.9428417"},{"key":"10_CR47","unstructured":"Huang, Z., Jia, C., Wang, S., Ma, S.: Hmfvc: a human-machine friendly video compression scheme. IEEE Trans. Circ. Syst. Video Technol. (2022)"},{"key":"10_CR48","doi-asserted-by":"crossref","unstructured":"Huynh-Thu, Q., Garcia, M.N., Speranza, F., Corriveau, P., Raake, A.: Study of rating scales for subjective quality assessment of high-definition video. IEEE Trans. Broadcast. 57, 1\u201314 (2010)","DOI":"10.1109\/TBC.2010.2086750"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"10_CR50","doi-asserted-by":"crossref","unstructured":"Jia, M., Tang, L., Chen, B.C., Cardie, C., Belongie, S., Hariharan, B., Lim, S.N.: Visual prompt tuning. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"10_CR51","doi-asserted-by":"publisher","first-page":"319","DOI":"10.1109\/TPAMI.2008.57","volume":"31","author":"R Kasturi","year":"2008","unstructured":"Kasturi, R., et al.: Framework for performance evaluation of face, text, and vehicle detection and tracking in video: Data, metrics, and protocol. IEEE Trans. Pattern Anal. Mach. Intell. 31, 319\u2013336 (2008)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10_CR52","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv (2014)"},{"key":"10_CR53","doi-asserted-by":"crossref","unstructured":"Kuehne, H., Jhuang, H., Garrote, E., Poggio, T., Serre, T.: Hmdb: a large video database for human motion recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"10_CR54","unstructured":"Li, J., Li, B., Lu, Y.: Deep contextual video compression. Adv. Neural Inf. Process. Syst. (2021)"},{"key":"10_CR55","doi-asserted-by":"crossref","unstructured":"Li, J., Li, B., Lu, Y.: Neural video compression with diverse contexts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.02166"},{"key":"10_CR56","doi-asserted-by":"crossref","unstructured":"Li, K., et al.: Uniformer: unifying convolution and self-attention for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3282631"},{"key":"10_CR57","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Mvitv2: improved multiscale vision transformers for classification and detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"10_CR58","doi-asserted-by":"crossref","unstructured":"Li, Y., Li, Y., Vasconcelos, N.: Resound: towards action recognition without representation bias. In: European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"10_CR59","doi-asserted-by":"crossref","unstructured":"Lin, H., Chen, B., Zhang, Z., Lin, J., Wang, X., Zhao, T.: Deepsvc: deep scalable video coding for both machine and human vision. In: ACM MM (2023)","DOI":"10.1145\/3581783.3612500"},{"key":"10_CR60","doi-asserted-by":"crossref","unstructured":"Lin, J., Gan, C., Han, S.: Tsm: temporal shift module for efficient video understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00718"},{"key":"10_CR61","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Doll\u00e1r, P., Girshick, R., He, K., Hariharan, B., Belongie, S.: Feature pyramid networks for object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.106"},{"key":"10_CR62","doi-asserted-by":"crossref","unstructured":"Liu, J., et al.: Conditional entropy coding for efficient video compression. In: European Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-58520-4_27"},{"key":"10_CR63","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"10_CR64","doi-asserted-by":"crossref","unstructured":"Lu, G., Ouyang, W., Xu, D., Zhang, X., Cai, C., Gao, Z.: Dvc: an end-to-end deep video compression framework. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.01126"},{"key":"10_CR65","doi-asserted-by":"crossref","unstructured":"Lu, G., Zhang, X., Ouyang, W., Chen, L., Gao, Z., Xu, D.: An end-to-end learning framework for video compression. IEEE Trans. Pattern Anal. Mach. Intell. (2020)","DOI":"10.1109\/TPAMI.2020.2988453"},{"key":"10_CR66","unstructured":"Mentzer, F., et al.: Vct: a video compression transformer. Adv. Neural Inf. Process. Syst. (2022)"},{"key":"10_CR67","unstructured":"Milan, A., Leal-Taix\u00e9, L., Reid, I., Roth, S., Schindler, K.: Mot16: a benchmark for multi-object tracking. arXiv (2016)"},{"key":"10_CR68","doi-asserted-by":"crossref","unstructured":"Minnen, D., Singh, S.: Channel-wise autoregressive entropy models for learned image compression. In: IEEE International Conference on Image Processing (2020)","DOI":"10.1109\/ICIP40778.2020.9190935"},{"key":"10_CR69","unstructured":"Oquab, M., et\u00a0al.: Dinov2: learning robust visual features without supervision. arXiv (2023)"},{"key":"10_CR70","unstructured":"Pan, J., Lin, Z., Zhu, X., Shao, J., Li, H.: St-adapter: parameter-efficient image-to-video transfer learning. Adv. Neural Inf. Process. Syst. (2022)"},{"key":"10_CR71","unstructured":"Paszke, A., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. (2019)"},{"key":"10_CR72","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van\u00a0Gool, L.: The 2017 davis challenge on video object segmentation. arXiv (2017)"},{"key":"10_CR73","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., Chen, L.C.: Mobilenetv2: inverted residuals and linear bottlenecks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00474"},{"key":"10_CR74","doi-asserted-by":"crossref","unstructured":"Shannon, C.E.: A mathematical theory of communication. Bell Syst. Tech. J. (1948)","DOI":"10.1002\/j.1538-7305.1948.tb00917.x"},{"key":"10_CR75","doi-asserted-by":"crossref","unstructured":"Shao, D., Zhao, Y., Dai, B., Lin, D.: Finegym: a hierarchical video dataset for fine-grained action understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"10_CR76","doi-asserted-by":"crossref","unstructured":"Singh, S., Abu-El-Haija, S., Johnston, N., Ball\u00e9, J., Shrivastava, A., Toderici, G.: End-to-end learning of compressible features. In: IEEE International Conference on Image Processing (2020)","DOI":"10.1109\/ICIP40778.2020.9190860"},{"key":"10_CR77","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: Ucf101: a dataset of 101 human actions classes from videos in the wild. arXiv (2012)"},{"key":"10_CR78","doi-asserted-by":"crossref","unstructured":"Sullivan, G.J., Ohm, J.R., Han, W.J., Wiegand, T.: Overview of the high efficiency video coding (hevc) standard. IEEE Trans. Circ. Syst. Video Technol. (2012)","DOI":"10.1109\/TCSVT.2012.2221191"},{"key":"10_CR79","doi-asserted-by":"crossref","unstructured":"Tan, Z., et al.: Diverse semantic image synthesis via probability distribution modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00787"},{"key":"10_CR80","doi-asserted-by":"crossref","unstructured":"Tian, Y., Che, Z., Bao, W., Zhai, G., Gao, Z.: Self-supervised motion representation via scattering local motion cues. In: European Conference on Computer Vision (2020)","DOI":"10.1007\/978-3-030-58568-6_5"},{"key":"10_CR81","doi-asserted-by":"crossref","unstructured":"Tian, Y., et al.: Self-conditioned probabilistic learning of video rescaling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00445"},{"key":"10_CR82","doi-asserted-by":"crossref","unstructured":"Tian, Y., Lu, G., Yan, Y., Zhai, G., Chen, L., Gao, Z.: A coding framework and benchmark towards low-bitrate video understanding. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3367879"},{"key":"10_CR83","doi-asserted-by":"crossref","unstructured":"Tian, Y., Lu, G., Zhai, G.: Smc++: masked learning of unsupervised video semantic compression. arXiv (2024)","DOI":"10.1109\/ICCV51070.2023.01252"},{"key":"10_CR84","doi-asserted-by":"crossref","unstructured":"Tian, Y., Lu, G., Zhai, G., Gao, Z.: Non-semantics suppressed mask learning for unsupervised video semantic compression. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.01252"},{"key":"10_CR85","doi-asserted-by":"crossref","unstructured":"Tian, Y., Min, X., Zhai, G., Gao, Z.: Video-based early asd detection via temporal pyramid networks. In: International Conference on Multimedia and Expo (2019)","DOI":"10.1109\/ICME.2019.00055"},{"key":"10_CR86","doi-asserted-by":"crossref","unstructured":"Tian, Y., Yan, Y., Zhai, G., Chen, L., Gao, Z.: Clsa: a contrastive learning framework with selective aggregation for video rescaling. IEEE Trans. Image Process. 32, 1300\u20131314 (2023)","DOI":"10.1109\/TIP.2023.3242774"},{"key":"10_CR87","doi-asserted-by":"publisher","first-page":"2453","DOI":"10.1007\/s11263-022-01661-1","volume":"130","author":"Y Tian","year":"2022","unstructured":"Tian, Y., Yan, Y., Zhai, G., Guo, G., Gao, Z.: Ean: event adaptive network for enhanced action recognition. Int. J. Comput. Vision 130, 2453\u20132471 (2022)","journal-title":"Int. J. Comput. Vision"},{"key":"10_CR88","unstructured":"Tomar, S.: Converting video formats with ffmpeg. Linux J. (2006)"},{"key":"10_CR89","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training. Adv. Neural Inf. Process. Syst. (2022)"},{"key":"10_CR90","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. (2017)"},{"key":"10_CR91","unstructured":"Veselov, A.I., Chen, H., Romano, F., Zhijie, Z., Gilmutdinov, M.R.: Hybrid video and feature coding and decoding (2021). uS Patent App. 17\/197,500"},{"key":"10_CR92","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Look before you match: instance understanding matters in video object segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00225"},{"key":"10_CR93","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Videomae v2: scaling video masked autoencoders with dual masking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01398"},{"key":"10_CR94","doi-asserted-by":"crossref","unstructured":"Wang, L., Tong, Z., Ji, B., Wu, G.: Tdn: temporal difference networks for efficient action recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.00193"},{"key":"10_CR95","unstructured":"Wang, Y., et\u00a0al.: Internvideo: general video foundation models via generative and discriminative learning. arXiv (2022)"},{"key":"10_CR96","unstructured":"Wieckowski, A., et al.: Vvenc: an open and optimized vvc encoder implementation. In: IEEE International Conference on Multimedia and Expo Workshops"},{"key":"10_CR97","doi-asserted-by":"crossref","unstructured":"Wiegand, T., Sullivan, G.J., Bjontegaard, G., Luthra, A.: Overview of the h. 264\/avc video coding standard. IEEE Trans. Circ. Syst. Video Technol. (2003)","DOI":"10.1109\/TCSVT.2003.815165"},{"key":"10_CR98","doi-asserted-by":"crossref","unstructured":"Wu, C.Y., Singhal, N., Krahenbuhl, P.: Video compression through image interpolation. In: European Conference on Computer Vision (2018)","DOI":"10.1007\/978-3-030-01237-3_26"},{"key":"10_CR99","unstructured":"Xie, Z., et al.: Self-supervised learning with swin transformers. arXiv (2021)"},{"key":"10_CR100","unstructured":"Xu, B., Wang, N., Chen, T., Li, M.: Empirical evaluation of rectified activations in convolutional network. arXiv (2015)"},{"key":"10_CR101","doi-asserted-by":"crossref","unstructured":"Yan, Z., et al.: Dehib: deep hidden backdoor attack on semi-supervised learning via adversarial perturbation. In: Annual AAAI Conference on Artificial Intelligence (2021)","DOI":"10.1609\/aaai.v35i12.17266"},{"key":"10_CR102","doi-asserted-by":"crossref","unstructured":"Yan, Z., Li, S., Zhao, R., Tian, Y., Zhao, Y.: Dhbe: data-free holistic backdoor erasing in deep neural networks via restricted adversarial distillation. In: ACM ASIA Conference on Computer and Communications Security (2023)","DOI":"10.1145\/3579856.3582822"},{"key":"10_CR103","doi-asserted-by":"publisher","first-page":"331","DOI":"10.1109\/LSP.2020.2970539","volume":"27","author":"F Yang","year":"2020","unstructured":"Yang, F., Herranz, L., Van De Weijer, J., Guiti\u00e1n, J.A.I., L\u00f3pez, A.M., Mozerov, M.G.: Variable rate deep image compression with modulated autoencoder. IEEE Signal Process. Lett. 27, 331\u2013335 (2020)","journal-title":"IEEE Signal Process. Lett."},{"key":"10_CR104","doi-asserted-by":"publisher","first-page":"388","DOI":"10.1109\/JSTSP.2020.3043590","volume":"15","author":"R Yang","year":"2020","unstructured":"Yang, R., Mentzer, F., Van Gool, L., Timofte, R.: Learning for video compression with recurrent auto-encoder and recurrent probability model. IEEE J. Sel. Topics Signal Process. 15, 388\u2013401 (2020)","journal-title":"IEEE J. Sel. Topics Signal Process."},{"key":"10_CR105","doi-asserted-by":"crossref","unstructured":"Yang, R., Timofte, R., Van\u00a0Gool, L.: Advancing learned video compression with in-loop frame prediction. IEEE Trans. Circ. Syst. Video Technol. (2022)","DOI":"10.1109\/TCSVT.2022.3222418"},{"key":"10_CR106","doi-asserted-by":"crossref","unstructured":"Yang, R., Van\u00a0Gool, L., Timofte, R.: Perceptual learned video compression with recurrent conditional gan. arXiv (2021)","DOI":"10.24963\/ijcai.2022\/214"},{"key":"10_CR107","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Discernible image compression. In: ACM International Conference on Multimedia (2020)","DOI":"10.1145\/3394171.3413968"},{"key":"10_CR108","unstructured":"Yi, C., Yang, S., Li, H., Tan, Y.P., Kot, A.: Benchmarking the robustness of spatial-temporal models against corruptions. Adv. Neural Inf. Process. Syst. (2021)"},{"key":"10_CR109","doi-asserted-by":"publisher","first-page":"1378","DOI":"10.1109\/JSTSP.2011.2165201","volume":"5","author":"F Zhang","year":"2011","unstructured":"Zhang, F., Bull, D.R.: A parametric framework for video compression using region-based texture models. IEEE J. Sel. Topics Signal Process. 5, 1378\u20131392 (2011)","journal-title":"IEEE J. Sel. Topics Signal Process."},{"key":"10_CR110","doi-asserted-by":"publisher","first-page":"2889","DOI":"10.1007\/s11263-021-01505-4","volume":"129","author":"Q Zhang","year":"2021","unstructured":"Zhang, Q., Wang, S., Zhang, X., Ma, S., Gao, W.: Just recognizable distortion for machine vision oriented image and video coding. Int. J. Comput. Vision 129, 2889\u20132906 (2021)","journal-title":"Int. J. Comput. Vision"},{"key":"10_CR111","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"10_CR112","doi-asserted-by":"crossref","unstructured":"Zhang, X., Wu, X.: Lvqac: lattice vector quantization coupled with spatially adaptive companding for efficient learned image compression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00987"},{"key":"10_CR113","doi-asserted-by":"publisher","first-page":"633","DOI":"10.1109\/TIP.2016.2629447","volume":"26","author":"X Zhang","year":"2016","unstructured":"Zhang, X., Ma, S., Wang, S., Zhang, X., Sun, H., Gao, W.: A joint compression scheme of video feature descriptors and visual content. IEEE Trans. Image Process. 26, 633\u2013647 (2016)","journal-title":"IEEE Trans. Image Process."},{"key":"10_CR114","doi-asserted-by":"crossref","unstructured":"Zhang, X., et al.: Gaussianimage: 1000 fps image representation and compression by 2d gaussian splatting. arXiv (2024)","DOI":"10.1007\/978-3-031-72673-6_18"},{"key":"10_CR115","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Bytetrack: multi-object tracking by associating every detection box. In: European Conference on Computer Vision (2022)","DOI":"10.1007\/978-3-031-20047-2_1"},{"key":"10_CR116","unstructured":"Zhao, L., et\u00a0al.: Videoprism: a foundational visual encoder for video understanding. In: International Conference on Machine Learning (2024)"},{"key":"10_CR117","unstructured":"Zhou, J., et al.: ibot: image bert pre-training with online tokenizer. In: International Conference on Learning Representations (2021)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72967-6_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:14:21Z","timestamp":1730574861000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72967-6_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031729669","9783031729676"],"references-count":117,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72967-6_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}