{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T15:25:30Z","timestamp":1773156330626,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611960","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"8543-8551","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":15,"title":["Neural Video Compression with Spatio-Temporal Cross-Covariance Transformers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0155-4462","authenticated-orcid":false,"given":"Zhenghao","family":"Chen","sequence":"first","affiliation":[{"name":"The University of Sydney, Sydney, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2109-9823","authenticated-orcid":false,"given":"Lucas","family":"Relic","sequence":"additional","affiliation":[{"name":"ETH Z\u00fcrich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5473-506X","authenticated-orcid":false,"given":"Roberto","family":"Azevedo","sequence":"additional","affiliation":[{"name":"DisneyResearch|Studios, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2381-6067","authenticated-orcid":false,"given":"Yang","family":"Zhang","sequence":"additional","affiliation":[{"name":"DisneyResearch|Studios, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9324-779X","authenticated-orcid":false,"given":"Markus","family":"Gross","sequence":"additional","affiliation":[{"name":"ETH Z\u00fcrich, Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2775-9730","authenticated-orcid":false,"given":"Dong","family":"Xu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong SAR, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8762-2424","authenticated-orcid":false,"given":"Luping","family":"Zhou","sequence":"additional","affiliation":[{"name":"The University of Sydney, Sydney, NSW, Australia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1473-1878","authenticated-orcid":false,"given":"Christopher","family":"Schroers","sequence":"additional","affiliation":[{"name":"DisneyResearch|Studios, Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. Hevc test model (hm). https:\/\/hevc.hhi.fraunhofer.de\/HM-doc\/. Accessed: 2023-03-06."},{"key":"e_1_3_2_1_2_1","volume-title":"d.]. Ultra video group test sequences","unstructured":"[n. d.]. Ultra video group test sequences. http:\/\/ultravideo.cs.tut.fi. Accessed: 2023-03-06."},{"key":"e_1_3_2_1_3_1","unstructured":"[n. d.]. VVC Reference Model (VTM). https:\/\/vcgit.hhi.fraunhofer.de\/jvet\/ VVCSoftware_VTM\/. Accessed: 2023-03-06."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00853"},{"key":"e_1_3_2_1_5_1","volume-title":"Xcit: Cross-covariance image transformers. Advances in neural information processing systems","author":"Ali Alaaeldin","year":"2021","unstructured":"Alaaeldin Ali, Hugo Touvron, Mathilde Caron, Piotr Bojanowski, Matthijs Douze, Armand Joulin, Ivan Laptev, Natalia Neverova, Gabriel Synnaeve, Jakob Verbeek, et al. 2021. Xcit: Cross-covariance image transformers. Advances in neural information processing systems, Vol. 34 (2021), 20014--20027."},{"key":"e_1_3_2_1_6_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Ball\u00e9 Johannes","year":"2018","unstructured":"Johannes Ball\u00e9, David Minnen, Saurabh Singh, Sung Jin Hwang, and Nick Johnston. 2018. Variational image compression with a scale hyperprior. International Conference on Learning Representations (ICLR) (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"BPG Image format. URL https:\/\/bellard.org\/bpg","author":"Bellard Fabrice","year":"2015","unstructured":"Fabrice Bellard. 2015. BPG Image format. URL https:\/\/bellard.org\/bpg (2015)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3101953"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3140608"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00598"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00796"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00652"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00713"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_12"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00583"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00155"},{"key":"e_1_3_2_1_17_1","first-page":"18114","article-title":"Deep contextual video compression","volume":"34","author":"Li Jiahao","year":"2021","unstructured":"Jiahao Li, Bin Li, and Yan Lu. 2021. Deep contextual video compression. Advances in Neural Information Processing Systems, Vol. 34 (2021), 18114--18125.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547845"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00210"},{"key":"e_1_3_2_1_20_1","first-page":"378","article-title":"Recurrent video restoration transformer with guided deformable attention","volume":"35","author":"Liang Jingyun","year":"2022","unstructured":"Jingyun Liang, Yuchen Fan, Xiaoyu Xiang, Rakesh Ranjan, Eddy Ilg, Simon Green, Jiezhang Cao, Kai Zhang, Radu Timofte, and Luc V Gool. 2022. Recurrent video restoration transformer with guided deformable attention. Advances in Neural Information Processing Systems, Vol. 35 (2022), 378--393.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00360"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612041"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_24_1","unstructured":"Salvator Lombardo Jun Han Christopher Schroers and Stephan Mandt. 2019. Deep generative video compression. In Advances in Neural Information Processing Systems. 9287--9298."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_27"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01126"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2988453"},{"key":"e_1_3_2_1_28_1","volume-title":"An end-to-end learning framework for video compression","author":"Lu Guo","year":"2020","unstructured":"Guo Lu, Xiaoyun Zhang, Wanli Ouyang, Li Chen, Zhiyong Gao, and Dong Xu. 2020b. An end-to-end learning framework for video compression. IEEE transactions on pattern analysis and machine intelligence, Vol. 43, 10 (2020), 3292--3308."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Ming Lu Peiyao Guo Huiqing Shi Chuntong Cao and Zhan Ma. 2022. Transformer-based Image Compression. (2022) 469--469.","DOI":"10.1109\/DCC52660.2022.00080"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01088"},{"key":"e_1_3_2_1_31_1","first-page":"13091","article-title":"VCT: A Video Compression Transformer","volume":"35","author":"Mentzer Fabian","year":"2022","unstructured":"Fabian Mentzer, George D Toderici, David Minnen, Sergi Caelles, Sung Jin Hwang, Mario Lucic, and Eirikur Agustsson. 2022. VCT: A Video Compression Transformer. Advances in Neural Information Processing Systems, Vol. 35 (2022), 13091--13103.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_32_1","unstructured":"David Minnen Johannes Ball\u00e9 and George D Toderici. 2018. Joint autoregressive and hierarchical priors for learned image compression. In Advances in Neural Information Processing Systems. 10771--10780."},{"key":"e_1_3_2_1_33_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library."},{"key":"e_1_3_2_1_34_1","volume-title":"Entroformer: A Transformer-based Entropy Model for Learned Image Compression. (May","author":"Qian Yichen","year":"2022","unstructured":"Yichen Qian, Ming Lin, Xiuyu Sun, Zhiyu Tan, and Rong Jin. 2022. Entroformer: A Transformer-based Entropy Model for Learned Image Compression. (May 2022)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.291"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3220421"},{"key":"e_1_3_2_1_37_1","volume-title":"Tel Aviv","author":"Song Mingyang","year":"2022","unstructured":"Mingyang Song, Yang Zhang, and Tunc O Aydin. 2022. TempFormer: Temporally Consistent Transformer for Video Denoising. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XIX. Springer, 481--496."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2012.2221191"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2002.800725"},{"key":"e_1_3_2_1_40_1","volume-title":"International Conference for Learning Representations","author":"Theis Lucas","year":"2017","unstructured":"Lucas Theis, Wenzhe Shi, Andrew Cunningham, and Ferenc Husz\u00e1r. 2017. Lossy image compression with compressive autoencoders. International Conference for Learning Representations (2017)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.577"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/30.125072"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532610"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-01144-2"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00666"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00564"},{"key":"e_1_3_2_1_47_1","volume-title":"International Conference on Learning Representations.","author":"Zhu Yinhao","year":"2022","unstructured":"Yinhao Zhu, Yang Yang, and Taco Cohen. 2022. Transformer-based transform coding. In International Conference on Learning Representations."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611960","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611960","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:07:10Z","timestamp":1755821230000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611960"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":47,"alternative-id":["10.1145\/3581783.3611960","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611960","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}