{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T18:17:11Z","timestamp":1772043431147,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62201604,62201600"],"award-info":[{"award-number":["62201604,62201600"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755644","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"6549-6557","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["VSumMamba: Mamba Empowered Efficient Video Summarization with Multi-Scale Spatial-Temporal Modeling"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-2128-9525","authenticated-orcid":false,"given":"Yamiao","family":"Ding","sequence":"first","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7926-3310","authenticated-orcid":false,"given":"Tianrui","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5897-0595","authenticated-orcid":false,"given":"Zhizhou","family":"Lu","sequence":"additional","affiliation":[{"name":"National University of Defence Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2986-4665","authenticated-orcid":false,"given":"Jun-Jie","family":"Huang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2906-3238","authenticated-orcid":false,"given":"Wentao","family":"Zhao","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9066-1475","authenticated-orcid":false,"given":"Xinwang","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3094-7735","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"additional","affiliation":[{"name":"Hefei University of Technology, Hefei, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Video Summarization with Global-Local Positional Encoding. In IEEE International Symposium on Multimedia (ISM). 226-234","author":"Apostolidis Evlampios","year":"2021","unstructured":"Evlampios Apostolidis, Georgios Balaouras, Vasileios Mezaris, and Ioannis Patras. 2021. Video Summarization with Global-Local Positional Encoding. In IEEE International Symposium on Multimedia (ISM). 226-234."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMSP.2019.8901741"},{"key":"e_1_3_2_2_3_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. International Conference on Learning Representations (ICLR) (2021)."},{"key":"e_1_3_2_2_4_1","volume-title":"Summarizing Videos with Attention. In Asian Conference on Computer Vision Workshops. 39-54","author":"Fajtl Jiri","year":"2019","unstructured":"Jiri Fajtl, Hajar Sadeghi Sokeh, Vasileios Argyriou, Dorothy Monekosso, and Paolo Remagnino. 2019. Summarizing Videos with Attention. In Asian Conference on Computer Vision Workshops. 39-54."},{"key":"e_1_3_2_2_5_1","volume-title":"Multi-Source Visual Attention for Video Summarization. In IEEE International Conference on Multimedia and Expo (ICME). 1-6.","author":"Ghauri Jawad Ahmad","year":"2021","unstructured":"Jawad Ahmad Ghauri, Sherzod Hakimov, and Ralph Ewerth. 2021. Multi-Source Visual Attention for Video Summarization. In IEEE International Conference on Multimedia and Expo (ICME). 1-6."},{"key":"e_1_3_2_2_6_1","unstructured":"Ian J. Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative Adversarial Networks. arXiv:1406.2661 [stat.ML] https:\/\/arxiv.org\/abs\/1406.2661"},{"key":"e_1_3_2_2_7_1","volume-title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces. In Conference on Language Modeling (COLM). arXiv:2312","author":"Gu Albert","year":"2023","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. In Conference on Language Modeling (COLM). arXiv:2312.00752"},{"key":"e_1_3_2_2_8_1","volume-title":"Efficiently modeling long sequences with structured state spaces. arXiv preprint arXiv:2111.00396","author":"Gu Albert","year":"2021","unstructured":"Albert Gu, Karan Goel, and Christopher R\u00e9. 2021. Efficiently modeling long sequences with structured state spaces. arXiv preprint arXiv:2111.00396 (2021)."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"crossref","unstructured":"Michael Gygli Helmut Grabner Hayko Riemenschneider and Luc Van Gool. 2014. Creating Summaries from User Videos. In ECCV.","DOI":"10.1007\/978-3-319-10584-0_33"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298928"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.23919\/MVA51890.2021.9511350"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3275069"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2889265"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.04.132"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2904996"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2991083"},{"key":"e_1_3_2_2_17_1","unstructured":"Yunjae Jung Donghyeon Cho Dahun Kim Sanghyun Woo and In So Kweon. 2018. Discriminative Feature Learning for Unsupervised Video Summarization. arXiv:1811.09791 [cs.CV] https:\/\/arxiv.org\/abs\/1811.09791"},{"key":"e_1_3_2_2_18_1","volume-title":"FullTransNet: Full Transformer with Local-Global Attention for Video Summarization. arXiv preprint arXiv:2501.00882","author":"Lan Libin","year":"2025","unstructured":"Libin Lan, Lu Jiang, Tianshu Yu, Xiaojuan Liu, and Zhongshi He. 2025. FullTransNet: Full Transformer with Local-Global Attention for Video Summarization. arXiv preprint arXiv:2501.00882 (2025)."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2860797"},{"key":"e_1_3_2_2_20_1","volume-title":"Computer Vision - ECCV","author":"Li Kunchang","year":"2024","unstructured":"Kunchang Li, Xinhao Li, Yi Wang, Yinan He, Yali Wang, Limin Wang, and Yu Qiao. 2025. VideoMamba: State Space Model for Efficient Video Understanding. In Computer Vision - ECCV 2024, Ale\u0161 Leonardis, Elisa Ricci, Stefan Roth, Olga Russakovsky, Torsten Sattler, and G\u00fcl Varol (Eds.). Springer Nature Switzerland, Cham, 237-255."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107677"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_10"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3143699"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59716-0_46"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDSP.2014.6900724"},{"key":"e_1_3_2_2_26_1","volume-title":"IEEE International Conference on Image Processing (ICIP). 3377-3381","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Fanglei Li, Sheng Yang, Shuhui Chen, and Yueming Wang. 2019. Video Summarization via Hierarchical Multi-Attention Networks. In IEEE International Conference on Image Processing (ICIP). 3377-3381."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME59968.2025.11209023"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.318"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00778"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_35"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01258-8_22"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01783"},{"key":"e_1_3_2_2_33_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.11297"},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019143"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2959451"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_47"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123328"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00773"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00773"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12255"},{"key":"e_1_3_2_2_43_1","volume-title":"Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model. In International Conference on Machine Learning (ICML). 1-20","author":"Zhu Lianghui","year":"2024","unstructured":"Lianghui Zhu, Bencheng Liao, Qian Zhang, Xinlong Wang, Wenyu Liu, and Xinggang Wang. 2024. Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model. In International Conference on Machine Learning (ICML). 1-20. arXiv:2401.09417 https:\/\/arxiv.org\/abs\/2401.09417"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3163855"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3039886"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109578"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755644","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:56:31Z","timestamp":1765342591000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755644"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":46,"alternative-id":["10.1145\/3746027.3755644","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755644","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}