{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:26:40Z","timestamp":1771957600965,"version":"3.50.1"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iros60139.2025.11245824","type":"proceedings-article","created":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T18:54:45Z","timestamp":1764269685000},"page":"9654-9660","source":"Crossref","is-referenced-by-count":1,"title":["ToSA: Token Merging with Spatial Awareness"],"prefix":"10.1109","author":[{"given":"Hsiang-Wei","family":"Huang","sequence":"first","affiliation":[{"name":"University of Washington,Electrical and Computer Engineering Department"}]},{"given":"Wenhao","family":"Chai","sequence":"additional","affiliation":[{"name":"University of Washington,Electrical and Computer Engineering Department"}]},{"given":"Kuang-Ming","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Washington,Electrical and Computer Engineering Department"}]},{"given":"Cheng-Yen","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Washington,Electrical and Computer Engineering Department"}]},{"given":"Jenq-Neng","family":"Hwang","sequence":"additional","affiliation":[{"name":"University of Washington,Electrical and Computer Engineering Department"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Dinov2: Learning robust visual features without supervision","author":"Oquab","year":"2024","journal-title":"Transactions on Machine Learning Research"},{"key":"ref2","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"ref4","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref5","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref6","article-title":"Llava-onevision: Easy visual task transfer","author":"Li","year":"2024"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01386"},{"key":"ref8","first-page":"13937","article-title":"Dynamicvit: Efficient vision transformers with dynamic token sparsification","volume":"34","author":"Rao","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01054"},{"key":"ref10","article-title":"Tempura: Temporal event masked prediction and understanding for reasoning in action","author":"Cheng","year":"2025"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00010"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01809"},{"key":"ref13","article-title":"Token merging: Your ViT but faster","volume-title":"International Conference on Learning Representations","author":"Bolya"},{"key":"ref14","article-title":"What do vision transformers learn? a visual exploration","author":"Ghiasi","year":"2022"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/icra55743.2025.11128671"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01560"},{"key":"ref19","article-title":"Sp-vit: Learning 2d spatial priors for vision transformers","volume-title":"The 33rd British Machine Vision Conference","author":"Zhou"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20202"},{"key":"ref21","article-title":"Not all patches are what you need: Expediting vision transformers via token reorganizations","volume-title":"International Conference on Learning Representations","author":"Liang"},{"key":"ref22","article-title":"Ppt: Token pruning and pooling for efficient vision transformers","author":"Wu","year":"2023"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref24","article-title":"Sparsevlm: Visual token sparsification for efficient vision-language model inference","volume-title":"International Conference on Machine Learning","author":"Zhang"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01370"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.52202\/079017-4293"},{"key":"ref27","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref28","first-page":"19730","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"International conference on machine learning","author":"Li"},{"key":"ref29","article-title":"Instructblip: Towards general-purpose vision-language models with instruction tuning","author":"Dai","year":"2023"},{"key":"ref30","article-title":"Qwen-vl: A frontier large vision-language model with versatile abilities","author":"Bai","year":"2023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02520"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0688"},{"key":"ref35","article-title":"Auroracap: Efficient, performant video detailed captioning and a new benchmark","author":"Chai","year":"2024"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73414-4_26"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-demo.49"},{"key":"ref39","article-title":"Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms","author":"Cheng","year":"2024"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01300"}],"event":{"name":"2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)","location":"Hangzhou, China","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11245651\/11245652\/11245824.pdf?arnumber=11245824","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,18]],"date-time":"2025-12-18T12:35:15Z","timestamp":1766061315000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11245824\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/iros60139.2025.11245824","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}