{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,7]],"date-time":"2026-01-07T07:54:01Z","timestamp":1767772441916,"version":"3.44.0"},"reference-count":16,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,14]],"date-time":"2024-04-14T00:00:00Z","timestamp":1713052800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,14]]},"DOI":"10.1109\/icassp48485.2024.10446293","type":"proceedings-article","created":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T14:56:31Z","timestamp":1710773791000},"page":"7690-7694","source":"Crossref","is-referenced-by-count":1,"title":["Vision Transformer with 2D Explicit Position Encoding"],"prefix":"10.1109","author":[{"given":"Yujie","family":"Li","sequence":"first","affiliation":[{"name":"Guilin University of Electronic Technology,School of Artificial Intelligence,Guilin,China"}]},{"given":"Zihang","family":"Ma","sequence":"additional","affiliation":[{"name":"Guilin University of Electronic Technology,School of Artificial Intelligence,Guilin,China"}]},{"given":"Xinghe","family":"Wang","sequence":"additional","affiliation":[{"name":"Guilin University of Electronic Technology,School of Artificial Intelligence,Guilin,China"}]},{"given":"Yifu","family":"Wang","sequence":"additional","affiliation":[{"name":"Guilin University of Electronic Technology,School of Artificial Intelligence,Guilin,China"}]},{"given":"Benying","family":"Tan","sequence":"additional","affiliation":[{"name":"Guilin University of Electronic Technology,School of Artificial Intelligence,Guilin,China"}]}],"member":"263","reference":[{"article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"9th International Conference on Learning Representations, ICLR 2021, Virtual Event","author":"Dosovitskiy","key":"ref1"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"article-title":"Gaussian Error Linear Units (GELUs)","year":"2020","author":"Hendrycks","key":"ref3"},{"article-title":"Fractalnet: Ultra-deep neural networks without residuals","volume-title":"5th International Conference on Learning Representations, ICLR 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings","author":"Larsson","key":"ref4"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref6","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proceedings of the 38th International Conference on Machine Learning","author":"Touvron"},{"article-title":"Distilling the knowledge in a neural network","volume-title":"NIPS Deep Learning and Representation Learning Workshop","author":"Hinton","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"article-title":"Conditional positional encodings for vision transformers","volume-title":"The Eleventh International Conference on Learning Representations, ICLR 2023","author":"Chu","key":"ref10"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00988"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01185"},{"article-title":"Muti-Scale And Token Mergence: Make Your ViT More Efficient","year":"2023","author":"Bian","key":"ref13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i3.20202"},{"article-title":"Decoupled weight decay regularization","volume-title":"7th International Conference on Learning Representations, ICLR 2019","author":"Loshchilov","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"}],"event":{"name":"ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","start":{"date-parts":[[2024,4,14]]},"location":"Seoul, Korea, Republic of","end":{"date-parts":[[2024,4,19]]}},"container-title":["ICASSP 2024 - 2024 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10445798\/10445803\/10446293.pdf?arnumber=10446293","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,15]],"date-time":"2025-08-15T18:10:49Z","timestamp":1755281449000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10446293\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,14]]},"references-count":16,"URL":"https:\/\/doi.org\/10.1109\/icassp48485.2024.10446293","relation":{},"subject":[],"published":{"date-parts":[[2024,4,14]]}}}