{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,24]],"date-time":"2026-07-24T14:54:54Z","timestamp":1784904894570,"version":"3.55.0"},"reference-count":472,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"name":"Key Scientific Technological Innovation Research Project by Ministry of Education"},{"name":"State Key Program and the Foundation for Innovative Research Groups of the National Natural Science Foundation of China","award":["61836009"],"award-info":[{"award-number":["61836009"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U22B2054"],"award-info":[{"award-number":["U22B2054"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62076192"],"award-info":[{"award-number":["62076192"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62006177"],"award-info":[{"award-number":["62006177"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61902298"],"award-info":[{"award-number":["61902298"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61573267"],"award-info":[{"award-number":["61573267"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61906150"],"award-info":[{"award-number":["61906150"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62276199"],"award-info":[{"award-number":["62276199"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013314","name":"Higher Education Discipline Innovation Project","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013314","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Program for Cheung Kong Scholars and Innovative Research Team in University","award":["IRT 15R53"],"award-info":[{"award-number":["IRT 15R53"]}]},{"name":"ST Innovation Project from the Chinese Ministry of Education"},{"name":"Key Research and Development Program in Shaanxi Province of China","award":["2019ZDLGY03-06"],"award-info":[{"award-number":["2019ZDLGY03-06"]}]},{"name":"National Science Basic Research Plan in Shaanxi Province of China","award":["2022JQ-607"],"award-info":[{"award-number":["2022JQ-607"]}]},{"name":"China Postdoctoral Fund","award":["2022T150506"],"award-info":[{"award-number":["2022T150506"]}]},{"name":"Scientific Research Project of Education Department in Shaanxi Province of China","award":["20JY023"],"award-info":[{"award-number":["20JY023"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE J. Sel. Top. Appl. Earth Observations Remote Sensing"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/jstars.2023.3289293","type":"journal-article","created":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T18:38:24Z","timestamp":1687804704000},"page":"1-45","source":"Crossref","is-referenced-by-count":60,"title":["Transformer Meets Remote Sensing Video Detection and Tracking: A Comprehensive Survey"],"prefix":"10.1109","volume":"16","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3354-9617","authenticated-orcid":false,"given":"Licheng","family":"Jiao","sequence":"first","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0296-0393","authenticated-orcid":false,"given":"Xin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8780-5455","authenticated-orcid":false,"given":"Xu","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5669-9354","authenticated-orcid":false,"given":"Fang","family":"Liu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4796-5737","authenticated-orcid":false,"given":"Shuyuan","family":"Yang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8872-2195","authenticated-orcid":false,"given":"Wenping","family":"Ma","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6130-2518","authenticated-orcid":false,"given":"Lingling","family":"Li","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5472-1426","authenticated-orcid":false,"given":"Puhua","family":"Chen","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7372-9180","authenticated-orcid":false,"given":"Zhixi","family":"Feng","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6095-8830","authenticated-orcid":false,"given":"Yuwei","family":"Guo","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1375-0778","authenticated-orcid":false,"given":"Xu","family":"Tang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1996-186X","authenticated-orcid":false,"given":"Biao","family":"Hou","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0379-2042","authenticated-orcid":false,"given":"Xiangrong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5412-7793","authenticated-orcid":false,"given":"Jing","family":"Bai","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6943-4657","authenticated-orcid":false,"given":"Dou","family":"Quan","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8068-6767","authenticated-orcid":false,"given":"Junpeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Key Laboratory of Intelligent Perception and Image Understanding of the Ministry of Education of China, International Research Center of Intelligent Perception and Computation, School of Artificial Intelligence, Xidian University, Xi&#x0027;an, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2023.3247455"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3119654"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3176858"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3104603"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3146035"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3116798"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2022.3198643"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3154922"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2021.3105440"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.3036602"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3169815"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3168465"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3144158"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3163410"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3168697"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2021.3095166"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3183468"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3162964"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3174135"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3160007"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3177235"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2934760"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00475"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00943"},{"key":"ref27","first-page":"7487","article-title":"Stabilizing transformers for reinforcement learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Parisotto","year":"2020"},{"key":"ref28","first-page":"15475","article-title":"ResT: An efficient transformer for visual recognition","volume":"34","author":"Zhang","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref29","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Touvron"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"ref32","article-title":"Relating transformers to models and neural representations of the hippocampal formation","author":"Whittington","year":"2021"},{"key":"ref33","first-page":"1180","article-title":"Dense associative memory for pattern recognition","author":"Krotov","year":"2016","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2021.3115137"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3130436"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2882926"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2019.2921827"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3101398"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00317"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2868561"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00803"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00162"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3154286"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3009034"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3113169"},{"key":"ref46","first-page":"13695","article-title":"Self-supervised multi-object tracking with cross-input consistency","volume":"34","author":"Bastani","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2017.2776899"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3137606"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3045634"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3158652"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS47720.2021.9554763"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1561\/2200000016"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3077257"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2019.8900196"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2944097"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00421"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2980419"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2770319"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00699"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.468"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2922648"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2943366"},{"key":"ref63","first-page":"1935","article-title":"Deep attentive tracking via reciprocative learning","author":"Pu","year":"2018","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3055362"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929034"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00260"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00064"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6828"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3125504"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3060862"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2018.10.005"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICME46284.2020.9102840"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3050073"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.2990089"},{"key":"ref75","first-page":"3056","article-title":"Hierarchical attentive recurrent tracking","volume":"30","author":"Kosiorek","year":"2017","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2869277"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3148876"},{"key":"ref78","first-page":"16743","article-title":"SwinTrack: A simple and strong baseline for transformer tracking","volume-title":"Adv. Neural Inform. Process. Syst.","author":"Lin","year":"2021"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2691769"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01314-1"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2779856"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3139121"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-1087-1"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2843129"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2876253"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3115491"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-01147-z"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2018.2878467"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01513-4"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3152250"},{"key":"ref91","first-page":"1192","article-title":"Prototypical cross-attention networks for multiple object tracking and segmentation","volume":"34","author":"Ke","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3165376"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01332"},{"key":"ref95","article-title":"VRT: A video restoration transformer","author":"Liang","year":"2022"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01325"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01696"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00932"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20028"},{"key":"ref101","article-title":"Time is MattEr: Temporal self-supervision for video transformers","author":"Yun","year":"2022"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2023.3243465"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3142279"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3187135"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3143368"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3182809"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.2971763"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3171551"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3165885"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11205"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2021.3111183"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3140809"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01266-1"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2019.8900572"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3124222"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2017.2746262"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2926164"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00739"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2020.3034677"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2843761"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3066675"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2016.7729316"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00256"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3109028"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00859"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3211695"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2017.2668278"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-019-01231-y"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3143532"},{"key":"ref131","article-title":"Neural machine translation by jointly learning to align and translate","author":"Bahdanau","year":"2014"},{"key":"ref132","first-page":"577","article-title":"Attention-based models for speech recognition","author":"Chorowski","year":"2015","journal-title":"in Int. Conf. Neural Inf. Process. Syst."},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1176"},{"key":"ref134","first-page":"2793","article-title":"Attention is not all you need: Pure attention loses rank doubly exponentially with depth","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Dong"},{"key":"ref135","article-title":"Conditional positional encodings for vision transformers","author":"Chu","year":"2021"},{"key":"ref136","article-title":"Graph-Bert: Only attention is needed for learning graph representations","author":"Zhang","year":"2020"},{"key":"ref137","first-page":"11307","article-title":"Generative video transformer: Can objects be the words?","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wu"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N18-2074"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00599"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01181"},{"key":"ref141","first-page":"1243","article-title":"Convolutional sequence to sequence learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Gehring","year":"2017"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00983"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3186400"},{"key":"ref144","first-page":"21618","article-title":"Rethinking graph transformers with spectral attention","volume":"34","author":"Kreuzer","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref145","article-title":"UniFormer: Unified transformer for efficient spatiotemporal representation learning","author":"Li","year":"2022"},{"key":"ref146","article-title":"A mathematical framework for transformer circuits","volume-title":"Transformer Circuits Thread","author":"Elhage","year":"2021"},{"key":"ref147","article-title":"In-context learning and induction heads","volume-title":"Transformer Circuits Thread","author":"Olsson","year":"2022"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20083-0_18"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1145\/3530811"},{"key":"ref150","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00476"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3093977"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20142"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00718"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3186634"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3063381"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1167"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1016\/j.specom.2020.10.004"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3109061"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00069"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-87589-3_28"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00259"},{"key":"ref164","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19818-2_17"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3026051"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3130716"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3144894"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3152425"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2018.8451652"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01155"},{"key":"ref171","first-page":"12559","article-title":"Self-supervised graph transformer on large-scale molecular data","volume":"33","author":"Rong","year":"2020","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref172","doi-asserted-by":"publisher","DOI":"10.5555\/3495724.3497510"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2022.3198244"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3185088"},{"key":"ref175","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref176","article-title":"How do vision transformers work?","author":"Park","year":"2022"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3185640"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3115699"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3151353"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.3009352"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3172410"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3095505"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3058049"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2020.3026587"},{"key":"ref185","article-title":"Competitive inner-imaging squeeze and excitation for residual network","author":"Hu","year":"2018"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2020.2968550"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3144017"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3043267"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1109\/tgrs.2021.3058549"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3066432"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2997200"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.2997081"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1807.06521"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3127232"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3178479"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2020.2988294"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3144165"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3189044"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.3037893"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS46834.2022.9883686"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3085870"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3159544"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3091758"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.19"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3154390"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3169479"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3137967"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3132027"},{"key":"ref210","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-022-0274-8"},{"key":"ref213","first-page":"18590","article-title":"All tokens matter: Token labeling for training better vision transformers","volume":"34","author":"Jiang","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref214","article-title":"Focal self-attention for local-global interactions in vision transformers","author":"Yang","year":"2021"},{"key":"ref215","article-title":"ELSA: Enhanced local self-attention for vision transformer","author":"Zhou","year":"2021"},{"key":"ref216","article-title":"BOAT: Bilateral local attention vision transformer","author":"Yu","year":"2022"},{"key":"ref217","first-page":"15908","article-title":"Transformer in transformer","volume":"34","author":"Han","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref218","article-title":"PyramidTNT: Improved transformer-in-transformer baselines with pyramid architecture","author":"Han","year":"2022"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00165"},{"key":"ref220","first-page":"9355","article-title":"Twins: Revisiting the design of spatial attention in vision transformers","volume":"34","author":"Chu","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref221","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-023-0364-2"},{"key":"ref223","article-title":"On the connection between local attention and dynamic depth-wise convolution","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Han","year":"2021"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00089"},{"key":"ref225","article-title":"TRT-ViT: TensorRT-oriented vision transformer","author":"Xia","year":"2022"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00042"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00518"},{"key":"ref228","first-page":"28522","article-title":"ViTAE: Vision transformer advanced by exploring intrinsic inductive bias","volume":"34","author":"Xu","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref229","article-title":"HRViT: Multi-scale high-resolution vision transformer","author":"Gu","year":"2021"},{"key":"ref230","first-page":"7281","article-title":"HRFormer: High-resolution vision transformer for dense predict","volume":"34","author":"Yuan","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01625"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.02007"},{"key":"ref233","article-title":"Adaptive transformers in RL","author":"Kumar","year":"2020"},{"key":"ref234","article-title":"CoBERL: Contrastive BERT for reinforcement learning","author":"Banino","year":"2021"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475272"},{"key":"ref236","first-page":"12493","article-title":"Keeping your eye on the ball: Trajectory attention in video transformers","volume":"34","author":"Patrick","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2102.05095"},{"key":"ref238","first-page":"19594","article-title":"Space-time mixing attention for video transformer","volume":"34","author":"Bulat","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00560"},{"key":"ref240","first-page":"2491","article-title":"Associating objects with transformers for video object segmentation","volume":"34","author":"Yang","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00352"},{"key":"ref242","first-page":"3965","article-title":"CoAtNet: Marrying convolution and attention for all data sizes","volume":"34","author":"Dai","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3355890"},{"key":"ref244","article-title":"Next-ViT: Next generation vision transformer for efficient deployment in realistic industrial scenarios","author":"Li","year":"2022"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref246","article-title":"Lite transformer with long-short range attention","author":"Wu","year":"2020"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2020-3015"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52688.2022.01166"},{"key":"ref249","first-page":"2204","article-title":"Recurrent models of visual attention","author":"Mnih","year":"2014","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref250","article-title":"Multiple object recognition with visual attention","author":"Ba","year":"2014"},{"key":"ref251","first-page":"15084","article-title":"Decision transformer: Reinforcement learning via sequence modeling","volume":"34","author":"Chen","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref252","first-page":"1273","article-title":"Offline reinforcement learning as one big sequence modeling problem","volume":"34","author":"Janner","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3072381"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2953181"},{"key":"ref255","article-title":"Detecting tiny moving vehicles in satellite videos","author":"Ao","year":"2018"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3023175"},{"key":"ref257","first-page":"33","article-title":"GoDec: Randomized low-rank & sparse matrix decomposition in noisy case","volume-title":"Proc. 28th Int. Conf. Mach. Learn.","author":"Zhou","year":"2011"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3066696"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3117054"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3150760"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.132"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2828606"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2016.2630731"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2976855"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3004696"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.3390\/rs11202372"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1109\/RTEICT.2016.7808064"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS47720.2021.9554146"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2998782"},{"key":"ref270","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00097"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/ICFSP55781.2022.9924854"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2345390"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2917703"},{"key":"ref274","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2965302"},{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.2978512"},{"key":"ref276","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2020.2988165"},{"key":"ref277","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2019.2933488"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2022.3179770"},{"key":"ref279","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3063144"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.513"},{"key":"ref281","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01438"},{"key":"ref282","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS47720.2021.9554131"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.465"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3077640"},{"key":"ref285","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.2971657"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3063001"},{"key":"ref287","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2021.3096809"},{"key":"ref288","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33765-9_50"},{"key":"ref289","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00515"},{"key":"ref290","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.129"},{"key":"ref291","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.531"},{"key":"ref292","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.733"},{"key":"ref293","doi-asserted-by":"publisher","DOI":"10.1023\/b:visi.0000029664.99615.94"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_6"},{"key":"ref295","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3041332"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2020.105526"},{"key":"ref297","article-title":"TAda! Temporally-adaptive convolutions for video understanding","author":"Huang","year":"2021"},{"key":"ref298","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00687"},{"key":"ref299","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00441"},{"key":"ref300","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00628"},{"key":"ref301","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00942"},{"key":"ref302","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00630"},{"key":"ref303","doi-asserted-by":"publisher","DOI":"10.1145\/3488933.3488934"},{"key":"ref304","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2021.107071"},{"key":"ref305","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3056684"},{"key":"ref306","first-page":"293","article-title":"Tracking holistic object representations","volume-title":"Proc. IEEE 30th British Mach. Vis. Conf. Trans. Antennas Propag.","author":"Sauer","year":"2019"},{"key":"ref307","article-title":"TrTr: Visual tracking with transformer","author":"Zhao","year":"2021"},{"key":"ref308","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-48881-3_56"},{"key":"ref309","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00303"},{"key":"ref310","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01517"},{"key":"ref311","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01324"},{"key":"ref312","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20047-2_9"},{"key":"ref313","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"ref314","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3162599"},{"key":"ref315","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00479"},{"key":"ref316","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00196"},{"key":"ref317","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-03352-3"},{"key":"ref318","doi-asserted-by":"publisher","DOI":"10.1109\/IROS47612.2022.9981248"},{"key":"ref319","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00935"},{"key":"ref320","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108502"},{"key":"ref321","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3234372"},{"key":"ref322","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS47720.2021.9553779"},{"key":"ref323","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_27"},{"key":"ref324","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.312"},{"key":"ref325","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2388226"},{"key":"ref326","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2866955"},{"key":"ref327","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2018.8518302"},{"key":"ref328","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2018.8518431"},{"key":"ref329","first-page":"23703","article-title":"Tracking people with 3D representations","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Rajasegaran","year":"2021"},{"key":"ref330","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_16"},{"key":"ref331","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2849374"},{"key":"ref332","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3087898"},{"key":"ref333","first-page":"652","article-title":"PointNet: Deep learning on point sets for 3D classification and segmentation","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","author":"Qi","year":"2017"},{"key":"ref334","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2687462"},{"key":"ref335","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298832"},{"key":"ref336","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3019915"},{"key":"ref337","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3044219"},{"key":"ref338","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2018.2881123"},{"key":"ref339","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2928123"},{"key":"ref340","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298718"},{"key":"ref341","doi-asserted-by":"publisher","DOI":"10.1109\/34.969114"},{"key":"ref342","doi-asserted-by":"publisher","DOI":"10.1121\/1.398863"},{"key":"ref343","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_7"},{"key":"ref344","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929520"},{"key":"ref345","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01460-0"},{"key":"ref346","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01527-y"},{"key":"ref347","doi-asserted-by":"publisher","DOI":"10.1137\/1.9781611972788.20"},{"key":"ref348","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","year":"2014"},{"key":"ref349","article-title":"Objects as points","author":"Zhou","year":"2019"},{"key":"ref350","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.164"},{"key":"ref351","article-title":"Vision meets drones: A challenge","author":"Zhu","year":"2018"},{"key":"ref352","article-title":"Vision meets drones: Past, present and future","author":"Zhu","year":"2020"},{"key":"ref353","doi-asserted-by":"publisher","DOI":"10.1016\/j.imed.2022.07.002"},{"key":"ref354","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00493"},{"key":"ref355","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19812-0_38"},{"key":"ref356","article-title":"A generalization of transformer networks to graphs","author":"Dwivedi","year":"2020"},{"key":"ref357","first-page":"13266","article-title":"Representing long-range context for graph neural networks with global attention","volume":"34","author":"Wu","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref358","first-page":"3469","article-title":"Structure-aware transformer for graph representation learning","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref359","doi-asserted-by":"publisher","DOI":"10.1038\/s42003-022-03557-9"},{"key":"ref360","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-021-00376-1"},{"key":"ref361","doi-asserted-by":"publisher","DOI":"10.1038\/s41562-021-01194-6"},{"key":"ref362","doi-asserted-by":"publisher","DOI":"10.1038\/s41583-021-00473-5"},{"key":"ref363","doi-asserted-by":"publisher","DOI":"10.1038\/s41593-020-0653-3"},{"key":"ref364","doi-asserted-by":"publisher","DOI":"10.1038\/s42254-021-00314-5"},{"key":"ref365","article-title":"Physics-informed machine learning: A survey on problems, methods and applications","author":"Hao","year":"2022"},{"key":"ref366","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2012.02.008"},{"key":"ref367","doi-asserted-by":"publisher","DOI":"10.1109\/CEC.2015.7257269"},{"key":"ref368","doi-asserted-by":"publisher","DOI":"10.1007\/s00500-015-1702-9"},{"key":"ref369","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2016.04.021"},{"key":"ref370","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2016.03.009"},{"key":"ref371","doi-asserted-by":"publisher","DOI":"10.1016\/j.cnsns.2016.08.001"},{"key":"ref372","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.09.013"},{"key":"ref373","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2018.10.045"},{"key":"ref374","first-page":"887","article-title":"Dynamic visual reasoning by learning differentiable physics models from video and language","volume":"34","author":"Ding","year":"2021","journal-title":"in Proc. Int. Conf. Neural Inf. Process. Syst."},{"key":"ref375","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01066"},{"key":"ref376","doi-asserted-by":"publisher","DOI":"10.4135\/9781412952637.n77"},{"key":"ref377","first-page":"3020","article-title":"Learning representations for counterfactual inference","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Johansson","year":"2016"},{"key":"ref378","article-title":"From unstructured text to causal knowledge graphs: A transformer-based approach","author":"Friedman","year":"2022"},{"key":"ref379","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539472"},{"key":"ref380","article-title":"A survey on graph neural networks and graph transformers in computer vision: A task-oriented perspective","author":"Chen","year":"2022"},{"key":"ref381","article-title":"Neural architecture search with reinforcement learning","author":"Zoph","year":"2016"},{"key":"ref382","article-title":"Designing neural network architectures using reinforcement learning","author":"Baker","year":"2016"},{"key":"ref383","first-page":"2902","article-title":"Large-scale evolution of image classifiers","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Real","year":"2017"},{"key":"ref384","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00907"},{"key":"ref385","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_2"},{"key":"ref386","first-page":"4095","article-title":"Efficient neural architecture search via parameters sharing","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Pham","year":"2018"},{"key":"ref387","article-title":"ProxylessNAS: Direct neural architecture search on target task and hardware","author":"Cai","year":"2018"},{"key":"ref388","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00138"},{"key":"ref389","article-title":"A survey on neural architecture search","author":"Wistuba","year":"2019"},{"key":"ref390","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-05318-5_3"},{"key":"ref391","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01489"},{"key":"ref392","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00017"},{"key":"ref393","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00720"},{"key":"ref394","first-page":"5877","article-title":"The evolved transformer","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"So","year":"2019"},{"key":"ref395","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19803-8_9"},{"key":"ref396","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01206"},{"key":"ref397","article-title":"Searching for efficient multi-stage vision transformers","author":"Liao","year":"2021"},{"key":"ref398","article-title":"Lightweight vision transformer with cross feature attention","author":"Zhao","year":"2022"},{"key":"ref399","article-title":"MobileViT: Light-weight, general-purpose, and mobile-friendly vision transformer","author":"Mehta","year":"2021"},{"key":"ref400","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/121"},{"key":"ref401","article-title":"EdgeFormer: Improving light-weight ConvNets by learning from vision transformers","author":"Zhang","year":"2022"},{"key":"ref402","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"ref403","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00520"},{"key":"ref404","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01169"},{"key":"ref405","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref406","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref407","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00716"},{"key":"ref408","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"ref409","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01204"},{"key":"ref410","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-31880-4_2"},{"key":"ref411","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-70928-2_53"},{"key":"ref412","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2016.07.051"},{"key":"ref413","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2013.10.008"},{"key":"ref414","doi-asserted-by":"publisher","DOI":"10.1016\/j.asoc.2018.03.005"},{"key":"ref415","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2015.2459137"},{"key":"ref416","doi-asserted-by":"publisher","DOI":"10.1109\/TEVC.2014.2350987"},{"key":"ref417","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-49774-5_8"},{"key":"ref418","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-49774-5_5"},{"key":"ref419","doi-asserted-by":"publisher","DOI":"10.1109\/CEC.2006.1688293"},{"key":"ref420","volume-title":"Multi-objective Optim. Using Evol. Algorithms","author":"Deb","year":"2001"},{"key":"ref421","doi-asserted-by":"publisher","DOI":"10.1109\/4235.996017"},{"key":"ref422","doi-asserted-by":"publisher","DOI":"10.1109\/TEVC.2007.892759"},{"key":"ref423","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2015.2403849"},{"key":"ref424","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-014-0625-y"},{"key":"ref425","doi-asserted-by":"publisher","DOI":"10.1162\/evco_a_00289"},{"key":"ref426","doi-asserted-by":"publisher","DOI":"10.1109\/TEVC.2004.831456"},{"key":"ref427","doi-asserted-by":"publisher","DOI":"10.1162\/106365600568202"},{"key":"ref428","first-page":"573","article-title":"A dynamic multi-objective evolutionary algorithm based on an orthogonal design","volume-title":"Proc. IEEE Int. Conf. Evol. Comput.","author":"Zeng","year":"2006"},{"key":"ref429","doi-asserted-by":"publisher","DOI":"10.1016\/j.amc.2008.05.151"},{"key":"ref430","doi-asserted-by":"publisher","DOI":"10.1109\/34.192463"},{"key":"ref431","doi-asserted-by":"publisher","DOI":"10.1109\/72.165591"},{"key":"ref432","doi-asserted-by":"publisher","DOI":"10.1109\/tnnls.2021.3118221"},{"key":"ref433","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3007412"},{"key":"ref434","article-title":"How framelets enhance graph neural networks","author":"Zheng","year":"2021"},{"key":"ref435","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3218735"},{"key":"ref436","article-title":"MWQ: Multiscale wavelet quantized neural networks","author":"Sun","year":"2021"},{"key":"ref437","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3181486"},{"key":"ref438","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2881482"},{"key":"ref439","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.11.015"},{"key":"ref440","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00856"},{"key":"ref441","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS47720.2021.9554705"},{"key":"ref442","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3149947"},{"key":"ref443","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2020.3032958"},{"key":"ref444","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2022.3191783"},{"key":"ref445","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00891"},{"key":"ref446","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00862"},{"key":"ref447","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00525"},{"key":"ref448","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3275156"},{"key":"ref449","first-page":"5583","article-title":"ViLT: Vision-and-language transformer without convolution or region supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kim"},{"key":"ref450","first-page":"1298","article-title":"data2vec: A general framework for self-supervised learning in speech, vision and language","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Baevski","year":"2022"},{"key":"ref451","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3542634"},{"key":"ref452","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2014.09.027"},{"key":"ref453","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2016.2547638"},{"key":"ref454","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-10-3614-9_47"},{"key":"ref455","doi-asserted-by":"publisher","DOI":"10.1109\/MGRS.2021.3088865"},{"key":"ref456","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2019.2909781"},{"key":"ref457","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.11.077"},{"key":"ref458","doi-asserted-by":"publisher","DOI":"10.1117\/1.JRS.14.036501"},{"key":"ref459","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3001584"},{"key":"ref460","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3167830"},{"key":"ref461","doi-asserted-by":"publisher","DOI":"10.1109\/IGARSS.2015.7326597"},{"key":"ref462","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2022.3184789"},{"key":"ref463","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2012.01.004"},{"key":"ref464","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2018.2872590"},{"key":"ref465","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2021.3066508"},{"key":"ref466","doi-asserted-by":"publisher","DOI":"10.1109\/LGRS.2021.3073176"},{"key":"ref467","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2022.3180548"},{"key":"ref468","doi-asserted-by":"publisher","DOI":"10.1109\/TGRS.2020.3046727"},{"key":"ref469","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2020.2990457"},{"key":"ref470","first-page":"3621","article-title":"Deep variational graph convolutional recurrent network for multivariate time series anomaly detection","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref471","doi-asserted-by":"publisher","DOI":"10.1109\/AVSS52988.2021.9663810"},{"key":"ref472","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548199"}],"container-title":["IEEE Journal of Selected Topics in Applied Earth Observations and Remote Sensing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/4609443\/9973430\/10163641.pdf?arnumber=10163641","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,1]],"date-time":"2024-03-01T11:54:16Z","timestamp":1709294056000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10163641\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":472,"URL":"https:\/\/doi.org\/10.1109\/jstars.2023.3289293","relation":{},"ISSN":["1939-1404","2151-1535"],"issn-type":[{"value":"1939-1404","type":"print"},{"value":"2151-1535","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]}}}