{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,18]],"date-time":"2026-01-18T04:14:54Z","timestamp":1768709694994,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"DOD U.S. Department of Defense","doi-asserted-by":"publisher","award":["N00014- 20-1-2027"],"award-info":[{"award-number":["N00014- 20-1-2027"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"Army Research Laboratory","doi-asserted-by":"publisher","award":["W911NF-23-2-0067"],"award-info":[{"award-number":["W911NF-23-2-0067"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"Air Force Office of Scientific Research","doi-asserted-by":"publisher","award":["FA9550-22-1-0337"],"award-info":[{"award-number":["FA9550-22-1-0337"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3686215.3688381","type":"proceedings-article","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T12:17:01Z","timestamp":1730290621000},"page":"184-194","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Gaze-Informed Vision Transformers: Predicting Driving Decisions Under Uncertainty"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2014-424X","authenticated-orcid":false,"given":"Sharath","family":"Koorathota","sequence":"first","affiliation":[{"name":"Columbia University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1082-5140","authenticated-orcid":false,"given":"Nikolas","family":"Papadopoulos","sequence":"additional","affiliation":[{"name":"Columbia University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0701-2870","authenticated-orcid":false,"given":"Jia Li","family":"Ma","sequence":"additional","affiliation":[{"name":"Columbia University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9987-564X","authenticated-orcid":false,"given":"Shruti","family":"Kumar","sequence":"additional","affiliation":[{"name":"Columbia University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0823-8713","authenticated-orcid":false,"given":"Xiaoxiao","family":"Sun","sequence":"additional","affiliation":[{"name":"Columbia University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5160-7780","authenticated-orcid":false,"given":"Arunesh","family":"Mittal","sequence":"additional","affiliation":[{"name":"Columbia University, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3534-1848","authenticated-orcid":false,"given":"Patrick","family":"Adelman","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9738-1342","authenticated-orcid":false,"given":"Paul","family":"Sajda","sequence":"additional","affiliation":[{"name":"Columbia University, USA"}]}],"member":"320","published-online":{"date-parts":[[2024,11,4]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2013.2247759"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2016.14"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1088\/1757-899X\/252\/1\/012096"},{"key":"e_1_3_2_2_4_1","unstructured":"Gedas Bertasius Heng Wang and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?arxiv:2102.05095\u00a0[cs]"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1037\/a0035813"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.3390\/su12073030"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1317557111"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2007.01.018"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2020. An Image Is Worth 16x16 Words: Transformers for Image Recognition at Scale. (2020). https:\/\/doi.org\/10.48550\/ARXIV.2010.11929","DOI":"10.48550\/ARXIV.2010.11929"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41583-018-0045-9"},{"key":"e_1_3_2_2_11_1","volume-title":"ImageNet-trained CNNs are biased towards texture","author":"Geirhos Robert","year":"1811","unstructured":"Robert Geirhos, Patricia Rubisch, Claudio Michaelis, Matthias Bethge, Felix\u00a0A. Wichmann, and Wieland Brendel. 2022. ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness. http:\/\/arxiv.org\/abs\/1811.12231 arXiv:1811.12231 [cs, q-bio, stat]."},{"key":"e_1_3_2_2_12_1","unstructured":"Jacob Gildenblat. 2020. Exploring Explainability for Vision Transformers. http:\/\/jacobgil.github.io\/deeplearning\/vision-transformer-explainability."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2024.3371893"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1088\/1741-2552\/ad1055"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.conb.2011.06.012"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.04.080"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.3389\/fcomp.2023.1178450"},{"key":"e_1_3_2_2_18_1","unstructured":"Muzammal Naseer Kanchana Ranasinghe Salman Khan Munawar Hayat Fahad\u00a0Shahbaz Khan and Ming-Hsuan Yang. 2021. Intriguing Properties of Vision Transformers. http:\/\/arxiv.org\/abs\/2105.10497 arXiv:2105.10497 [cs]."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.robot.2014.03.003"},{"key":"e_1_3_2_2_20_1","unstructured":"Andrea Palazzi Davide Abati Simone Calderara Francesco Solera and Rita Cucchiara. 2018. Predicting the Driver\u2019s Focus of Attention: the DR(eye)VE Project. http:\/\/arxiv.org\/abs\/1705.03854 arXiv:1705.03854 [cs]."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20103"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNSRE.2017.2694553"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1167\/7.3.6"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0959-4388(98)80142-6"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00265"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.conll-1.2"},{"key":"e_1_3_2_2_27_1","volume-title":"Proceedings of the 38th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0139)","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herve Jegou. 2021. Training data-efficient image transformers & distillation through attention. In Proceedings of the 38th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0139), Marina Meila and Tong Zhang (Eds.). PMLR, 10347\u201310357. https:\/\/proceedings.mlr.press\/v139\/touvron21a.html"},{"key":"e_1_3_2_2_28_1","volume-title":"Are Convolutional Neural Networks or Transformers more like human vision?","author":"Tuli Shikhar","year":"2021","unstructured":"Shikhar Tuli, Ishita Dasgupta, Erin Grant, and Thomas\u00a0L. Griffiths. 2021. Are Convolutional Neural Networks or Transformers more like human vision? (2021). arxiv:2105.07197\u00a0[cs.CV]"}],"event":{"name":"ICMI '24: INTERNATIONAL CONFERENCE ON MULTIMODAL INTERACTION","location":"San Jose Costa Rica","acronym":"ICMI '24","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Companion Proceedings of the 26th International Conference on Multimodal Interaction"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686215.3688381","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3686215.3688381","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T16:22:37Z","timestamp":1760545357000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3686215.3688381"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":28,"alternative-id":["10.1145\/3686215.3688381","10.1145\/3686215"],"URL":"https:\/\/doi.org\/10.1145\/3686215.3688381","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}