{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T23:45:57Z","timestamp":1768434357207,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":36,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,4]],"date-time":"2024-06-04T00:00:00Z","timestamp":1717459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,4]]},"DOI":"10.1145\/3649902.3653439","type":"proceedings-article","created":{"date-parts":[[2024,5,31]],"date-time":"2024-05-31T18:23:51Z","timestamp":1717179831000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["A Transformer-Based Model for the Prediction of Human Gaze Behavior on Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3390-6154","authenticated-orcid":false,"given":"S\u00fcleyman","family":"\u00d6zdel","sequence":"first","affiliation":[{"name":"Human-Centered Technologies for Learning, Technical University of Munich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6031-3741","authenticated-orcid":false,"given":"Yao","family":"Rong","sequence":"additional","affiliation":[{"name":"Technical University of Munich, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3406-8412","authenticated-orcid":false,"given":"Berat Mert","family":"Albaba","sequence":"additional","affiliation":[{"name":"Department of Computer Science\/AIT Lab, ETH Z\u00fcrich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6433-6713","authenticated-orcid":false,"given":"Yen-Ling","family":"Kuo","sequence":"additional","affiliation":[{"name":"Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5442-1116","authenticated-orcid":false,"given":"Xi","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, ETH, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3146-4484","authenticated-orcid":false,"given":"Enkelejda","family":"Kasneci","sequence":"additional","affiliation":[{"name":"Human-Centered Technologies for Learning, Technical University of Munich, Germany"}]}],"member":"320","published-online":{"date-parts":[[2024,6,4]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01293"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1111\/lnc3.12396"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_4_1","volume-title":"Decision transformer: Reinforcement learning via sequence modeling. Advances in neural information processing systems 34","author":"Chen Lili","year":"2021","unstructured":"Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Misha Laskin, Pieter Abbeel, Aravind Srinivas, and Igor Mordatch. 2021. Decision transformer: Reinforcement learning via sequence modeling. Advances in neural information processing systems 34 (2021), 15084\u201315097."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33718-5_23"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3067779"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2020.2973473"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2019.2899187"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_46"},{"key":"e_1_3_2_1_12_1","volume-title":"Videograph: Recognizing minutes-long human activities in videos. arXiv preprint arXiv:1905.05143","author":"Hussein Noureldien","year":"2019","unstructured":"Noureldien Hussein, Efstratios Gavves, and Arnold\u00a0WM Smeulders. 2019. Videograph: Recognizing minutes-long human activities in videos. arXiv preprint arXiv:1905.05143 (2019)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/1622737.1622748"},{"key":"e_1_3_2_1_14_1","volume-title":"Creation and validation of a chest X-ray dataset with eye-tracking and report dictation for AI development. Scientific data 8, 1","author":"Karargyris Alexandros","year":"2021","unstructured":"Alexandros Karargyris, Satyananda Kashyap, Ismini Lourentzou, Joy\u00a0T Wu, Arjun Sharma, Matthew Tong, Shafiq Abedin, David Beymer, Vandana Mukherjee, Elizabeth\u00a0A Krupinski, 2021. Creation and validation of a chest X-ray dataset with eye-tracking and report dictation for AI development. Scientific data 8, 1 (2021), 92."},{"key":"e_1_3_2_1_15_1","unstructured":"Andrej Karpathy. [n. d.]. minGPT. https:\/\/github.com\/karpathy\/minGPT. Accessed: 2023-04-23."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/VR.2016.7504694"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.399"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_38"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3038311"},{"key":"e_1_3_2_1_20_1","volume-title":"Advances in neural information processing systems 27","author":"Maddison J","year":"2014","unstructured":"Chris\u00a0J Maddison, Daniel Tarlow, and Tom Minka. 2014. A* sampling. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00111"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00024"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2012.6239188"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3649902.3653340"},{"key":"e_1_3_2_1_25_1","volume-title":"Predicting the Driver\u2019s Focus of Attention: the DR (eye) VE Project","author":"Palazzi Andrea","year":"2018","unstructured":"Andrea Palazzi, Davide Abati, Francesco Solera, Rita Cucchiara, 2018. Predicting the Driver\u2019s Focus of Attention: the DR (eye) VE Project. IEEE transactions on pattern analysis and machine intelligence 41, 7 (2018), 1720\u20131733."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00886"},{"key":"e_1_3_2_1_27_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103252"},{"key":"e_1_3_2_1_29_1","volume-title":"GazeTransformer: Gaze Forecasting for Virtual Reality Using Transformer Networks. In DAGM German Conference on Pattern Recognition. Springer, 577\u2013593","author":"Rolff Tim","year":"2022","unstructured":"Tim Rolff, H\u00a0Matthias Harms, Frank Steinicke, and Simone Frintrop. 2022. GazeTransformer: Gaze Forecasting for Virtual Reality Using Transformer Networks. In DAGM German Conference on Pattern Recognition. Springer, 577\u2013593."},{"key":"e_1_3_2_1_30_1","volume-title":"Reinforcement Learning Upside Down: Don\u2019t Predict Rewards\u2013Just Map Them to Actions. arXiv preprint arXiv:1912.02875","author":"Schmidhuber Juergen","year":"2019","unstructured":"Juergen Schmidhuber. 2019. Reinforcement Learning Upside Down: Don\u2019t Predict Rewards\u2013Just Map Them to Actions. arXiv preprint arXiv:1912.02875 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artint.2020.103275"},{"key":"e_1_3_2_1_32_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS40897.2019.8967779"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00027"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.377"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.377"}],"event":{"name":"ETRA '24: The 2024 Symposium on Eye Tracking Research and Applications","location":"Glasgow United Kingdom","acronym":"ETRA '24","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 2024 Symposium on Eye Tracking Research and Applications"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649902.3653439","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649902.3653439","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T11:47:02Z","timestamp":1755863222000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649902.3653439"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,4]]},"references-count":36,"alternative-id":["10.1145\/3649902.3653439","10.1145\/3649902"],"URL":"https:\/\/doi.org\/10.1145\/3649902.3653439","relation":{},"subject":[],"published":{"date-parts":[[2024,6,4]]},"assertion":[{"value":"2024-06-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}