{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T17:19:23Z","timestamp":1774718363424,"version":"3.50.1"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730009","type":"print"},{"value":"9783031730016","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73001-6_21","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T10:22:22Z","timestamp":1732616542000},"page":"366-382","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["OAT: Object-Level Attention Transformer for\u00a0Gaze Scanpath Prediction"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-9478-5545","authenticated-orcid":false,"given":"Yini","family":"Fang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3076-1287","authenticated-orcid":false,"given":"Jingling","family":"Yu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1312-4566","authenticated-orcid":false,"given":"Haozheng","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7726-8238","authenticated-orcid":false,"given":"Ralf","family":"van der Lans","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9167-7495","authenticated-orcid":false,"given":"Bertram","family":"Shi","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"issue":"2","key":"21_CR1","doi-asserted-by":"publisher","first-page":"235","DOI":"10.1037\/0096-1523.32.2.235","volume":"32","author":"MR Beck","year":"2006","unstructured":"Beck, M.R., Peterson, M.S., Vomela, M.: Memory for where, but not what, is used during visual search. J. Exp. Psychol. Hum. Percept. Perform. 32(2), 235 (2006)","journal-title":"J. Exp. Psychol. Hum. Percept. Perform."},{"issue":"1","key":"21_CR2","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1167\/jov.21.1.5","volume":"21","author":"CR Bennett","year":"2021","unstructured":"Bennett, C.R., Bex, P.J., Merabet, L.B.: Assessing visual search performance using a novel dynamic naturalistic scene. J. Vis. 21(1), 5 (2021)","journal-title":"J. Vis."},{"key":"21_CR3","doi-asserted-by":"crossref","unstructured":"Chen, X., Jiang, M., Zhao, Q.: Predicting human scanpaths in visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
10876\u201310885 (2021)","DOI":"10.1109\/CVPR46437.2021.01073"},{"issue":"10","key":"21_CR4","doi-asserted-by":"publisher","first-page":"5142","DOI":"10.1109\/TIP.2018.2851672","volume":"27","author":"M Cornia","year":"2018","unstructured":"Cornia, M., Baraldi, L., Serra, G., Cucchiara, R.: Predicting human eye fixations via an LSTM-based saliency attentive model. IEEE Trans. Image Process. 27(10), 5142\u20135154 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"21_CR5","doi-asserted-by":"publisher","first-page":"1079","DOI":"10.3758\/s13428-012-0212-2","volume":"44","author":"R Dewhurst","year":"2012","unstructured":"Dewhurst, R., Nystr\u00f6m, M., Jarodzka, H., Foulsham, T., Johansson, R., Holmqvist, K.: It depends on how you look at it: scanpath comparison in multiple dimensions with multimatch, a vector-based approach. Behav. Res. Methods 44, 1079\u20131100 (2012)","journal-title":"Behav. Res. Methods"},{"key":"21_CR6","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16 \\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"issue":"2","key":"21_CR7","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1037\/0096-3445.123.2.161","volume":"123","author":"R Egly","year":"1994","unstructured":"Egly, R., Driver, J., Rafal, R.D.: Shifting visual attention between objects and locations: evidence from normal and parietal lesion subjects. J. Exp. Psychol. Gen. 123(2), 161 (1994)","journal-title":"J. Exp. Psychol. Gen."},{"issue":"14","key":"21_CR8","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1167\/8.14.18","volume":"8","author":"W Einh\u00e4user","year":"2008","unstructured":"Einh\u00e4user, W., Spain, M., Perona, P.: Objects predict fixations better than early saliency. J. Vis. 8(14), 18 (2008)","journal-title":"J. Vis."},{"issue":"11","key":"21_CR9","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/34.730558","volume":"20","author":"L Itti","year":"1998","unstructured":"Itti, L., Koch, C., Niebur, E.: A model of saliency-based visual attention for rapid scene analysis. IEEE Trans. Pattern Anal. Mach. Intell. 20(11), 1254\u20131259 (1998)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"21_CR10","unstructured":"Judd, T., Durand, F., Torralba, A.: A benchmark of computational models of saliency to predict human fixations (2012)"},{"key":"21_CR11","doi-asserted-by":"crossref","unstructured":"Judd, T., Ehinger, K., Durand, F., Torralba, A.: Learning to predict where humans look. In: 2009 IEEE 12th International Conference on Computer Vision, pp. 2106\u20132113. IEEE (2009)","DOI":"10.1109\/ICCV.2009.5459462"},{"issue":"2","key":"21_CR12","doi-asserted-by":"publisher","first-page":"175","DOI":"10.1016\/0010-0285(92)90007-O","volume":"24","author":"D Kahneman","year":"1992","unstructured":"Kahneman, D., Treisman, A., Gibbs, B.J.: The reviewing of object files: object-specific integration of information. Cogn. Psychol. 24(2), 175\u2013219 (1992)","journal-title":"Cogn. Psychol."},{"issue":"4","key":"21_CR13","first-page":"219","volume":"4","author":"C Koch","year":"1985","unstructured":"Koch, C., Ullman, S.: Shifts in selective visual attention: towards the underlying neural circuitry. Hum. Neurobiol. 4(4), 219\u2013227 (1985)","journal-title":"Hum. 
Neurobiol."},{"issue":"6","key":"21_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3414685.3417820","volume":"39","author":"B Krajancich","year":"2020","unstructured":"Krajancich, B., Kellnhofer, P., Wetzstein, G.: Optimizing depth perception in virtual and augmented reality through gaze-contingent stereo rendering. ACM Trans. Graph. (TOG) 39(6), 1\u201310 (2020)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"21_CR15","doi-asserted-by":"crossref","unstructured":"Kummerer, M., Wallis, T.S., Bethge, M.: Saliency benchmarking made easy: separating models, maps and metrics. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 770\u2013787 (2018)","DOI":"10.1007\/978-3-030-01270-0_47"},{"issue":"3","key":"21_CR16","doi-asserted-by":"publisher","first-page":"374","DOI":"10.1093\/jcr\/ucab017","volume":"48","author":"R Van der Lans","year":"2021","unstructured":"Van der Lans, R., Pieters, R., Wedel, M.: Online advertising suppresses visual competition during planned purchases. J. Consum. Res. 48(3), 374\u2013393 (2021)","journal-title":"J. Consum. Res."},{"key":"21_CR17","unstructured":"Levenshtein, V.I., et\u00a0al.: Binary codes capable of correcting deletions, insertions, and reversals. In: Soviet Physics Doklady, vol.\u00a010, pp. 707\u2013710. Soviet Union (1966)"},{"key":"21_CR18","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"21_CR19","doi-asserted-by":"crossref","unstructured":"Linardos, A., K\u00fcmmerer, M., Press, O., Bethge, M.: Deepgaze IIE: calibrated prediction in and out-of-domain for state-of-the-art saliency modeling. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12919\u201312928 (2021)","DOI":"10.1109\/ICCV48922.2021.01268"},{"key":"21_CR20","doi-asserted-by":"publisher","first-page":"455","DOI":"10.1016\/j.neucom.2022.04.080","volume":"494","author":"J Lou","year":"2022","unstructured":"Lou, J., Lin, H., Marshall, D., Saupe, D., Liu, H.: Transalnet: towards perceptually relevant visual saliency prediction. Neurocomputing 494, 455\u2013467 (2022)","journal-title":"Neurocomputing"},{"key":"21_CR21","doi-asserted-by":"crossref","unstructured":"Mondal, S., Yang, Z., Ahn, S., Samaras, D., Zelinsky, G., Hoai, M.: GazeFormer: scalable, effective and fast prediction of goal-directed human attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1441\u20131450 (2023)","DOI":"10.1109\/CVPR52729.2023.00145"},{"key":"21_CR22","unstructured":"Niebur, E.: Computational architectures for attention. The attentive brain (1998)"},{"issue":"8","key":"21_CR23","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1167\/10.8.20","volume":"10","author":"A Nuthmann","year":"2010","unstructured":"Nuthmann, A., Henderson, J.M.: Object-based attentional selection in scene viewing. J. Vis. 10(8), 20 (2010)","journal-title":"J. Vis."},{"issue":"6753","key":"21_CR24","doi-asserted-by":"publisher","first-page":"584","DOI":"10.1038\/44134","volume":"401","author":"KM O\u2019Craven","year":"1999","unstructured":"O\u2019Craven, K.M., Downing, P.E., Kanwisher, N.: fMRI evidence for objects as the units of attentional selection. 
Nature 401(6753), 584\u2013587 (1999)","journal-title":"Nature"},{"issue":"5","key":"21_CR25","doi-asserted-by":"publisher","first-page":"2","DOI":"10.1167\/13.5.2","volume":"13","author":"M Pajak","year":"2013","unstructured":"Pajak, M., Nuthmann, A.: Object-based saccadic selection during scene perception: evidence from viewing position effects. J. Vis. 13(5), 2 (2013)","journal-title":"J. Vis."},{"key":"21_CR26","doi-asserted-by":"crossref","unstructured":"Rivu, R., Abdrabou, Y., Pfeuffer, K., Esteves, A., Meitner, S., Alt, F.: Stare: gaze-assisted face-to-face communication in augmented reality. In: ACM Symposium on Eye Tracking Research and Applications, pp.\u00a01\u20135 (2020)","DOI":"10.1145\/3379157.3388930"},{"key":"21_CR27","unstructured":"Schwinn, L., Precup, D., Eskofier, B., Zanca, D.: Behind the machine\u2019s gaze: neural networks with biologically-inspired constraints exhibit human-like visual attention. arXiv preprint arXiv:2204.09093 (2022)"},{"key":"21_CR28","doi-asserted-by":"crossref","unstructured":"Shaw, P., Uszkoreit, J., Vaswani, A.: Self-attention with relative position representations. arXiv preprint arXiv:1803.02155 (2018)","DOI":"10.18653\/v1\/N18-2074"},{"key":"21_CR29","unstructured":"Shen, Y., Mo, X., Krisciunas, V., Hanson, D., Shi, B.E.: Intention estimation via gaze for robot guidance in hierarchical tasks. In: Annual Conference on Neural Information Processing Systems, pp. 140\u2013164. PMLR (2023)"},{"key":"21_CR30","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"297","DOI":"10.1007\/978-3-030-00111-7_25","volume-title":"KI 2018: Advances in Artificial Intelligence","author":"S Stauden","year":"2018","unstructured":"Stauden, S., Barz, M., Sonntag, D.: Visual search target inference using bag of deep visual words. In: Trollmann, F., Turhan, A.-Y. (eds.) KI 2018. LNCS (LNAI), vol. 11117, pp. 297\u2013304. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-00111-7_25"},{"key":"21_CR31","doi-asserted-by":"crossref","unstructured":"Tonini, F., Dall\u2019Asen, N., Beyan, C., Ricci, E.: Object-aware gaze target detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 21860\u201321869 (2023)","DOI":"10.1109\/ICCV51070.2023.01998"},{"key":"21_CR32","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"21_CR33","doi-asserted-by":"crossref","unstructured":"Wedel, M., Pieters, R., van\u00a0der Lans, R.: Eye tracking methodology for research in consumer psychology. In: Handbook of Research Methods in Consumer Psychology, pp. 276\u2013292. Routledge (2019)","DOI":"10.4324\/9781351137713-15"},{"key":"21_CR34","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Predicting goal-directed human attention using inverse reinforcement learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 193\u2013202 (2020)","DOI":"10.1109\/CVPR42600.2020.00027"},{"key":"21_CR35","unstructured":"Yang, Z., Mondal, S., Ahn, S., Zelinsky, G., Hoai, M., Samaras, D.: Predicting human attention using computational attention. arXiv preprint arXiv:2303.09383 (2023)"},{"key":"21_CR36","doi-asserted-by":"crossref","unstructured":"Zelinsky, G., et al.: Benchmarking gaze prediction for categorical visual search. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00111"},{"key":"21_CR37","doi-asserted-by":"crossref","unstructured":"Zelinsky, G.J., et al.: Predicting goal-directed attention control using inverse-reinforcement learning. Neurons Behav. Data Anal. Theory 2021 (2021)","DOI":"10.51628\/001c.22322"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73001-6_21","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T11:14:50Z","timestamp":1732619690000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73001-6_21"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031730009","9783031730016"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73001-6_21","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
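The record above is the verbatim response of the public Crossref REST API for this chapter's DOI, wrapped in the usual "status" / "message-type" / "message" envelope. As a minimal sketch, assuming only network access and the Python standard library, the same record can be fetched and its key fields read as follows; the field names match the JSON above, while the User-Agent string is an illustrative placeholder:

import json
import urllib.request

# DOI of the chapter described by the record above.
DOI = "10.1007/978-3-031-73001-6_21"

# Public Crossref REST API; a "work" response wraps the record in a
# {"status": ..., "message": {...}} envelope, as seen above. Crossref asks
# clients to send a descriptive User-Agent; the value here is a placeholder.
url = "https://api.crossref.org/works/" + DOI
request = urllib.request.Request(url, headers={"User-Agent": "metadata-example/0.1"})

with urllib.request.urlopen(request) as resp:
    work = json.load(resp)["message"]

# Field names below match the record shown in this document.
print(work["title"][0])                                    # chapter title
print(", ".join(a["given"] + " " + a["family"] for a in work["author"]))
print(" / ".join(work["container-title"]))                 # series and proceedings title
print("DOI:", work["DOI"], "pages:", work["page"])
print("cited references:", work["references-count"])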