{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,8]],"date-time":"2026-01-08T21:46:08Z","timestamp":1767908768550,"version":"3.49.0"},"publisher-location":"Cham","reference-count":79,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729454","type":"print"},{"value":"9783031729461","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T00:00:00Z","timestamp":1727827200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72946-1_14","type":"book-chapter","created":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T19:02:08Z","timestamp":1727809328000},"page":"236-255","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Look Hear: Gaze Prediction for\u00a0Speech-Directed Human Attention"],"prefix":"10.1007","author":[{"given":"Sounak","family":"Mondal","sequence":"first","affiliation":[]},{"given":"Seoyoung","family":"Ahn","sequence":"additional","affiliation":[]},{"given":"Zhibo","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Niranjan","family":"Balasubramanian","sequence":"additional","affiliation":[]},{"given":"Dimitris","family":"Samaras","sequence":"additional","affiliation":[]},{"given":"Gregory","family":"Zelinsky","sequence":"additional","affiliation":[]},{"given":"Minh","family":"Hoai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,2]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Adhanom, I.B., Griffin, N.N., MacNeilage, P., Folmer, E.: The effect of a foveated field-of-view restrictor on VR sickness. In: 2020 IEEE Conference on Virtual Reality and 3D User Interfaces (VR). IEEE (2020)","DOI":"10.1109\/VR46266.2020.00087"},{"key":"14_CR2","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: Advances in Neural Information Processing Systems (2022)"},{"issue":"2","key":"14_CR3","first-page":"190","volume":"137","author":"GT Altmann","year":"2011","unstructured":"Altmann, G.T.: Language can mediate eye movement control within 100 milliseconds, regardless of whether there is anything to move the eyes to. Acta Physiol. 137(2), 190\u2013200 (2011)","journal-title":"Acta Physiol."},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.279"},{"issue":"11","key":"14_CR5","doi-asserted-by":"publisher","first-page":"2600","DOI":"10.1111\/ans.18686","volume":"93","author":"T Bapna","year":"2023","unstructured":"Bapna, T., Valles, J., Leng, S., Pacilli, M., Nataraja, R.M.: Eye-tracking in surgery: a systematic review. ANZ J. Surg. 93(11), 2600\u20132608 (2023)","journal-title":"ANZ J. Surg."},{"issue":"1","key":"14_CR6","doi-asserted-by":"publisher","first-page":"5","DOI":"10.1167\/jov.21.1.5","volume":"21","author":"CR Bennett","year":"2021","unstructured":"Bennett, C.R., Bex, P.J., Merabet, L.B.: Assessing visual search performance using a novel dynamic naturalistic scene. J. Vis. 21(1), 5 (2021)","journal-title":"J. Vis."},{"issue":"5","key":"14_CR7","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1167\/9.5.19","volume":"9","author":"DJ Berg","year":"2009","unstructured":"Berg, D.J., Boehnke, S.E., Marino, R.A., Munoz, D.P., Itti, L.: Free viewing of dynamic stimuli by humans and monkeys. J. Vis. 9(5), 19 (2009)","journal-title":"J. Vis."},{"issue":"3","key":"14_CR8","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1109\/TPAMI.2018.2815601","volume":"41","author":"Z Bylinskii","year":"2019","unstructured":"Bylinskii, Z., Judd, T., Oliva, A., Torralba, A., Durand, F.: What do different evaluation metrics tell us about saliency models? IEEE Trans. Pattern Anal. Mach. Intell. 41(3), 740\u2013757 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"14_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1007\/978-3-030-58452-8_6","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Chen","year":"2020","unstructured":"Chen, S., Jiang, M., Yang, J., Zhao, Q.: AiR: attention with reasoning capability. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 91\u2013107. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_6"},{"key":"14_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Jiang, M., Zhao, Q.: Predicting human scanpaths in visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2021)","DOI":"10.1109\/CVPR46437.2021.01073"},{"key":"14_CR12","unstructured":"Chen, X., Ma, L., Chen, J., Jie, Z., Liu, W., Luo, J.: Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)"},{"issue":"1","key":"14_CR13","doi-asserted-by":"publisher","first-page":"8776","DOI":"10.1038\/s41598-021-87715-9","volume":"11","author":"Y Chen","year":"2021","unstructured":"Chen, Y., Yang, Z., Ahn, S., Samaras, D., Hoai, M., Zelinsky, G.: COCO-Search18 fixation dataset for predicting goal-directed attention control. Sci. Rep. 11(1), 8776 (2021)","journal-title":"Sci. Rep."},{"key":"14_CR14","doi-asserted-by":"crossref","unstructured":"Chen, Y., et al.: Characterizing target-absent human attention. In: Proceedings of CVPR International Workshop on Gaze Estimation and Prediction in the Wild (2022)","DOI":"10.1109\/CVPRW56347.2022.00551"},{"issue":"5","key":"14_CR15","doi-asserted-by":"publisher","first-page":"2362","DOI":"10.3390\/app12052362","volume":"12","author":"J Chung","year":"2022","unstructured":"Chung, J., Lee, H., Moon, H., Lee, E.: The static and dynamic analyses of drivers\u2019 gaze movement using VR driving simulator. Appl. Sci. 12(5), 2362 (2022)","journal-title":"Appl. Sci."},{"issue":"1","key":"14_CR16","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1016\/0010-0285(74)90005-X","volume":"6","author":"RM Cooper","year":"1974","unstructured":"Cooper, R.M.: The control of eye fixation by the meaning of spoken language: a new methodology for the real-time investigation of speech perception, memory, and language processing. Cogn. Psychol. 6(1), 84\u2013107 (1974)","journal-title":"Cogn. Psychol."},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: TransVG: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"14_CR18","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers) (2019)"},{"key":"14_CR19","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"14_CR20","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"14_CR21","doi-asserted-by":"crossref","unstructured":"He, S., Tavakoli, H.R., Borji, A., Pugeault, N.: Human attention in image captioning: dataset and analysis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00862"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Henderson, J.M., Brockmole, J.R., Castelhano, M.S., Mack, M.: Visual saliency does not account for eye movements during visual search in real-world scenes. In: Eye Movements, pp. 537\u2013III. Elsevier (2007)","DOI":"10.1016\/B978-008044980-7\/50027-6"},{"issue":"2","key":"14_CR23","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1109\/TPAMI.2019.2911066","volume":"44","author":"R Hong","year":"2019","unstructured":"Hong, R., Liu, D., Mo, X., He, X., Zhang, H.: Learning to compose and reason with language tree structures for visual grounding. IEEE Trans. Pattern Anal. Mach. Intell. 44(2), 684\u2013696 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR24","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., Andreas, J., Darrell, T., Saenko, K.: Modeling relationships in referential expressions with compositional modular networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.470"},{"issue":"11","key":"14_CR25","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/34.730558","volume":"20","author":"L Itti","year":"1998","unstructured":"Itti, L., Koch, C., Niebur, E.: A model of saliency-based visual attention for rapid scene analysis. IEEE Trans. Pattern Anal. Mach. Intell. 20(11), 1254\u20131259 (1998)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR26","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning (2021)"},{"issue":"1\u20132","key":"14_CR27","doi-asserted-by":"publisher","first-page":"107","DOI":"10.1016\/j.cviu.2004.10.009","volume":"100","author":"T Jost","year":"2005","unstructured":"Jost, T., Ouerhani, N., Von Wartburg, R., M\u00fcri, R., H\u00fcgli, H.: Assessing the contribution of color in visual attention. Comput. Vis. Image Underst. 100(1\u20132), 107\u2013123 (2005)","journal-title":"Comput. Vis. Image Underst."},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: MDETR-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"issue":"1","key":"14_CR29","doi-asserted-by":"publisher","first-page":"133","DOI":"10.1016\/S0749-596X(03)00023-8","volume":"49","author":"Y Kamide","year":"2003","unstructured":"Kamide, Y., Altmann, G.T., Haywood, S.L.: The time-course of prediction in incremental sentence processing: evidence from anticipatory eye movements. J. Mem. Lang. 49(1), 133\u2013156 (2003)","journal-title":"J. Mem. Lang."},{"key":"14_CR30","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferitGame: referring to objects in photographs of natural scenes. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"14_CR31","doi-asserted-by":"crossref","unstructured":"Khokhar, A., Yoshimura, A., Borst, C.: Eye-gaze-triggered visual cues to restore attention in educational VR. In: 2019 IEEE Conference on Virtual Reality and 3D User Interfaces (VR), Poster (2019)","DOI":"10.1109\/VR.2019.8798327"},{"issue":"2","key":"14_CR32","doi-asserted-by":"publisher","first-page":"66","DOI":"10.1111\/lnc3.12177","volume":"10","author":"P Knoeferle","year":"2016","unstructured":"Knoeferle, P., Guerra, E.: Visually situated language comprehension. Lang. Linguist. Compass 10(2), 66\u201382 (2016)","journal-title":"Lang. Linguist. Compass"},{"issue":"3","key":"14_CR33","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1167\/14.3.14","volume":"14","author":"K Koehler","year":"2014","unstructured":"Koehler, K., Guo, F., Zhang, S., Eckstein, M.P.: What do saliency models predict? J. Vis. 14(3), 14 (2014)","journal-title":"J. Vis."},{"key":"14_CR34","doi-asserted-by":"crossref","unstructured":"Kuo, C.W., Kira, Z.: Beyond a pre-trained object detector: cross-modal textual and visual context for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01744"},{"key":"14_CR35","doi-asserted-by":"crossref","unstructured":"Lang, Y., Wei, L., Xu, F., Zhao, Y., Yu, L.F.: Synthesizing personalized training programs for improving driving habits via virtual reality. In: 2018 IEEE Conference on Virtual Reality and 3D User Interfaces (VR). IEEE (2018)","DOI":"10.1109\/VR.2018.8448290"},{"issue":"2","key":"14_CR36","doi-asserted-by":"publisher","first-page":"9","DOI":"10.1167\/jov.24.2.9","volume":"24","author":"E Lavoie","year":"2024","unstructured":"Lavoie, E., Hebert, J.S., Chapman, C.S.: Comparing eye-hand coordination between controller-mediated virtual reality, and a real-world object interaction task. J. Vis. 24(2), 9 (2024)","journal-title":"J. Vis."},{"key":"14_CR37","first-page":"707","volume":"10","author":"VI Levenshtein","year":"1965","unstructured":"Levenshtein, V.I.: Binary codes capable of correcting deletions, insertions, and reversals. Sov. Phys. Dokl. 10, 707\u2013710 (1965)","journal-title":"Sov. Phys. Dokl."},{"key":"14_CR38","unstructured":"Li, L.H., Yatskar, M., Yin, D., Hsieh, C.J., Chang, K.W.: VisualBERT: a simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)"},{"key":"14_CR39","unstructured":"Li, P., et al.: TOIST: task oriented instance segmentation transformer with noun-pronoun distillation. In: Advances in Neural Information Processing Systems (2022)"},{"key":"14_CR40","unstructured":"Li, Y., et al.: Understanding embodied reference with touch-line transformer. In: International Conference on Learning Representations (2023)"},{"key":"14_CR41","doi-asserted-by":"crossref","unstructured":"Liao, Y., et al.: A real-time cross-modality correlation filtering method for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"14_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"14_CR43","doi-asserted-by":"crossref","unstructured":"Liu, D., Zhang, H., Wu, F., Zha, Z.J.: Learning to assemble neural module tree networks for visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2019)","DOI":"10.1109\/ICCV.2019.00477"},{"key":"14_CR44","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"14_CR45","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.9"},{"issue":"11","key":"14_CR46","doi-asserted-by":"publisher","first-page":"25","DOI":"10.1167\/9.11.25","volume":"9","author":"CM Masciocchi","year":"2009","unstructured":"Masciocchi, C.M., Mihalas, S., Parkhurst, D., Niebur, E.: Everyone knows what is interesting: salient locations which should be fixated. J. Vis. 9(11), 25 (2009)","journal-title":"J. Vis."},{"key":"14_CR47","doi-asserted-by":"crossref","unstructured":"McAuliffe, M., Socolof, M., Mihuc, S., Wagner, M., Sonderegger, M.: Montreal forced aligner: trainable text-speech alignment using kaldi. In: Interspeech (2017)","DOI":"10.21437\/Interspeech.2017-1386"},{"key":"14_CR48","doi-asserted-by":"crossref","unstructured":"Mensink, T., et al.: Encyclopedic VQA: visual questions about detailed properties of fine-grained categories. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00289"},{"key":"14_CR49","doi-asserted-by":"crossref","unstructured":"Min, K., Corso, J.J.: Integrating human gaze into attention for egocentric activity recognition. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2021)","DOI":"10.1109\/WACV48630.2021.00111"},{"key":"14_CR50","doi-asserted-by":"crossref","unstructured":"Mondal, S., Yang, Z., Ahn, S., Samaras, D., Zelinsky, G., Hoai, M.: Gazeformer: scalable, effective and fast prediction of goal-directed human attention. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00145"},{"issue":"3","key":"14_CR51","doi-asserted-by":"publisher","first-page":"443","DOI":"10.1016\/0022-2836(70)90057-4","volume":"48","author":"SB Needleman","year":"1970","unstructured":"Needleman, S.B., Wunsch, C.D.: A general method applicable to the search for similarities in the amino acid sequence of two proteins. J. Mol. Biol. 48(3), 443\u2013453 (1970)","journal-title":"J. Mol. Biol."},{"key":"14_CR52","doi-asserted-by":"crossref","unstructured":"Pai, Y.S., Tag, B., Outram, B., Vontin, N., Sugiura, K., Kunze, K.: GazeSim: simulating foveated rendering using depth in eye gaze for VR. In: ACM SIGGRAPH 2016 Posters (2016)","DOI":"10.1145\/2945078.2945153"},{"issue":"8","key":"14_CR53","doi-asserted-by":"publisher","first-page":"692","DOI":"10.1167\/5.8.692","volume":"5","author":"RJ Peters","year":"2005","unstructured":"Peters, R.J., Iyer, A., Koch, C., Itti, L.: Components of bottom-up gaze allocation in natural scenes. J. Vis. 5(8), 692 (2005)","journal-title":"J. Vis."},{"key":"14_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"647","DOI":"10.1007\/978-3-030-58558-7_38","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Pont-Tuset","year":"2020","unstructured":"Pont-Tuset, J., Uijlings, J., Changpinyo, S., Soricut, R., Ferrari, V.: Connecting vision and language with localized narratives. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 647\u2013664. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58558-7_38"},{"key":"14_CR55","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (2021)"},{"key":"14_CR56","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: A metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"14_CR57","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (2016)","DOI":"10.18653\/v1\/P16-1162"},{"issue":"5217","key":"14_CR58","doi-asserted-by":"publisher","first-page":"1632","DOI":"10.1126\/science.7777863","volume":"268","author":"MK Tanenhaus","year":"1995","unstructured":"Tanenhaus, M.K., Spivey-Knowlton, M.J., Eberhard, K.M., Sedivy, J.C.: Integration of visual and linguistic information in spoken language comprehension. Science 268(5217), 1632\u20131634 (1995)","journal-title":"Science"},{"key":"14_CR59","doi-asserted-by":"crossref","unstructured":"Tanenhaus, M.K., Spivey-Knowlton, M.J., Eberhard, K.M., Sedivy, J.C.: Using eye movements to study spoken language comprehension: evidence for visually mediated incremental interpretation (1996)","DOI":"10.7551\/mitpress\/1479.003.0029"},{"key":"14_CR60","doi-asserted-by":"publisher","first-page":"138","DOI":"10.26817\/16925777.291","volume":"11","author":"NC Thanh","year":"2015","unstructured":"Thanh, N.C.: The differences between spoken and written grammar in English, in comparison with Vietnamese (las diferencias entre la gram\u00e1tica oral y escrita del idioma ingl\u00e9s en comparaci\u00f3n con el idioma vietnamita). Gist Educ. Learn. Res. J. 11, 138\u2013153 (2015)","journal-title":"Gist Educ. Learn. Res. J."},{"key":"14_CR61","unstructured":"Townend, J., Walker, J.: Structure of Language: Spoken and Written English. Whurr Publishers (2006)"},{"issue":"7","key":"14_CR62","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1167\/jov.20.7.13","volume":"20","author":"P Vaidyanathan","year":"2020","unstructured":"Vaidyanathan, P., Prud\u2019hommeaux, E., Alm, C.O., Pelz, J.B.: Computational framework for fusing eye movements and spoken narratives for image annotation. J. Vis. 20(7), 13 (2020)","journal-title":"J. Vis."},{"key":"14_CR63","doi-asserted-by":"crossref","unstructured":"Vaidyanathan, P., Prud\u2019hommeaux, E., Pelz, J.B., Alm, C.O.: SNAG: spoken narratives and gaze dataset. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/P18-2022"},{"key":"14_CR64","doi-asserted-by":"crossref","unstructured":"Vasudevan, A.B., Dai, D., Van\u00a0Gool, L.: Object referring in videos with language and human gaze. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00434"},{"key":"14_CR65","doi-asserted-by":"crossref","unstructured":"Vasudevan, A.B., Dai, D., Van\u00a0Gool, L.: Object referring in visual scene with spoken language. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (2018)","DOI":"10.1109\/WACV.2018.00206"},{"key":"14_CR66","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems (2017)"},{"key":"14_CR67","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning (2022)"},{"issue":"2","key":"14_CR68","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1162\/neco.1989.1.2.270","volume":"1","author":"RJ Williams","year":"1989","unstructured":"Williams, R.J., Zipser, D.: A learning algorithm for continually running fully recurrent neural networks. Neural Comput. 1(2), 270\u2013280 (1989)","journal-title":"Neural Comput."},{"key":"14_CR69","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"14_CR70","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Predicting goal-directed human attention using inverse reinforcement learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2020)","DOI":"10.1109\/CVPR42600.2020.00027"},{"key":"14_CR71","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Unifying top-down and bottom-up scanpath prediction using transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.00166"},{"key":"14_CR72","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1007\/978-3-031-19772-7_4","volume-title":"European Conference on Computer Vision 2022","author":"Z Yang","year":"2022","unstructured":"Yang, Z., Mondal, S., Ahn, S., Zelinsky, G., Hoai, M., Samaras, D.: Target-absent human attention. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13664, pp. 52\u201368. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19772-7_4"},{"key":"14_CR73","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: CoCa: contrastive captioners are image-text foundation models. Trans. Mach. Learn. Res. (2022). https:\/\/openreview.net\/forum?id=Ee277P3AYC"},{"key":"14_CR74","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"14_CR75","unstructured":"Yuan, L., et al.: Florence: a new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)"},{"key":"14_CR76","doi-asserted-by":"crossref","unstructured":"Zelinsky, G., et al.: Benchmarking gaze prediction for categorical visual search. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshops (2019)","DOI":"10.1109\/CVPRW.2019.00111"},{"key":"14_CR77","doi-asserted-by":"crossref","unstructured":"Zelinsky, G.J., Chen, Y., Ahn, S., Adeli, H.: Changing perspectives on goal-directed attention control: the past, present, and future of modeling fixations during visual search. In: Psychology of Learning and Motivation, vol.\u00a073, pp. 231\u2013286. Elsevier (2020)","DOI":"10.1016\/bs.plm.2020.08.001"},{"key":"14_CR78","doi-asserted-by":"crossref","unstructured":"Zelinsky, G.J., et al.: Predicting goal-directed attention control using inverse-reinforcement learning. Neurons Behav. Data Anal. Theory (2), 1\u20139 (2021)","DOI":"10.51628\/001c.22322"},{"key":"14_CR79","doi-asserted-by":"crossref","unstructured":"Zhang, D., Tian, Y., Chen, K., Qian, K.: Gaze-directed visual grounding under object referring uncertainty. In: 2022 41st Chinese Control Conference (CCC). IEEE (2022)","DOI":"10.23919\/CCC55666.2022.9902263"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72946-1_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T23:35:41Z","timestamp":1732836941000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72946-1_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,2]]},"ISBN":["9783031729454","9783031729461"],"references-count":79,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72946-1_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,2]]},"assertion":[{"value":"2 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}