{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T00:28:23Z","timestamp":1760056103001,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031723377"},{"type":"electronic","value":"9783031723384"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-72338-4_28","type":"book-chapter","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T10:03:01Z","timestamp":1726480981000},"page":"415-429","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Gaze Target Detection with\u00a0Visual Prompt Tuning Based on\u00a0Attention"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2377-0855","authenticated-orcid":false,"given":"Ting","family":"Huang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0427-508X","authenticated-orcid":false,"given":"Jian","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,17]]},"reference":[{"key":"28_CR1","unstructured":"Cordonnier, J.B., Loukas, A., Jaggi, M.: On the relationship between self-attention and convolutional layers. 
arXiv preprint arXiv:1911.03584 (2019)"},{"key":"28_CR2","doi-asserted-by":"crossref","unstructured":"Jin, S., Wang, Z., Wang, L., Bi, N., Nguyen, T.: Redirtrans: latent-to-latent translation for gaze and head redirection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5547\u20135556 (2023)","DOI":"10.1109\/CVPR52729.2023.00537"},{"key":"28_CR3","doi-asserted-by":"publisher","first-page":"83","DOI":"10.1007\/BF01000006","volume":"10","author":"A Abele","year":"1986","unstructured":"Abele, A.: Functions of gaze in social interaction: communication and monitoring. J. Nonverbal Behav. 10, 83\u2013101 (1986)","journal-title":"J. Nonverbal Behav."},{"key":"28_CR4","doi-asserted-by":"crossref","unstructured":"Andrist, S., Tan, X.Z., Gleicher, M., Mutlu, B.: Conversational gaze aversion for humanlike robots. In: Proceedings of the 2014 ACM\/IEEE International Conference on Human-Robot Interaction, pp. 25\u201332 (2014)","DOI":"10.1145\/2559636.2559666"},{"key":"28_CR5","doi-asserted-by":"crossref","unstructured":"Huang, M.X., Li, J., Ngai, G., Leong, H.V.: Stressclick: sensing stress from gaze-click patterns. In: Proceedings of the 24th ACM International Conference on Multimedia, pp. 1395\u20131404 (2016)","DOI":"10.1145\/2964284.2964318"},{"key":"28_CR6","doi-asserted-by":"crossref","unstructured":"Feit, A.M., et al.: Toward everyday gaze input: accuracy and precision of eye tracking and implications for design. In: Proceedings of the 2017 Chi Conference on Human Factors in Computing Systems, pp. 1118\u20131130 (2017)","DOI":"10.1145\/3025453.3025599"},{"key":"28_CR7","doi-asserted-by":"crossref","unstructured":"Jacob, R.J., Karn, K.S.: Eye tracking in human-computer interaction and usability research: ready to deliver the promises. In: The Mind\u2019s Eye, pp. 
573\u2013605 (2003)","DOI":"10.1016\/B978-044451020-4\/50031-1"},{"key":"28_CR8","doi-asserted-by":"crossref","unstructured":"Schwehr, J., Willert, V.: Driver\u2019s gaze prediction in dynamic automotive scenes. In: 2017 IEEE 20th International Conference on Intelligent Transportation Systems (ITSC), pp. 1\u20138. IEEE (2017)","DOI":"10.1109\/ITSC.2017.8317586"},{"key":"28_CR9","doi-asserted-by":"crossref","unstructured":"Burova, A., et al.: Utilizing VR and gaze tracking to develop AR solutions for industrial maintenance. In: Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems, pp. 1\u201313 (2020)","DOI":"10.1145\/3313831.3376405"},{"issue":"4","key":"28_CR10","doi-asserted-by":"publisher","first-page":"1633","DOI":"10.1109\/TVCG.2018.2793599","volume":"24","author":"V Sitzmann","year":"2018","unstructured":"Sitzmann, V., et al.: Saliency in VR: how do people explore virtual environments? IEEE Trans. Visual Comput. Graphics 24(4), 1633\u20131642 (2018)","journal-title":"IEEE Trans. Visual Comput. Graphics"},{"key":"28_CR11","unstructured":"Recasens, A., Khosla, A., Vondrick, C., Torralba, A.: Where are they looking? Adv. Neural Inf. Process. Syst. 28 (2015)"},{"key":"28_CR12","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural Inf. Process. Syst. 25 (2012)"},{"key":"28_CR13","doi-asserted-by":"crossref","unstructured":"Chong, E., Wang, Y., Ruiz, N., Rehg, J.M.: Detecting attended visual targets in video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5396\u20135406 (2020)","DOI":"10.1109\/CVPR42600.2020.00544"},{"key":"28_CR14","doi-asserted-by":"crossref","unstructured":"Chong, E., Ruiz, N., Wang, Y., Zhang, Y., Rozga, A., Rehg, J.M.: Connecting gaze, scene, and attention: generalized attention estimation via joint modeling of gaze and scene saliency. 
In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 383\u2013398 (2018)","DOI":"10.1007\/978-3-030-01228-1_24"},{"key":"28_CR15","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"28_CR16","unstructured":"Ramachandran, P., Parmar, N., Vaswani, A., Bello, I., Levskaya, A., Shlens, J.: Stand-alone self-attention in vision models. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"28_CR17","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"28_CR18","unstructured":"Wu, Y., et al.: Google\u2019s neural machine translation system: bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016)"},{"key":"28_CR19","unstructured":"Chorowski, J.K., Bahdanau, D., Serdyuk, D., Cho, K., Bengio, Y.: Attention-based models for speech recognition. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"key":"28_CR20","doi-asserted-by":"crossref","unstructured":"Chan, W., Jaitly, N., Le, Q., Vinyals, O.: Listen, attend and spell: a neural network for large vocabulary conversational speech recognition. In: 2016 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 4960\u20134964. IEEE (2016)","DOI":"10.1109\/ICASSP.2016.7472621"},{"key":"28_CR21","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057. PMLR (2015)"},{"key":"28_CR22","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G.: Squeeze-and-excitation networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
7132\u20137141 (2018)","DOI":"10.1109\/CVPR.2018.00745"},{"key":"28_CR23","doi-asserted-by":"crossref","unstructured":"Tan, M., et al.: Mnasnet: platform-aware neural architecture search for mobile. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2820\u20132828 (2019)","DOI":"10.1109\/CVPR.2019.00293"},{"key":"28_CR24","doi-asserted-by":"crossref","unstructured":"Xiaolong Wang, Ross Girshick, Abhinav Gupta, and Kaiming He. Non-local neural networks. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 7794\u20137803, 2018","DOI":"10.1109\/CVPR.2018.00813"},{"key":"28_CR25","doi-asserted-by":"crossref","unstructured":"Bello, I., Zoph, B., Vaswani, A., Shlens, J., Le, Q.V.: Attention augmented convolutional networks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3286\u20133295 (2019)","DOI":"10.1109\/ICCV.2019.00338"},{"key":"28_CR26","unstructured":"Hu, J., Shen, L., Albanie, S., Sun, G., Vedaldi, A.: Gather-excite: exploiting feature context in convolutional neural networks. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"key":"28_CR27","doi-asserted-by":"crossref","unstructured":"Hu, H., Zhang, Z., Xie, Z., Lin, S.: Local relation networks for image recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3464\u20133473 (2019)","DOI":"10.1109\/ICCV.2019.00356"},{"key":"28_CR28","doi-asserted-by":"crossref","unstructured":"Jia, M., et al.: Visual prompt tuning. In: European Conference on Computer Vision, pp. 709\u2013727. Springer, Heidelberg (2022)","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"28_CR29","doi-asserted-by":"publisher","first-page":"282","DOI":"10.1007\/s11263-013-0655-7","volume":"106","author":"MJ Marin-Jimenez","year":"2014","unstructured":"Marin-Jimenez, M.J., Zisserman, A., Eichner, M., Ferrari, V.: Detecting people looking at each other in videos. Int. J. Comput. 
Vision 106, 282\u2013296 (2014)","journal-title":"Int. J. Comput. Vision"},{"key":"28_CR30","unstructured":"Jasso, H., Triesch, J., De\u00e1k, G.O.: Using eye direction cues for gaze following\u2013a developmental model. In: Yu, C., Smith, L.B., Sporns, O. (eds.) Proceedings of the International Conference on Development and Learning (ICDL 2006). Indiana University, Bloomington (2006)"},{"key":"28_CR31","doi-asserted-by":"publisher","first-page":"113","DOI":"10.1016\/j.visres.2014.10.027","volume":"116","author":"D Parks","year":"2015","unstructured":"Parks, D., Borji, A., Itti, L.: Augmented saliency model using automatic 3d head pose detection and learned gaze following in natural scenes. Vision. Res. 116, 113\u2013126 (2015)","journal-title":"Vision. Res."},{"key":"28_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"314","DOI":"10.1007\/978-3-642-33718-5_23","volume-title":"Computer Vision \u2013 ECCV 2012","author":"A Fathi","year":"2012","unstructured":"Fathi, A., Li, Y., Rehg, J.M.: Learning to recognize daily actions using gaze. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7572, pp. 314\u2013327. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33718-5_23"},{"key":"28_CR33","doi-asserted-by":"crossref","unstructured":"Miao, Q., Hoai, M., Samaras, D.: Patch-level gaze distribution prediction for gaze following. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 880\u2013889 (2023)","DOI":"10.1109\/WACV56688.2023.00094"},{"key":"28_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-030-20893-6_3","volume-title":"Computer Vision \u2013 ACCV 2018","author":"D Lian","year":"2019","unstructured":"Lian, D., Yu, Z., Gao, S.: Believe it or not, we know what you are looking at! In: Jawahar, C.V., Li, H., Mori, G., Schindler, K. (eds.) 
ACCV 2018. LNCS, vol. 11363, pp. 35\u201350. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-20893-6_3"},{"key":"28_CR35","doi-asserted-by":"crossref","unstructured":"Fang, Y., et al.: Dual attention guided gaze target detection in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11390\u201311399 (2021)","DOI":"10.1109\/CVPR46437.2021.01123"},{"key":"28_CR36","doi-asserted-by":"crossref","unstructured":"Tu, D., Min, X., Duan, H., Guo, G., Zhai, G., Shen, W.: End-to-end human-gaze-target detection with transformers. In: 2022 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2192\u20132200. IEEE (2022)","DOI":"10.1109\/CVPR52688.2022.00224"},{"key":"28_CR37","doi-asserted-by":"crossref","unstructured":"Bao, J., Liu, B., Yu, J.: Escnet: gaze target detection with the understanding of 3d scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14126\u201314135 (2022)","DOI":"10.1109\/CVPR52688.2022.01373"},{"issue":"12","key":"28_CR38","doi-asserted-by":"publisher","first-page":"8524","DOI":"10.1109\/TCSVT.2022.3190314","volume":"32","author":"H Zhengxi","year":"2022","unstructured":"Zhengxi, H.: Gaze target estimation inspired by interactive attention. IEEE Trans. Circuits Syst. Video Technol. 32(12), 8524\u20138536 (2022)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"28_CR39","doi-asserted-by":"crossref","unstructured":"Tonini, F., Dall\u2019Asen, N., Beyan, C., Ricci, E.: Object-aware gaze target detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 21860\u201321869 (2023)","DOI":"10.1109\/ICCV51070.2023.01998"},{"key":"28_CR40","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. 
arXiv preprint arXiv:1409.0473 (2014)"},{"key":"28_CR41","doi-asserted-by":"crossref","unstructured":"Luong, M.T., Pham, H., Manning, C.D.: Effective approaches to attention-based neural machine translation. arXiv preprint arXiv:1508.04025 (2015)","DOI":"10.18653\/v1\/D15-1166"},{"key":"28_CR42","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"28_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-End object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"28_CR44","unstructured":"Parmar, N., et al.: Image transformer. In: International Conference on Machine Learning, pp. 4055\u20134064. PMLR (2018)"},{"issue":"14","key":"28_CR45","first-page":"71","volume":"8","author":"MDM Reddy","year":"2021","unstructured":"Reddy, M.D.M., Basha, M.S.M., Hari, M.M.C., Penchalaiah, M.N.: Dall-e: creating images from text. UGC Care Group I J. 8(14), 71\u201375 (2021)","journal-title":"UGC Care Group I J."},{"key":"28_CR46","doi-asserted-by":"crossref","unstructured":"Xu, T., et al.: Attngan: fine-grained text to image generation with attentional generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"key":"28_CR47","unstructured":"Lee, K., Chang, H., Jiang, L., Zhang, H., Tu, Z., Liu, C.: Vitgan: training gans with vision transformers. 
arXiv preprint arXiv:2107.04589 (2021)"},{"key":"28_CR48","first-page":"11285","volume":"33","author":"H Cai","year":"2020","unstructured":"Cai, H., Gan, C., Zhu, L., Han, S.: Tinytl: reduce memory, not parameters for efficient on-device learning. Adv. Neural. Inf. Process. Syst. 33, 11285\u201311297 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR49","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"698","DOI":"10.1007\/978-3-030-58580-8_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"JO Zhang","year":"2020","unstructured":"Zhang, J.O., Sax, A., Zamir, A., Guibas, L., Malik, J.: Side-tuning: a baseline for network adaptation via additive side networks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12348, pp. 698\u2013714. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58580-8_41"},{"key":"28_CR50","doi-asserted-by":"crossref","unstructured":"Judd, T., Ehinger, K., Durand, F., Torralba, A.: Learning to predict where humans look. In: 2009 IEEE 12th International Conference on Computer Vision, pp. 2106\u20132113. IEEE (2009)","DOI":"10.1109\/ICCV.2009.5459462"},{"key":"28_CR51","doi-asserted-by":"crossref","unstructured":"Tonini, F., Beyan, C., Ricci, E.: Multimodal across domains gaze target detection. In: Proceedings of the 2022 International Conference on Multimodal Interaction, pp. 
420\u2013431 (2022)","DOI":"10.1145\/3536221.3556624"}],"container-title":["Lecture Notes in Computer Science","Artificial Neural Networks and Machine Learning \u2013 ICANN 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72338-4_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T07:49:09Z","timestamp":1759996149000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72338-4_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031723377","9783031723384"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72338-4_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"17 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ICANN","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Artificial Neural Networks","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lugano","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Switzerland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"17 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"33","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icann2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}