{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:30Z","timestamp":1765357710187,"version":"3.40.3"},"publisher-location":"Cham","reference-count":68,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726729"},{"type":"electronic","value":"9783031726736"}],"license":[{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,22]],"date-time":"2024-10-22T00:00:00Z","timestamp":1729555200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72673-6_11","type":"book-chapter","created":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T16:03:50Z","timestamp":1729526630000},"page":"192-210","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Listen to\u00a0Look Into the\u00a0Future: Audio-Visual Egocentric Gaze 
Anticipation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7578-7336","authenticated-orcid":false,"given":"Bolin","family":"Lai","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fiona","family":"Ryan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenqi","family":"Jia","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Miao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"James M.","family":"Rehg","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,22]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, R., Jyoti, S., Girmaji, R., Sivaprasad, S., Gandhi, V.: Does audio help in deep audio-visual saliency prediction models? In: Proceedings of the 2022 International Conference on Multimodal Interaction, pp. 48\u201356 (2022)","DOI":"10.1145\/3536221.3556625"},{"key":"11_CR2","unstructured":"Akbari, H., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. In: Advances in Neural Information Processing Systems, vol. 34, pp. 24206\u201324221 (2021)"},{"key":"11_CR3","unstructured":"Alayrac, J.B., et al.: Self-supervised multimodal versatile networks. In: Advances in Neural Information Processing Systems, vol. 33, pp. 25\u201337 (2020)"},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Look, listen and learn. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 609\u2013617 (2017)","DOI":"10.1109\/ICCV.2017.73"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Arandjelovic, R., Zisserman, A.: Objects that sound. 
In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 435\u2013451 (2018)","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"11_CR6","unstructured":"Chang, Q., Zhu, S.: Temporal-spatial feature pyramid for video saliency detection. Cognitive Computation (2021)"},{"key":"11_CR7","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Localizing visual sounds the hard way. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16867\u201316876 (2021)","DOI":"10.1109\/CVPR46437.2021.01659"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Cheng, S., Gao, X., Song, L., Xiahou, J.: Audio-visual salieny network with audio attention module. In: 2021 2nd International Conference on Artificial Intelligence and Information Systems, pp.\u00a01\u20135 (2021)","DOI":"10.1145\/3469213.3470254"},{"key":"11_CR9","doi-asserted-by":"crossref","unstructured":"Coutrot, A., Guyader, N.: Multimodal saliency models for videos. In: From Human Attention to Computational Attention: A Multidisciplinary Approach, pp. 291\u2013304 (2016)","DOI":"10.1007\/978-1-4939-3435-5_16"},{"key":"11_CR10","doi-asserted-by":"crossref","unstructured":"Damen, D., et al.: Scaling egocentric vision: the epic-kitchens dataset. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 720\u2013736 (2018)","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"11_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2020)"},{"key":"11_CR12","doi-asserted-by":"crossref","unstructured":"Fan, H., et al.: Multiscale vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
6824\u20136835 (2021)","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"11_CR13","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: Slowfast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Gao, R., Oh, T.H., Grauman, K., Torresani, L.: Listen to look: action recognition by previewing audio. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10457\u201310467 (2020)","DOI":"10.1109\/CVPR42600.2020.01047"},{"key":"11_CR15","unstructured":"Gong, Y., et al.: Contrastive audio-visual masked autoencoder. In: International Conference on Learning Representations (2022)"},{"key":"11_CR16","unstructured":"Grauman, K., et al.: Ego4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"11_CR17","unstructured":"Gurram, S., Fang, A., Chan, D., Canny, J.: Lava: language audio vision alignment for contrastive video pre-training. arXiv preprint arXiv:2207.08024 (2022)"},{"issue":"4","key":"11_CR18","doi-asserted-by":"publisher","first-page":"188","DOI":"10.1016\/j.tics.2005.02.009","volume":"9","author":"M Hayhoe","year":"2005","unstructured":"Hayhoe, M., Ballard, D.: Eye movements in natural behavior. Trends Cogn. Sci. 9(4), 188\u2013194 (2005)","journal-title":"Trends Cogn. Sci."},{"key":"11_CR19","doi-asserted-by":"crossref","unstructured":"Hu, D., Nie, F., Li, X.: Deep multimodal clustering for unsupervised audiovisual learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
9248\u20139257 (2019)","DOI":"10.1109\/CVPR.2019.00947"},{"key":"11_CR20","unstructured":"Hu, D., et al.: Discriminative sounding objects localization via self-supervised audiovisual matching. In: Advances in Neural Information Processing Systems, vol. 33, pp. 10077\u201310087 (2020)"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Hu, X., Chen, Z., Owens, A.: Mix and localize: localizing sound sources in mixtures. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10483\u201310492 (2022)","DOI":"10.1109\/CVPR52688.2022.01023"},{"key":"11_CR22","doi-asserted-by":"crossref","unstructured":"Huang, C., Tian, Y., Kumar, A., Xu, C.: Egocentric audio-visual object localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22910\u201322921 (2023)","DOI":"10.1109\/CVPR52729.2023.02194"},{"key":"11_CR23","doi-asserted-by":"publisher","first-page":"7795","DOI":"10.1109\/TIP.2020.3007841","volume":"29","author":"Y Huang","year":"2020","unstructured":"Huang, Y., Cai, M., Li, Z., Lu, F., Sato, Y.: Mutual context network for jointly estimating egocentric gaze and action. IEEE Trans. Image Process. 29, 7795\u20137806 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR24","doi-asserted-by":"crossref","unstructured":"Huang, Y., Cai, M., Li, Z., Sato, Y.: Predicting gaze in egocentric video by learning task-dependent attention transition. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 754\u2013769 (2018)","DOI":"10.1007\/978-3-030-01225-0_46"},{"issue":"4","key":"11_CR25","doi-asserted-by":"publisher","first-page":"306","DOI":"10.1109\/THMS.2020.2965429","volume":"50","author":"Y Huang","year":"2020","unstructured":"Huang, Y., Cai, M., Sato, Y.: An ego-vision system for discovering human joint attention. IEEE Trans. Hum.-Mach. Syst. 50(4), 306\u2013316 (2020)","journal-title":"IEEE Trans. Hum.-Mach. 
Syst."},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Jain, S., Yarlagadda, P., Jyoti, S., Karthik, S., Subramanian, R., Gandhi, V.: Vinet: pushing the limits of visual modality for audio-visual saliency prediction. In: 2021 IEEE\/RSJ International Conference on Intelligent Robots and Systems (IROS), pp. 3520\u20133527. IEEE (2021)","DOI":"10.1109\/IROS51168.2021.9635989"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Kazakos, E., Nagrani, A., Zisserman, A., Damen, D.: Epic-fusion: audio-visual temporal binding for egocentric action recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5492\u20135501 (2019)","DOI":"10.1109\/ICCV.2019.00559"},{"key":"11_CR28","unstructured":"Korbar, B., Tran, D., Torresani, L.: Cooperative learning of audio and video models from self-supervised synchronization. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Lai, B., Liu, M., Ryan, F., Rehg, J.: In the eye of transformer: global-local correlation for egocentric gaze estimation. In: British Machine Vision Conference (2022)","DOI":"10.1007\/s11263-023-01879-7"},{"issue":"3","key":"11_CR30","doi-asserted-by":"publisher","first-page":"854","DOI":"10.1007\/s11263-023-01879-7","volume":"132","author":"B Lai","year":"2024","unstructured":"Lai, B., Liu, M., Ryan, F., Rehg, J.M.: In the eye of transformer: global-local correlation for egocentric gaze estimation and beyond. Int. J. Comput. Vision 132(3), 854\u2013871 (2024)","journal-title":"Int. J. Comput. Vision"},{"key":"11_CR31","doi-asserted-by":"crossref","unstructured":"Lai, B., et al.: Werewolf among us: multimodal resources for modeling persuasion behaviors in social deduction games. 
In: Association for Computational Linguistics: ACL 2023 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.411"},{"key":"11_CR32","doi-asserted-by":"crossref","unstructured":"Lee, S., Lai, B., Ryan, F., Boote, B., Rehg, J.M.: Modeling multimodal social interactions: new challenges and baselines with densely aligned representations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14585\u201314595 (2024)","DOI":"10.1109\/CVPR52733.2024.01382"},{"key":"11_CR33","doi-asserted-by":"crossref","unstructured":"Li, Y., Fathi, A., Rehg, J.M.: Learning to predict gaze in egocentric video. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3216\u20133223 (2013)","DOI":"10.1109\/ICCV.2013.399"},{"issue":"6","key":"11_CR34","doi-asserted-by":"publisher","first-page":"6731","DOI":"10.1109\/TPAMI.2021.3051319","volume":"45","author":"Y Li","year":"2021","unstructured":"Li, Y., Liu, M., Rehg, J.: In the eye of the beholder: gaze and actions in first person video. IEEE Trans. Pattern Anal. Mach. Intell. 45(6), 6731\u20136747 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"11_CR35","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. In: Advances in Neural Information Processing Systems (2022)"},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Sung, Y.L., Lei, J., Bansal, M., Bertasius, G.: Vision transformers are parameter-efficient audio-visual learners. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"11_CR37","unstructured":"Lv, Z., et al.: Aria pilot dataset (2022). https:\/\/about.facebook.com\/realitylabs\/projectaria\/datasets"},{"key":"11_CR38","unstructured":"Ma, S., Zeng, Z., McDuff, D., Song, Y.: Active contrastive learning of audio-visual video representations. 
In: International Conference on Learning Representations (2020)"},{"key":"11_CR39","unstructured":"Ma, S., Zeng, Z., McDuff, D., Song, Y.: Contrastive learning of global-local video representations. arXiv preprint arXiv:2104.05418 (2021)"},{"key":"11_CR40","doi-asserted-by":"crossref","unstructured":"Min, X., Zhai, G., Gu, K., Yang, X.: Fixation prediction through multimodal analysis. ACM Trans. Multimedia Comput. Commun. Appl. (TOMM) 13(1), 1\u201323 (2016)","DOI":"10.1145\/2996463"},{"key":"11_CR41","doi-asserted-by":"publisher","first-page":"3805","DOI":"10.1109\/TIP.2020.2966082","volume":"29","author":"X Min","year":"2020","unstructured":"Min, X., Zhai, G., Zhou, J., Zhang, X.P., Yang, X., Guan, X.: A multimodal saliency model for videos with high audio-visual correspondence. IEEE Trans. Image Process. 29, 3805\u20133819 (2020)","journal-title":"IEEE Trans. Image Process."},{"key":"11_CR42","unstructured":"Morgado, P., Li, Y., Vasconcelos, N.: Learning representations from audio-visual spatial alignment. In: Advances in Neural Information Processing Systems, vol. 33, pp. 4733\u20134744 (2020)"},{"key":"11_CR43","doi-asserted-by":"crossref","unstructured":"Morgado, P., Misra, I., Vasconcelos, N.: Robust audio-visual instance discrimination. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12934\u201312945 (2021)","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"11_CR44","doi-asserted-by":"crossref","unstructured":"Morgado, P., Vasconcelos, N., Misra, I.: Audio-visual instance discrimination with cross-modal agreement. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12475\u201312486 (2021)","DOI":"10.1109\/CVPR46437.2021.01229"},{"key":"11_CR45","unstructured":"Patrick, M., et al.: Multi-modal self-supervision from generalized data transformations. 
arXiv preprint (2020)"},{"key":"11_CR46","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"292","DOI":"10.1007\/978-3-030-58565-5_18","volume-title":"Computer Vision \u2013 ECCV 2020","author":"R Qian","year":"2020","unstructured":"Qian, R., Hu, D., Dinkel, H., Wu, M., Xu, N., Lin, W.: Multiple sound sources localization from coarse to fine. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12365, pp. 292\u2013308. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58565-5_18"},{"key":"11_CR47","unstructured":"Ratajczak, R., Pellerin, D., Labourey, Q., Garbay, C.: A fast audiovisual attention model for human detection and localization on a companion robot. In: VISUAL 2016-The First International Conference on Applications and Systems of Visual Paradigms (VISUAL 2016) (2016)"},{"key":"11_CR48","doi-asserted-by":"crossref","unstructured":"Ruesch, J., Lopes, M., Bernardino, A., Hornstein, J., Santos-Victor, J., Pfeifer, R.: Multimodal saliency-based bottom-up attention a framework for the humanoid robot icub. In: 2008 IEEE International Conference on Robotics and Automation, pp. 962\u2013967. IEEE (2008)","DOI":"10.1109\/ROBOT.2008.4543329"},{"key":"11_CR49","doi-asserted-by":"crossref","unstructured":"Ryan, F., Jiang, H., Shukla, A., Rehg, J.M., Ithapu, V.K.: Egocentric auditory attention localization in conversations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14663\u201314674 (2023)","DOI":"10.1109\/CVPR52729.2023.01409"},{"key":"11_CR50","doi-asserted-by":"publisher","first-page":"674","DOI":"10.1111\/j.1749-6632.1981.tb30910.x","volume":"374","author":"K Schaefer","year":"1981","unstructured":"Schaefer, K., S\u00fcss, K., Fiebig, E.: Acoustic-induced eye movements. Ann. N. Y. Acad. Sci. 374, 674\u2013688 (1981)","journal-title":"Ann. N. Y. Acad. 
Sci."},{"key":"11_CR51","doi-asserted-by":"crossref","unstructured":"Schauerte, B., K\u00fchn, B., Kroschel, K., Stiefelhagen, R.: Multimodal saliency-based attention for object-based scene analysis. In: 2011 IEEE\/RSJ International Conference on Intelligent Robots and Systems, pp. 1173\u20131179. IEEE (2011)","DOI":"10.1109\/IROS.2011.6095124"},{"key":"11_CR52","doi-asserted-by":"crossref","unstructured":"Senocak, A., Oh, T.H., Kim, J., Yang, M.H., Kweon, I.S.: Learning to localize sound source in visual scenes. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4358\u20134366 (2018)","DOI":"10.1109\/CVPR.2018.00458"},{"key":"11_CR53","doi-asserted-by":"publisher","first-page":"94","DOI":"10.1016\/j.neucom.2016.08.130","volume":"259","author":"N Sidaty","year":"2017","unstructured":"Sidaty, N., Larabi, M.C., Saadane, A.: Toward an audiovisual attention model for multimodal video content. Neurocomputing 259, 94\u2013111 (2017)","journal-title":"Neurocomputing"},{"key":"11_CR54","unstructured":"Sigurdsson, G.A., Gupta, A., Schmid, C., Farhadi, A., Alahari, K.: Charades-ego: a large-scale dataset of paired third and first person videos. arXiv preprint arXiv:1804.09626 (2018)"},{"key":"11_CR55","doi-asserted-by":"crossref","unstructured":"Soo\u00a0Park, H., Shi, J.: Social saliency prediction. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4777\u20134785 (2015)","DOI":"10.1109\/CVPR.2015.7299110"},{"key":"11_CR56","unstructured":"Tavakoli, H.R., Borji, A., Rahtu, E., Kannala, J.: Dave: a deep audio-visual embedding for dynamic saliency prediction. arXiv preprint arXiv:1905.10693 (2019)"},{"key":"11_CR57","doi-asserted-by":"crossref","unstructured":"Tian, Y., Shi, J., Li, B., Duan, Z., Xu, C.: Audio-visual event localization in unconstrained videos. 
In: Proceedings of the European Conference on Computer Vision (ECCV) (2018)","DOI":"10.1007\/978-3-030-01216-8_16"},{"key":"11_CR58","doi-asserted-by":"publisher","first-page":"186","DOI":"10.1016\/j.image.2019.05.001","volume":"76","author":"A Tsiami","year":"2019","unstructured":"Tsiami, A., Koutras, P., Katsamanis, A., Vatakis, A., Maragos, P.: A behaviorally inspired fusion approach for computational audiovisual saliency modeling. Signal Process. Image Commun. 76, 186\u2013200 (2019)","journal-title":"Signal Process. Image Commun."},{"key":"11_CR59","doi-asserted-by":"crossref","unstructured":"Tsiami, A., Koutras, P., Maragos, P.: Stavis: spatio-temporal audiovisual saliency network. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4766\u20134776 (2020)","DOI":"10.1109\/CVPR42600.2020.00482"},{"key":"11_CR60","doi-asserted-by":"crossref","unstructured":"Wang, G., Chen, C., Fan, D.P., Hao, A., Qin, H.: From semantic categories to fixations: a novel weakly-supervised visual-auditory saliency detection approach. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15119\u201315128 (2021)","DOI":"10.1109\/CVPR46437.2021.01487"},{"key":"11_CR61","unstructured":"Wang, G., Chen, C., Fan, D.P., Hao, A., Qin, H.: Weakly supervised visual-auditory fixation prediction with multigranularity perception. arXiv preprint arXiv:2112.13697 (2021)"},{"key":"11_CR62","doi-asserted-by":"crossref","unstructured":"Wang, W., Tran, D., Feiszli, M.: What makes training multi-modal classification networks hard? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12695\u201312705 (2020)","DOI":"10.1109\/CVPR42600.2020.01271"},{"key":"11_CR63","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"11_CR64","doi-asserted-by":"crossref","unstructured":"Xiong, J., Wang, G., Zhang, P., Huang, W., Zha, Y., Zhai, G.: Casp-net: rethinking video saliency prediction from an audio-visual consistency perceptual perspective. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6441\u20136450 (2023)","DOI":"10.1109\/CVPR52729.2023.00623"},{"key":"11_CR65","doi-asserted-by":"crossref","unstructured":"Yang, Q., et al.: SVGC-AVA: 360-degree video saliency prediction with spherical vector-based graph convolution and audio-visual attention. IEEE Trans. Multimedia (2023)","DOI":"10.1109\/TMM.2023.3306596"},{"key":"11_CR66","doi-asserted-by":"crossref","unstructured":"Yao, S., Min, X., Zhai, G.: Deep audio-visual fusion neural network for saliency estimation. In: 2021 IEEE International Conference on Image Processing (ICIP), pp. 1604\u20131608. IEEE (2021)","DOI":"10.1109\/ICIP42928.2021.9506089"},{"issue":"8","key":"11_CR67","doi-asserted-by":"publisher","first-page":"1783","DOI":"10.1109\/TPAMI.2018.2871688","volume":"41","author":"M Zhang","year":"2018","unstructured":"Zhang, M., Ma, K.T., Lim, J.H., Zhao, Q., Feng, J.: Anticipating where people will look using adversarial networks. IEEE Trans. Pattern Anal. Mach. Intell. 41(8), 1783\u20131796 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"11_CR68","doi-asserted-by":"crossref","unstructured":"Zhang, M., Teck\u00a0Ma, K., Hwee\u00a0Lim, J., Zhao, Q., Feng, J.: Deep future gaze: gaze anticipation on egocentric videos using adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 
4372\u20134381 (2017)","DOI":"10.1109\/CVPR.2017.377"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72673-6_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,21]],"date-time":"2024-10-21T16:07:41Z","timestamp":1729526861000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72673-6_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,22]]},"ISBN":["9783031726729","9783031726736"],"references-count":68,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72673-6_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,22]]},"assertion":[{"value":"22 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}