{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T07:26:05Z","timestamp":1780471565458,"version":"3.54.1"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012274","type":"print"},{"value":"9783030012281","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01228-1_38","type":"book-chapter","created":{"date-parts":[[2018,10,6]],"date-time":"2018-10-06T05:03:51Z","timestamp":1538802231000},"page":"639-655","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":207,"title":["In the Eye of Beholder: Joint Learning of Gaze and Actions in First Person Video"],"prefix":"10.1007","author":[{"given":"Yin","family":"Li","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Miao","family":"Liu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"James M.","family":"Rehg","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"issue":"5","key":"38_CR1","doi-asserted-by":"publisher","first-page":"744","DOI":"10.1109\/TCSVT.2015.2409731","volume":"25","author":"A Betancourt","year":"2015","unstructured":"Betancourt, A., Morerio, P., Regazzoni, C.S., Rauterberg, M.: The evolution of first person vision methods: a survey. IEEE Trans. Circuits Syst. Video Technol. 25(5), 744\u2013760 (2015)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"1","key":"38_CR2","doi-asserted-by":"publisher","first-page":"185","DOI":"10.1109\/TPAMI.2012.89","volume":"35","author":"A Borji","year":"2013","unstructured":"Borji, A., Itti, L.: State-of-the-art in visual attention modeling. IEEE Trans. Pattern Anal. Mach. Intell. 35(1), 185\u2013207 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"6","key":"38_CR3","doi-asserted-by":"publisher","first-page":"719","DOI":"10.1016\/0042-6989(75)90290-4","volume":"15","author":"B Bridgeman","year":"1975","unstructured":"Bridgeman, B., Hendry, D., Stark, L.: Failure to detect displacement of the visual world during saccadic eye movements. Vis. Res. 15(6), 719\u2013722 (1975)","journal-title":"Vis. Res."},{"key":"38_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"38_CR5","doi-asserted-by":"crossref","unstructured":"Fathi, A., Hodgins, J.K., Rehg, J.M.: Social interactions: a first-person perspective. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6247805"},{"key":"38_CR6","doi-asserted-by":"crossref","unstructured":"Fathi, A., Farhadi, A., Rehg, J.M.: Understanding egocentric activities. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126269"},{"key":"38_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"314","DOI":"10.1007\/978-3-642-33718-5_23","volume-title":"Computer Vision \u2013 ECCV 2012","author":"A Fathi","year":"2012","unstructured":"Fathi, A., Li, Y., Rehg, J.M.: Learning to recognize daily actions using gaze. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7572, pp. 314\u2013327. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33718-5_23"},{"key":"38_CR8","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Pinz, A., Zisserman, A.: Convolutional two-stream network fusion for video action recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.213"},{"key":"38_CR9","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.81"},{"issue":"3","key":"38_CR10","doi-asserted-by":"publisher","first-page":"478","DOI":"10.1109\/TPAMI.2009.30","volume":"32","author":"DW Hansen","year":"2010","unstructured":"Hansen, D.W., Ji, Q.: In the eye of the beholder: a survey of models for eyes and gaze. IEEE Trans. Pattern Anal. Mach. Intell. 32(3), 478\u2013500 (2010)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"11","key":"38_CR11","doi-asserted-by":"publisher","first-page":"498","DOI":"10.1016\/j.tics.2003.09.006","volume":"7","author":"JM Henderson","year":"2003","unstructured":"Henderson, J.M.: Human gaze control during real-world scene perception. Trends Cogn. Sci. 7(11), 498\u2013504 (2003)","journal-title":"Trends Cogn. Sci."},{"key":"38_CR12","doi-asserted-by":"crossref","unstructured":"Ilg, E., Mayer, N., Saikia, T., Keuper, M., Dosovitskiy, A., Brox, T.: FlowNet 2.0: evolution of optical flow estimation with deep networks. In: ICCV (2017)","DOI":"10.1109\/CVPR.2017.179"},{"key":"38_CR13","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: ICML (2015)"},{"issue":"3","key":"38_CR14","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1038\/35058500","volume":"2","author":"L Itti","year":"2001","unstructured":"Itti, L., Koch, C.: Computational modelling of visual attention. Nat. Rev. Neurosci. 2(3), 194 (2001)","journal-title":"Nat. Rev. Neurosci."},{"key":"38_CR15","unstructured":"Jang, E., Gu, S., Poole, B.: Categorical reparameterization with gumbel-softmax. In: ICLR (2017)"},{"key":"38_CR16","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. In: ICLR (2014)"},{"key":"38_CR17","doi-asserted-by":"crossref","unstructured":"Kitani, K.M., Okabe, T., Sato, Y., Sugimoto, A.: Fast unsupervised ego-action learning for first-person sports videos. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995406"},{"key":"38_CR18","doi-asserted-by":"crossref","unstructured":"Li, Y., Fathi, A., Rehg, J.M.: Learning to predict gaze in egocentric video. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.399"},{"key":"38_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y., Ye, Z., Rehg, J.M.: Delving into egocentric actions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298625"},{"key":"38_CR20","doi-asserted-by":"crossref","unstructured":"Liu, S., Johns, E., Davison, A.J.: End-to-end multi-task learning with attention. arXiv preprint arXiv:1803.10704 (2018)","DOI":"10.1109\/CVPR.2019.00197"},{"key":"38_CR21","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"38_CR22","doi-asserted-by":"crossref","unstructured":"Ma, M., Fan, H., Kitani, K.M.: Going deeper into first-person activity recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.209"},{"key":"38_CR23","unstructured":"Maddison, C.J., Mnih, A., Teh, Y.W.: The concrete distribution: a continuous relaxation of discrete random variables. In: ICLR (2017)"},{"key":"38_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"842","DOI":"10.1007\/978-3-642-33709-3_60","volume-title":"Computer Vision \u2013 ECCV 2012","author":"S Mathe","year":"2012","unstructured":"Mathe, S., Sminchisescu, C.: Dynamic eye movement datasets and learnt saliency models for visual action recognition. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, pp. 842\u2013856. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33709-3_60"},{"key":"38_CR25","first-page":"422","volume-title":"NIPS","author":"HS Park","year":"2012","unstructured":"Park, H.S., Jain, E., Sheikh, Y.: 3D social saliency from head-mounted cameras. In: Pereira, F., Burges, C.J.C., Bottou, L., Weinberger, K.Q. (eds.) NIPS, pp. 422\u2013430. Curran Associates, Inc., Red Hook (2012)"},{"key":"38_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"143","DOI":"10.1007\/978-3-642-15561-1_11","volume-title":"Computer Vision \u2013 ECCV 2010","author":"F Perronnin","year":"2010","unstructured":"Perronnin, F., S\u00e1nchez, J., Mensink, T.: Improving the fisher kernel for large-scale image classification. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6314, pp. 143\u2013156. Springer, Heidelberg (2010). https:\/\/doi.org\/10.1007\/978-3-642-15561-1_11"},{"key":"38_CR27","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., Ramanan, D.: Detecting activities of daily living in first-person camera views. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248010"},{"key":"38_CR28","doi-asserted-by":"crossref","unstructured":"Poleg, Y., Ephrat, A., Peleg, S., Arora, C.: Compact CNN for indexing egocentric videos. In: WACV (2016)","DOI":"10.1109\/WACV.2016.7477708"},{"key":"38_CR29","first-page":"91","volume-title":"NIPS","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Cortes, C., Lawrence, N.D., Lee, D.D., Sugiyama, M., Garnett, R. (eds.) NIPS, pp. 91\u201399. Curran Associates, Inc., Red Hook (2015)"},{"key":"38_CR30","doi-asserted-by":"crossref","unstructured":"Ryoo, M.S., Rothrock, B., Matthies, L.: Pooled motion features for first-person videos. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298691"},{"key":"38_CR31","first-page":"2409","volume-title":"NIPS","author":"N Shapovalova","year":"2013","unstructured":"Shapovalova, N., Raptis, M., Sigal, L., Mori, G.: Action is in the eye of the beholder: eye-gaze driven model for spatio-temporal action localization. In: Burges, C.J.C., Bottou, L., Welling, M., Ghahramani, Z., Weinberger, K.Q. (eds.) NIPS, pp. 2409\u20132417. Curran Associates, Inc., Red Hook (2013)"},{"key":"38_CR32","unstructured":"Sharma, S., Kiros, R., Salakhutdinov, R.: Action recognition using visual attention. In: ICLR Workshop (2016)"},{"key":"38_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"38_CR34","first-page":"568","volume-title":"NIPS","author":"K Simonyan","year":"2014","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Ghahramani, Z., Welling, M., Cortes, C., Lawrence, N.D., Weinberger, K.Q. (eds.) NIPS, pp. 568\u2013576. Curran Associates, Inc., Red Hook (2014)"},{"key":"38_CR35","doi-asserted-by":"crossref","unstructured":"Singh, S., Arora, C., Jawahar, C.: First person action recognition using deep learned descriptors. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.287"},{"key":"38_CR36","first-page":"3483","volume-title":"NIPS","author":"K Sohn","year":"2015","unstructured":"Sohn, K., Lee, H., Yan, X.: Learning structured output representation using deep conditional generative models. In: Cortes, C., Lawrence, N.D., Lee, D.D., Sugiyama, M., Garnett, R. (eds.) NIPS, pp. 3483\u20133491. Curran Associates, Inc., Red Hook (2015)"},{"key":"38_CR37","doi-asserted-by":"crossref","unstructured":"Spriggs, E.H., De La Torre, F., Hebert, M.: Temporal segmentation and activity classification from first-person sensing. In: CVPR Workshops (2009)","DOI":"10.1109\/CVPRW.2009.5204354"},{"key":"38_CR38","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. J. Mach. Learn. Res. 15, 1929\u20131958 (2014)","journal-title":"J. Mach. Learn. Res."},{"key":"38_CR39","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"38_CR40","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"issue":"11","key":"38_CR41","doi-asserted-by":"publisher","first-page":"1473","DOI":"10.1109\/TCSVT.2008.2005594","volume":"18","author":"P Turaga","year":"2008","unstructured":"Turaga, P., Chellappa, R., Subrahmanian, V.S., Udrea, O.: Machine recognition of human activities: a survey. IEEE Trans. Circuits Syst. Video Technol. 18(11), 1473\u20131488 (2008)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"6","key":"38_CR42","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TPAMI.2017.2712608","volume":"40","author":"G Varol","year":"2017","unstructured":"Varol, G., Laptev, I., Schmid, C.: Long-term temporal convolutions for action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1510\u20131517 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"38_CR43","doi-asserted-by":"crossref","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C.L.: Action recognition by dense trajectories. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"38_CR44","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"38_CR45","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R.B., Gupta, A., He, K.: Non-local neural networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"38_CR46","doi-asserted-by":"crossref","unstructured":"Yonetani, R., Kitani, K.M., Sato, Y.: Recognizing micro-actions and reactions from paired egocentric videos. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.288"},{"key":"38_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, M., Teck Ma, K., Hwee Lim, J., Zhao, Q., Feng, J.: Deep future gaze: gaze anticipation on egocentric videos using adversarial networks. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.377"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01228-1_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,6]],"date-time":"2022-10-06T00:32:22Z","timestamp":1665016342000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01228-1_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012274","9783030012281"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01228-1_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}