{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,15]],"date-time":"2026-04-15T18:33:05Z","timestamp":1776277985978,"version":"3.50.1"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012458","type":"print"},{"value":"9783030012465","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01246-5_49","type":"book-chapter","created":{"date-parts":[[2018,10,5]],"date-time":"2018-10-05T20:14:56Z","timestamp":1538770496000},"page":"831-846","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":595,"title":["Temporal Relational Reasoning in Videos"],"prefix":"10.1007","author":[{"given":"Bolei","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Alex","family":"Andonian","sequence":"additional","affiliation":[]},{"given":"Aude","family":"Oliva","sequence":"additional","affiliation":[]},{"given":"Antonio","family":"Torralba","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"49_CR1","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Russakovsky, O., Gupta, A.: What actions are needed for understanding human actions in videos? arXiv preprint arXiv:1708.02696 (2017)","DOI":"10.1109\/ICCV.2017.235"},{"key":"49_CR2","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild. In: Proceedings of CVPR (2012)"},{"key":"49_CR3","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., Fei-Fei, L.: Large-scale video classification with convolutional neural networks. In: Proceedings of CVPR (2014)","DOI":"10.1109\/CVPR.2014.223"},{"key":"49_CR4","unstructured":"Gorban, A., et al.: THUMOS challenge: action recognition with a large number of classes. In: CVPR Workshop (2015)"},{"key":"49_CR5","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems, pp. 568\u2013576 (2014)"},{"key":"49_CR6","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. arXiv preprint arXiv:1705.07750 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"49_CR7","unstructured":"Santoro, A., et al.: A simple neural network module for relational reasoning. arXiv preprint arXiv:1706.01427 (2017)"},{"key":"49_CR8","first-page":"1","volume":"40","author":"BM Lake","year":"2016","unstructured":"Lake, B.M., Ullman, T.D., Tenenbaum, J.B., Gershman, S.J.: Building machines that learn and think like people. Behav. Brain Sci. 40, 1\u2013101 (2016)","journal-title":"Behav. Brain Sci."},{"key":"49_CR9","doi-asserted-by":"crossref","unstructured":"Goyal, R., et al.: The \u201csomething something\u201d video database for learning and evaluating visual common sense. In: Proceedings of ICCV (2017)","DOI":"10.1109\/ICCV.2017.622"},{"key":"49_CR10","unstructured":"Twentybn jester dataset: a hand gesture dataset (2017). https:\/\/www.twentybn.com\/datasets\/jester"},{"key":"49_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"49_CR12","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems, pp. 1097\u20131105 (2012)"},{"key":"49_CR13","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., Oliva, A.: Learning deep features for scene recognition using places database. In: Advances in Neural Information Processing Systems, pp. 487\u2013495 (2014)"},{"key":"49_CR14","doi-asserted-by":"crossref","unstructured":"Donahue, J., et al.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"49_CR15","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: Proceedings of CVPR (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"49_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1007\/978-3-319-46484-8_2","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Wang","year":"2016","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 20\u201336. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_2"},{"key":"49_CR17","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"issue":"11","key":"49_CR18","doi-asserted-by":"publisher","first-page":"2782","DOI":"10.1109\/TPAMI.2013.65","volume":"35","author":"A Gaidon","year":"2013","unstructured":"Gaidon, A., Harchaoui, Z., Schmid, C.: Temporal localization of actions with actoms. IEEE Trans. Pattern Anal. Mach. Intell. 35(11), 2782\u20132795 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"49_CR19","doi-asserted-by":"crossref","unstructured":"Pirsiavash, H., Ramanan, D.: Parsing videos of actions with segmental grammars. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 612\u2013619 (2014)","DOI":"10.1109\/CVPR.2014.85"},{"key":"49_CR20","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: Proceedings of ICCV, pp. 3551\u20133558 (2013)","DOI":"10.1109\/ICCV.2013.441"},{"issue":"3","key":"49_CR21","doi-asserted-by":"publisher","first-page":"219","DOI":"10.1007\/s11263-013-0677-1","volume":"107","author":"A Gaidon","year":"2014","unstructured":"Gaidon, A., Harchaoui, Z., Schmid, C.: Activity representation with motion hierarchies. Int. J. Comput. Vis. 107(3), 219\u2013238 (2014)","journal-title":"Int. J. Comput. Vis."},{"issue":"3","key":"49_CR22","doi-asserted-by":"publisher","first-page":"254","DOI":"10.1007\/s11263-015-0859-0","volume":"119","author":"L Wang","year":"2016","unstructured":"Wang, L., Qiao, Y., Tang, X.: MoFAP: a multi-level representation for action recognition. Int. J. Comput. Vis. 119(3), 254\u2013271 (2016)","journal-title":"Int. J. Comput. Vis."},{"key":"49_CR23","unstructured":"Agrawal, P., Nair, A.V., Abbeel, P., Malik, J., Levine, S.: Learning to poke by poking: experiential learning of intuitive physics. In: Advances in Neural Information Processing Systems, pp. 5074\u20135082 (2016)"},{"key":"49_CR24","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1007\/978-3-319-46475-6_1","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Pinto","year":"2016","unstructured":"Pinto, L., Gandhi, D., Han, Y., Park, Y.-L., Gupta, A.: The curious robot: learning visual representations via physical interactions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 3\u201318. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_1"},{"key":"49_CR25","doi-asserted-by":"crossref","unstructured":"Sermanet, P., Lynch, C., Hsu, J., Levine, S.: Time-contrastive networks: self-supervised learning from multi-view observation. arXiv preprint arXiv:1704.06888 (2017)","DOI":"10.1109\/CVPRW.2017.69"},{"key":"49_CR26","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Divvala, S., Farhadi, A., Gupta, A.: Asynchronous temporal fields for action recognition (2017)","DOI":"10.1109\/CVPR.2017.599"},{"key":"49_CR27","unstructured":"Mahdisoltani, F., Berger, G., Gharbieh, W., Fleet, D., Memisevic, R.: Fine-grained video classification and captioning. arXiv preprint arXiv:1804.09235 (2018)"},{"key":"49_CR28","doi-asserted-by":"crossref","unstructured":"Sharif Razavian, A., Azizpour, H., Sullivan, J., Carlsson, S.: CNN features off-the-shelf: an astounding baseline for recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops (2014)","DOI":"10.1109\/CVPRW.2014.131"},{"key":"49_CR29","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"49_CR30","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: Accelerating deep network training by reducing internal covariate shift. In: International Conference on Machine Learning, pp. 448\u2013456 (2015)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01246-5_49","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,10,5]],"date-time":"2022-10-05T00:26:44Z","timestamp":1664929604000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01246-5_49"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012458","9783030012465"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01246-5_49","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}