{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:15:38Z","timestamp":1775578538089,"version":"3.50.1"},"reference-count":50,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"2","license":[{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2020,2,1]],"date-time":"2020-02-01T00:00:00Z","timestamp":1580515200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"MIT-IBM Watson AI Lab"},{"DOI":"10.13039\/100011039","name":"Intelligence Advanced Research Projects Activity","doi-asserted-by":"publisher","award":["D17PC00341"],"award-info":[{"award-number":["D17PC00341"]}],"id":[{"id":"10.13039\/100011039","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Toyota Research Institute \/ MIT CSAIL Joint Research Center"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. 
Intell."],"published-print":{"date-parts":[[2020,2,1]]},"DOI":"10.1109\/tpami.2019.2901464","type":"journal-article","created":{"date-parts":[[2019,2,25]],"date-time":"2019-02-25T20:04:22Z","timestamp":1551125062000},"page":"502-508","source":"Crossref","is-referenced-by-count":311,"title":["Moments in Time Dataset: One Million Videos for Event Understanding"],"prefix":"10.1109","volume":"42","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6373-5520","authenticated-orcid":false,"given":"Mathew","family":"Monfort","sequence":"first","affiliation":[]},{"given":"Carl","family":"Vondrick","sequence":"additional","affiliation":[]},{"given":"Aude","family":"Oliva","sequence":"additional","affiliation":[]},{"given":"Alex","family":"Andonian","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4030-0684","authenticated-orcid":false,"given":"Bolei","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Kandan","family":"Ramakrishnan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3157-0412","authenticated-orcid":false,"given":"Sarah Adel","family":"Bargal","sequence":"additional","affiliation":[]},{"given":"Tom","family":"Yan","sequence":"additional","affiliation":[]},{"given":"Lisa","family":"Brown","sequence":"additional","affiliation":[]},{"given":"Quanfu","family":"Fan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5101-4443","authenticated-orcid":false,"given":"Dan","family":"Gutfreund","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","first-page":"568","article-title":"Two-stream convolutional networks for action recognition in videos","author":"simonyan","year":"2014","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref38","article-title":"Hollywood in homes: Crowdsourcing data collection for activity 
understanding","author":"sigurdsson","year":"2016","journal-title":"CoRR"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.85"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/MLSP.2015.7324337"},{"key":"ref31","article-title":"The open world of micro-videos","author":"nguyen","year":"2016"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2003.1238378"},{"key":"ref37","article-title":"VerbNet: A broad-coverage, comprehensive verb lexicon","author":"schuler","year":"2005"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.2004.1334462"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2655045"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"ref28","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc 25th Int Conf Neural Inf Process Syst"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.5244\/C.22.99"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-33374-3_41"},{"key":"ref2","article-title":"YouTube-8M: A large-scale video classification benchmark","author":"abu-el-haija","year":"2016"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1126\/science.1736359"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.3115\/1614049.1614064"},{"key":"ref22","article-title":"Open source computer vision library","year":"2015"},{"key":"ref21","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref23","article-title":"THUMOS challenge: Action recognition with a large number of classes","author":"jiang","year":"2014"},{"key":"ref26","article-title":"Propbank: The next level 
of treebank","volume":"3","author":"kingsbury","year":"2003","journal-title":"Proceedings of Treebanks and Lexical Theories"},{"key":"ref25","article-title":"The kinetics human action video dataset","author":"kay","year":"2017"},{"key":"ref50","first-page":"487","article-title":"Learning deep features for scene recognition using places database","author":"zhou","year":"2014","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref10","article-title":"Corpus of contemporary American English","author":"davies","year":"2016"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref40","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"soomro","year":"2012"},{"key":"ref12","article-title":"From lifestyle vlogs to everyday interactions","author":"fouhey","year":"2017"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.622"},{"key":"ref15","article-title":"AVA: A video dataset of spatio-temporally localized atomic visual actions","author":"gu","year":"2017"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.114"},{"key":"ref17","article-title":"Deep residual learning for image recognition","author":"he","year":"2015","journal-title":"CoRR"},{"key":"ref18","first-page":"630","article-title":"Identity mappings in deep residual networks","author":"he","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref19","article-title":"CNN architectures for large-scale audio classification","author":"hershey","year":"2016"},{"key":"ref4","first-page":"892","article-title":"SoundNet: Learning sound representations from unlabeled video","author":"aytar","year":"2016","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref3","article-title":"Look, listen and 
learn","author":"arandjelović","year":"2017"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1037\/0096-3445.133.1.83"},{"key":"ref5","first-page":"86","article-title":"The Berkeley FrameNet project","author":"baker","year":"1998","journal-title":"Proc 17th Int Conf Comput Linguistics"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.28"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2723009"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref46","first-page":"214","article-title":"A duality based approach for realtime TV-$L^{1}$ optical flow","author":"zach","year":"2007","journal-title":"Proc Pattern Recognition Symp"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00736"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"ref47","first-page":"831","article-title":"Temporal relational reasoning in videos","author":"zhou","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2428998"},{"key":"ref44","first-page":"20","article-title":"Temporal segment networks: Towards good practices for deep action recognition","author":"wang","year":"2016","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995407"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine 
Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/8952810\/08651343.pdf?arnumber=8651343","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,4,27]],"date-time":"2022-04-27T13:59:25Z","timestamp":1651067965000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8651343\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,2,1]]},"references-count":50,"journal-issue":{"issue":"2"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2019.2901464","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,2,1]]}}}