{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T13:25:03Z","timestamp":1773840303971,"version":"3.50.1"},"reference-count":128,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"12","license":[{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,12,1]],"date-time":"2023-12-01T00:00:00Z","timestamp":1701388800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61836012"],"award-info":[{"award-number":["61836012"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171431"],"award-info":[{"award-number":["62171431"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"CAAI-Huawei MindSpore Open Fund"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Neural Netw. Learning Syst."],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1109\/tnnls.2022.3160860","type":"journal-article","created":{"date-parts":[[2022,3,31]],"date-time":"2022-03-31T19:41:12Z","timestamp":1648755672000},"page":"9832-9846","source":"Crossref","is-referenced-by-count":14,"title":["Self-Supervised Motion Perception for Spatiotemporal Representation Learning"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6747-0646","authenticated-orcid":false,"given":"Chang","family":"Liu","sequence":"first","affiliation":[{"name":"Department of Automation, Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3616-2496","authenticated-orcid":false,"given":"Yuan","family":"Yao","sequence":"additional","affiliation":[{"name":"School of Electronic, Electrical and Communication Engineering, University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Dezhao","family":"Luo","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4188-9953","authenticated-orcid":false,"given":"Yu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1215-6259","authenticated-orcid":false,"given":"Qixiang","family":"Ye","sequence":"additional","affiliation":[{"name":"School of Electronic, Electrical and Communication Engineering, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref3","article-title":"The kinetics human action video dataset","author":"Kay","year":"2017","journal-title":"arXiv:1705.06950"},{"key":"ref4","article-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012","journal-title":"arXiv:1212.0402"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"ref6","article-title":"Momentum contrast for unsupervised visual representation learning","author":"He","year":"2019","journal-title":"arXiv:1911.05722"},{"key":"ref7","article-title":"A simple framework for contrastive learning of visual representations","author":"Chen","year":"2020","journal-title":"arXiv:2002.05709"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.167"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.79"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.607"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00840"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01058"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2020.3025661"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00413"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.03.120"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00406"},{"key":"ref20","article-title":"TCLR: Temporal contrastive learning for video representation","author":"Dave","year":"2021","journal-title":"arXiv:2101.07974"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1126\/science.3283936"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/0896-6273(94)90455-3"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/0042-6989(92)90036-I"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1038\/nrn1057"},{"key":"ref25","first-page":"523","article-title":"Central visual pathways","volume":"4","author":"Wurtz","year":"2000","journal-title":"Princ. Neural Sci."},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"ref27","article-title":"Your head is there to move you around: Goal-driven models of the primate dorsal pathway","volume-title":"Proc. NIPS","volume":"34","author":"Mineault"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00349"},{"key":"ref32","article-title":"A short note on the Kinetics-700-2020 human action dataset","author":"Smaira","year":"2020","journal-title":"arXiv:2010.10864"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.50"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00649"},{"key":"ref38","first-page":"3581","article-title":"Look, listen and learn\u2014A multimodal LSTM for speaker identification","volume-title":"Proc. AAAI Conf. Artif. Intell.","author":"Jimmy"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.73"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00715"},{"key":"ref41","article-title":"Unsupervised representation learning by predicting image rotations","author":"Gidaris","year":"2018","journal-title":"arXiv:1803.07728"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01061"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00092"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_9"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.628"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_35"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46487-9_40"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.76"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.96"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.320"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.149"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00267"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.638"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00586"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2558148"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_47"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_24"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"ref61","first-page":"843","article-title":"Unsupervised learning of video representations using lstms","volume-title":"Proc. ICML","author":"Srivastava"},{"key":"ref62","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv:1807.03748"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00186"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123451"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3039899"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.3016291"},{"key":"ref67","first-page":"613","article-title":"Generating videos with scene dynamics","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vondrick"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i11.17215"},{"key":"ref69","article-title":"Removing the background by adding the background: Towards background robust self-supervised video representation learning","author":"Wang","year":"2020","journal-title":"arXiv:2009.05769"},{"key":"ref70","article-title":"Spatiotemporal contrastive video representation learning","author":"Qian","year":"2020","journal-title":"arXiv:2008.03800"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.3390\/technologies9010002"},{"key":"ref72","first-page":"766","article-title":"Discriminative unsupervised feature learning with convolutional neural networks","volume-title":"Proc. Neural Inf. Process. Syst.","author":"Dosovitskiy"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2496141"},{"key":"ref74","first-page":"517","article-title":"Unsupervised learning by predicting noise","volume-title":"Proc. ICML","author":"Bojanowski"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"key":"ref76","first-page":"297","article-title":"Noise-contrastive estimation: A new estimation principle for unnormalized statistical models","volume-title":"Proc. AISTATS JMLR Workshop Conf.","author":"Gutmann"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"ref78","article-title":"Supervised contrastive learning","author":"Khosla","year":"2020","journal-title":"arXiv:2004.11362"},{"key":"ref79","article-title":"Improved baselines with momentum contrastive learning","author":"Chen","year":"2020","journal-title":"arXiv:2003.04297"},{"key":"ref80","article-title":"Big self-supervised models are strong semi-supervised learners","author":"Chen","year":"2020","journal-title":"arXiv:2006.10029"},{"key":"ref81","article-title":"Bootstrap your own latent: A new approach to self-supervised learning","author":"Grill","year":"2020","journal-title":"arXiv:2006.07733"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_45"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00021"},{"key":"ref84","article-title":"Self-supervised co-training for video representation learning","author":"Han","year":"2020","journal-title":"arXiv:2010.09709"},{"key":"ref85","article-title":"On compositions of transformations in contrastive self-supervised learning","author":"Patrick","year":"2020","journal-title":"arXiv:2003.04298"},{"key":"ref86","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","author":"Caron","year":"2020","journal-title":"arXiv:2006.09882"},{"key":"ref87","article-title":"Deep robust clustering by contrastive learning","author":"Zhong","year":"2020","journal-title":"arXiv:2008.03030"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475551"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00797"},{"key":"ref90","article-title":"Debiased contrastive learning","author":"Chuang","year":"2020","journal-title":"arXiv:2007.00224"},{"key":"ref91","article-title":"On mutual information in contrastive learning for visual representations","author":"Wu","year":"2020","journal-title":"arXiv:2005.13149"},{"key":"ref92","article-title":"Hard negative mixing for contrastive learning","author":"Kalantidis","year":"2020","journal-title":"arXiv:2010.01028"},{"key":"ref93","article-title":"Contrastive learning with adversarial examples","author":"Ho","year":"2020","journal-title":"arXiv:2010.12050"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i12.17274"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_30"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16189"},{"key":"ref97","article-title":"Video representation learning with visual tempo consistency","author":"Yang","year":"2020","journal-title":"arXiv:2006.15489"},{"key":"ref98","article-title":"Cycle-contrast for self-supervised video representation learning","author":"Kong","year":"2020","journal-title":"arXiv:2010.14810"},{"key":"ref99","article-title":"Watching the world go by: Representation learning from unlabeled videos","author":"Gordon","year":"2020","journal-title":"arXiv:2003.07990"},{"key":"ref100","article-title":"Self-supervised temporal discriminative learning for video representation learning","author":"Wang","year":"2020","journal-title":"arXiv:2008.02129"},{"key":"ref101","article-title":"Can temporal information help with contrastive self-supervised learning?","author":"Bai","year":"2020","journal-title":"arXiv:2011.13046"},{"key":"ref102","article-title":"Representation learning with video deep InfoMax","author":"Hjelm","year":"2020","journal-title":"arXiv:2007.13278"},{"key":"ref103","article-title":"Learning representations by maximizing mutual information across views","author":"Bachman","year":"2019","journal-title":"arXiv:1906.00910"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00958"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00034"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_18"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00012"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00795"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.213"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/TCOM.1983.1095851"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00938"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.4324\/9781410605337-29"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.86"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.226"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"},{"key":"ref118","article-title":"What makes for good views for contrastive learning?","author":"Tian","year":"2020","journal-title":"arXiv:2005.10243"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-12939-2_20"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3057833"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1145\/212094.212141"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00994"},{"key":"ref123","article-title":"Self-supervised visual learning by variable playback speeds prediction of a video","author":"Cho","year":"2020","journal-title":"arXiv:2003.02692"},{"key":"ref124","article-title":"What is considered complete for visual recognition?","author":"Xie","year":"2021","journal-title":"arXiv:2105.13978"},{"key":"ref125","article-title":"A short note on the kinetics-700 human action dataset","author":"Carreira","year":"2019","journal-title":"arXiv:1907.06987"},{"key":"ref126","article-title":"Why can\u2019t i dance in the mall? Learning to mitigate scene bias in action recognition","author":"Choi","year":"2019","journal-title":"arXiv:1912.05534"},{"key":"ref127","article-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer","volume-title":"Proc. ICLR","author":"Zagoruyko"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58604-1_26"}],"container-title":["IEEE Transactions on Neural Networks and Learning Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/5962385\/10336252\/09745754.pdf?arnumber=9745754","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,16]],"date-time":"2024-02-16T19:10:11Z","timestamp":1708110611000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9745754\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12]]},"references-count":128,"journal-issue":{"issue":"12"},"URL":"https:\/\/doi.org\/10.1109\/tnnls.2022.3160860","relation":{},"ISSN":["2162-237X","2162-2388"],"issn-type":[{"value":"2162-237X","type":"print"},{"value":"2162-2388","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12]]}}}