{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,1,10]],"date-time":"2023-01-10T05:41:20Z","timestamp":1673329280037},"reference-count":37,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2021,9,14]],"date-time":"2021-09-14T00:00:00Z","timestamp":1631577600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,9,14]],"date-time":"2021-09-14T00:00:00Z","timestamp":1631577600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2022,4]]},"DOI":"10.1007\/s10489-021-02790-9","type":"journal-article","created":{"date-parts":[[2021,9,14]],"date-time":"2021-09-14T07:02:51Z","timestamp":1631602971000},"page":"6613-6622","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Video representation learning by identifying spatio-temporal transformations"],"prefix":"10.1007","volume":"52","author":[{"given":"Sheng","family":"Geng","sequence":"first","affiliation":[]},{"given":"Shimin","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Hu","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,9,14]]},"reference":[{"key":"2790_CR1","doi-asserted-by":"crossref","unstructured":"Carreira J, Zisserman A (2017) Quo vadis, action recognition? a new model and the kinetics dataset. In: proceedings of the IEEE conference on computer vision and pattern recognition, pp 6299\u20136308","DOI":"10.1109\/CVPR.2017.502"},{"key":"2790_CR2","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li LJ, Li K, Fei-Fei L (2009) Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on computer vision and pattern recognition. Ieee, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2790_CR3","doi-asserted-by":"crossref","unstructured":"Doersch C, Gupta A, Efros AA (2015) Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE international conference on computer vision, pp 1422\u20131430","DOI":"10.1109\/ICCV.2015.167"},{"key":"2790_CR4","doi-asserted-by":"crossref","unstructured":"Fernando B, Bilen H, Gavves E, Gould S (2017) Self-supervised video representation learning with odd-one-out networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3636\u20133645","DOI":"10.1109\/CVPR.2017.607"},{"key":"2790_CR5","unstructured":"Gidaris S, Singh P, Komodakis N (2018) Unsupervised representation learning by predicting image rotations. arXiv:1803.07728"},{"key":"2790_CR6","doi-asserted-by":"crossref","unstructured":"Hara K, Kataoka H, Satoh Y (2018) Can spatiotemporal 3d cnns retrace the history of 2d cnns and imagenet?. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6546\u20136555","DOI":"10.1109\/CVPR.2018.00685"},{"key":"2790_CR7","first-page":"1106","volume":"25","author":"GE Hinton","year":"2012","unstructured":"Hinton GE, Krizhevsky A, Sutskever I (2012) Imagenet classification with deep convolutional neural networks. Adv Neural Inform Process Syst 25:1106\u20131114","journal-title":"Adv Neural Inform Process Syst"},{"key":"2790_CR8","unstructured":"Jing L, Tian Y (2018) Self-supervised spatiotemporal feature learning by video geometric transformations. 2(7):8. arXiv:1811.11387"},{"key":"2790_CR9","doi-asserted-by":"crossref","unstructured":"Jing L, Tian Y (2020) Self-supervised visual feature learning with deep neural networks: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence","DOI":"10.1109\/TPAMI.2020.2992393"},{"key":"2790_CR10","doi-asserted-by":"crossref","unstructured":"Kim D, Cho D, Kweon IS (2019) Self-supervised video representation learning with space-time cubic puzzles. In: Proceedings of the AAAI conference on artificial intelligence, vol 33, pp 8545\u20138552","DOI":"10.1609\/aaai.v33i01.33018545"},{"key":"2790_CR11","first-page":"571","volume":"12","author":"H Kuehne","year":"2013","unstructured":"Kuehne H, Jhuang H, Stiefelhagen R, Serre T (2013) Hmdb51: a large video database for human motion recognition. High Perform Comput SciEng 12:571\u2013582","journal-title":"High Perform Comput SciEng"},{"key":"2790_CR12","doi-asserted-by":"crossref","unstructured":"Larsson G, Maire M, Shakhnarovich G (2016) Learning representations for automatic colorization. In: European conference on computer vision. Springer, pp 577\u2013593","DOI":"10.1007\/978-3-319-46493-0_35"},{"key":"2790_CR13","doi-asserted-by":"crossref","unstructured":"Lee HY, Huang JB, Singh M, Yang MH (2017) Unsupervised representation learning by sorting sequences. In: Proceedings of the IEEE international conference on computer vision, pp 667\u2013676","DOI":"10.1109\/ICCV.2017.79"},{"key":"2790_CR14","doi-asserted-by":"crossref","unstructured":"Lorre G, Rabarisoa J, Orcesi A, Ainouz S, Canu S (2020) Temporal contrastive pretraining for video action recognition. In: Proceedings of the IEEE\/CVF winter conference on applications of computer vision, pp 662\u2013670","DOI":"10.1109\/WACV45572.2020.9093278"},{"key":"2790_CR15","doi-asserted-by":"crossref","unstructured":"Luo D, Liu C, Zhou Y, Yang D, Ma C, Ye Q, Wang W (2020) Video cloze procedure for self-supervised spatio-temporal learning. In: Proceedings of the AAAI conference on artificial intelligence, vol 34, pp 11701\u201311708","DOI":"10.1609\/aaai.v34i07.6840"},{"key":"2790_CR16","doi-asserted-by":"crossref","unstructured":"Misra I, Zitnick CL, Hebert M (2016) Shuffle and learn: unsupervised learning using temporal order verification. In: European conference on computer vision. Springer, pp 527\u2013544","DOI":"10.1007\/978-3-319-46448-0_32"},{"key":"2790_CR17","doi-asserted-by":"crossref","unstructured":"Noroozi M, Favaro P (2016) Unsupervised learning of visual representations by solving jigsaw puzzles. In: European conference on computer vision. Springer, pp 69\u201384","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"2790_CR18","doi-asserted-by":"crossref","unstructured":"Noroozi M, Pirsiavash H, Favaro P (2017) Representation learning by learning to count. In: Proceedings of the IEEE international conference on computer vision, pp 5898\u20135906","DOI":"10.1109\/ICCV.2017.628"},{"key":"2790_CR19","doi-asserted-by":"crossref","unstructured":"Pathak D, Krahenbuhl P, Donahue J, Darrell T, Efros AA (2016) Context encoders: Feature learning by inpainting. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2536\u20132544","DOI":"10.1109\/CVPR.2016.278"},{"key":"2790_CR20","doi-asserted-by":"publisher","first-page":"105590","DOI":"10.1016\/j.knosys.2020.105590","volume":"194","author":"F P\u00e9rez-Hern\u00e1ndez","year":"2020","unstructured":"P\u00e9rez-Hern\u00e1ndez F, Tabik S, Lamas A, Olmos R, Fujita H, Herrera F (2020) Object detection binary classifiers methodology based on deep learning to identify small objects handled similarly: Application in video surveillance. Knowl-Based Syst 194:105590","journal-title":"Knowl-Based Syst"},{"key":"2790_CR21","doi-asserted-by":"crossref","unstructured":"Selvaraju RR, Cogswell M, Das A, Vedantam R, Parikh D, Batra D (2017) Grad-cam: Visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE international conference on computer vision, pp 618\u2013626","DOI":"10.1109\/ICCV.2017.74"},{"key":"2790_CR22","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv:1212.0402"},{"key":"2790_CR23","doi-asserted-by":"crossref","unstructured":"Tian F, Gao Y, Fang Z, Fang Y, Gu J, Fujita H, Hwang JN (2021) Depth estimation using a self-supervised network based on cross-layer feature fusion and the quadtree constraint. IEEE Transactions on Circuits and Systems for Video Technology","DOI":"10.1109\/TCSVT.2021.3080928"},{"key":"2790_CR24","unstructured":"Vondrick C, Pirsiavash H, Torralba A (2016) Generating videos with scene dynamics. In: Advances in neural information processing systems, pp 613\u2013621"},{"key":"2790_CR25","doi-asserted-by":"crossref","unstructured":"Wang F, Liu H (2021) Understanding the behaviour of contrastive loss. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 2495\u20132504","DOI":"10.1109\/CVPR46437.2021.00252"},{"key":"2790_CR26","unstructured":"Wang F, Liu H, Guo D, Sun F (2020) Unsupervised representation learning by invariancepropagation. arXiv:2010.11694"},{"key":"2790_CR27","doi-asserted-by":"crossref","unstructured":"Wang J, Gao Y, Li K, Jiang X, Guo X, Ji R, Sun X (2021) Enhancing unsupervised video representation learning by decoupling the scene and the motion. In: AAAI","DOI":"10.1609\/aaai.v35i11.17215"},{"key":"2790_CR28","unstructured":"Wang J, Jiao J, Bao L, He S, Liu W, Liu YH (2021) Self-supervised video representation learning by uncovering spatio-temporal statistics. IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2790_CR29","doi-asserted-by":"publisher","first-page":"103135","DOI":"10.1016\/j.cviu.2020.103135","volume":"203","author":"T Wang","year":"2021","unstructured":"Wang T, Zhang X, Jiang R, Zhao L, Chen H, Luo W (2021) Video deblurring via spatiotemporal pyramid network and adversarial gradient prior. Comput Vis Image Underst 203:103135","journal-title":"Comput Vis Image Underst"},{"key":"2790_CR30","doi-asserted-by":"crossref","unstructured":"Wang X, Gupta A (2015) Unsupervised learning of visual representations using videos. In: Proceedings of the IEEE international conference on computer vision, pp 2794\u20132802","DOI":"10.1109\/ICCV.2015.320"},{"key":"2790_CR31","doi-asserted-by":"crossref","unstructured":"Wei D, Lim JJ, Zisserman A, Freeman WT (2018) Learning and using the arrow of time. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 8052\u2013 8060","DOI":"10.1109\/CVPR.2018.00840"},{"key":"2790_CR32","doi-asserted-by":"publisher","first-page":"107405","DOI":"10.1016\/j.asoc.2021.107405","volume":"108","author":"Y Wu","year":"2021","unstructured":"Wu Y, Jiang X, Fang Z, Gao Y, Fujita H (2021) Multi-modal 3d object detection by 2d-guided precision anchor proposal and multi-layer fusion. Appl Soft Comput 108:107405","journal-title":"Appl Soft Comput"},{"key":"2790_CR33","doi-asserted-by":"crossref","unstructured":"Yao Y, Liu C, Luo D, Zhou Y, Ye Q (2020) Video playback rate perception for self-supervised spatio-temporal representation learning. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6548\u20136557","DOI":"10.1109\/CVPR42600.2020.00658"},{"key":"2790_CR34","unstructured":"Yosinski J, Clune J, Bengio Y, Lipson H (2014) How transferable are features in deep neural networks?. In: Advances in neural information processing systems, pp 3320\u20133328"},{"key":"2790_CR35","doi-asserted-by":"crossref","unstructured":"Zhang R, Isola P, Efros AA (2017) Split-brain autoencoders: Unsupervised learning by cross-channel prediction. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1058\u20131067","DOI":"10.1109\/CVPR.2017.76"},{"key":"2790_CR36","doi-asserted-by":"publisher","first-page":"103003","DOI":"10.1016\/j.cviu.2020.103003","volume":"197","author":"X Zhang","year":"2020","unstructured":"Zhang X, Wang T, Wang J, Tang G, Zhao L (2020) Pyramid channel-based feature attention network for image dehazing. Comput Vis Image Underst 197:103003","journal-title":"Comput Vis Image Underst"},{"key":"2790_CR37","doi-asserted-by":"crossref","unstructured":"Zhao Y, Deng B, Shen C, Liu Y, Lu H, Hua XS (2017) Spatio-temporal autoencoder for video anomaly detection. In: Proceedings of the 25th ACM international conference on multimedia, pp 1933\u20131941","DOI":"10.1145\/3123266.3123451"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02790-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-021-02790-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02790-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,9]],"date-time":"2023-01-09T04:04:23Z","timestamp":1673237063000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-021-02790-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,9,14]]},"references-count":37,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2022,4]]}},"alternative-id":["2790"],"URL":"https:\/\/doi.org\/10.1007\/s10489-021-02790-9","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"value":"0924-669X","type":"print"},{"value":"1573-7497","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,9,14]]},"assertion":[{"value":"23 August 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 September 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}