{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T07:24:25Z","timestamp":1740122665865,"version":"3.37.3"},"reference-count":46,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2021,7,1]],"date-time":"2021-07-01T00:00:00Z","timestamp":1625097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,7,1]],"date-time":"2021-07-01T00:00:00Z","timestamp":1625097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100010896","name":"International Cooperation and Exchange Programme","doi-asserted-by":"publisher","award":["61520106002"],"award-info":[{"award-number":["61520106002"]}],"id":[{"id":"10.13039\/501100010896","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Appl Intell"],"published-print":{"date-parts":[[2022,2]]},"DOI":"10.1007\/s10489-021-02440-0","type":"journal-article","created":{"date-parts":[[2021,7,1]],"date-time":"2021-07-01T11:02:45Z","timestamp":1625137365000},"page":"3143-3155","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Dynamic-boosting attention for self-supervised video representation learning"],"prefix":"10.1007","volume":"52","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1145-234X","authenticated-orcid":false,"given":"Zhipeng","family":"Wang","sequence":"first","affiliation":[]},{"given":"Chunping","family":"Hou","sequence":"additional","affiliation":[]},{"given":"Guanghui","family":"Yue","sequence":"additional","affiliation":[]},{"given":"Qingyuan","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,7,1]]},"reference":[{"key":"2440_CR1","doi-asserted-by":"crossref","unstructured":"Bi HB, Lu D, Zhu HH, Yang LN, Guan HP (2020) Sta-net: spatial-temporal attention network for video salient object detection. Appl Intell pp 1\u201310","DOI":"10.1109\/TCDS.2021.3078824"},{"key":"2440_CR2","doi-asserted-by":"crossref","unstructured":"Buchler U, Brattoli B, Ommer B (2018) Improving spatiotemporal self-supervision by deep reinforcement learning. In: Proceedings of the European conference on computer vision (ECCV), pp 770\u2013786","DOI":"10.1007\/978-3-030-01267-0_47"},{"key":"2440_CR3","doi-asserted-by":"crossref","unstructured":"Ding C, Liu K, Cheng F, Belyaev E (2020) Spatio-temporal attention on manifold space for 3d human action recognition. Appl Intell vol 51(5)","DOI":"10.1007\/s10489-020-01803-3"},{"key":"2440_CR4","doi-asserted-by":"crossref","unstructured":"Doersch C, Gupta A, Efros AA (2015) Unsupervised visual representation learning by context prediction. In: Proceedings of the IEEE international conference on computer vision, pp 1422\u20131430","DOI":"10.1109\/ICCV.2015.167"},{"key":"2440_CR5","doi-asserted-by":"crossref","unstructured":"Donahue J, Anne Hendricks L, Guadarrama S, Rohrbach M, Venugopalan S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference computer vision pattern recognition, pp 2625\u20132634","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"2440_CR6","doi-asserted-by":"crossref","unstructured":"Feng Y, Li K, Gao Y, Qiu J (2020) Hierarchical graph attention networks for semi-supervised node classification. Appl Intell vol 50(3)","DOI":"10.1007\/s10489-020-01729-w"},{"key":"2440_CR7","doi-asserted-by":"crossref","unstructured":"Fernando B, Bilen H, Gavves E, Gould S (2017) Self-supervised video representation learning with odd-one-out networks. In: Proceedings of the IEEE conference computer vision pattern recognition, pp 3636\u20133645","DOI":"10.1109\/CVPR.2017.607"},{"key":"2440_CR8","doi-asserted-by":"crossref","unstructured":"Fu J, Liu J, Tian H, Li Y, Bao Y, Fang Z, Lu H (2019) Dual attention network for scene segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3146\u20133154","DOI":"10.1109\/CVPR.2019.00326"},{"key":"2440_CR9","doi-asserted-by":"publisher","first-page":"767","DOI":"10.1109\/TIP.2020.3038372","volume":"30","author":"Z Gao","year":"2020","unstructured":"Gao Z, Guo L, Guan W, Liu AA, Ren T, Chen S (2020) A pairwise attentive adversarial spatiotemporal network for cross-domain few-shot action recognition-r2. IEEE Trans Image Process 30:767\u2013782","journal-title":"IEEE Trans Image Process"},{"issue":"99","key":"2440_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1109\/TNNLS.2020.3041732","volume":"PP","author":"Z Gao","year":"2020","unstructured":"Gao Z, Guo L, Ren T, Liu AA, Cheng ZY, Chen S (2020) Pairwise two-stream convnets for cross-domain action recognition with small data. IEEE Trans Neural Netw Learn Syst PP(99):1\u201315. https:\/\/doi.org\/10.1109\/TNNLS.2020.3041018","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"issue":"1","key":"2440_CR11","doi-asserted-by":"publisher","first-page":"165","DOI":"10.1109\/TGRS.2019.2934760","volume":"58","author":"J He","year":"2019","unstructured":"He J, Zhao L, Yang H, Zhang M, Li W (2019) Hsi-bert: Hyperspectral image classification using the bidirectional encoder representation from transformers. IEEE Trans Geosci Remote Sens 58(1):165\u2013178","journal-title":"IEEE Trans Geosci Remote Sens"},{"issue":"8","key":"2440_CR12","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"issue":"2","key":"2440_CR13","doi-asserted-by":"publisher","first-page":"577","DOI":"10.1109\/TCSVT.2019.2890899","volume":"30","author":"C Huang","year":"2020","unstructured":"Huang C, Wang H (2020) Novel key-frames selection framework for comprehensive video summarization. IEEE Trans Circ Syst Video Technol 30(2):577\u2013589","journal-title":"IEEE Trans Circ Syst Video Technol"},{"key":"2440_CR14","doi-asserted-by":"crossref","unstructured":"Huang W, Gu J, Ma X, Li Y (2020) End-to-end multitask siamese network with residual hierarchical attention for real-time object tracking. Appl Intell vol 50(7)","DOI":"10.1007\/s10489-019-01605-2"},{"key":"2440_CR15","doi-asserted-by":"crossref","unstructured":"Kar A, Rai N, Sikka K, Sharma G (2017) Adascan: Adaptive scan pooling in deep convolutional neural networks for human action recognition in videos. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3376\u20133385","DOI":"10.1109\/CVPR.2017.604"},{"key":"2440_CR16","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"key":"2440_CR17","unstructured":"Kay W, Carreira J, Simonyan K, Zhang B, Hillier C, Vijayanarasimhan S, Viola F, Green T, Back T, Natsev P et al (2017) The kinetics human action video dataset. arXiv:1705.06950"},{"key":"2440_CR18","doi-asserted-by":"crossref","unstructured":"Kim D, Cho D, Kweon IS (2019) Self-supervised video representation learning with space-time cubic puzzles. In: Proceedings of the AAAI conference on artificial intelligence, vol 33, pp 8545\u20138552","DOI":"10.1609\/aaai.v33i01.33018545"},{"issue":"2","key":"2440_CR19","doi-asserted-by":"publisher","first-page":"562","DOI":"10.1007\/s10489-019-01526-0","volume":"50","author":"M Koohzadi","year":"2020","unstructured":"Koohzadi M, Charkari NM, Ghaderi F (2020) Unsupervised representation learning based on the deep multi-view ensemble learning. Appl Intell 50(2):562\u2013581","journal-title":"Appl Intell"},{"key":"2440_CR20","doi-asserted-by":"crossref","unstructured":"Kuehne H, Jhuang H, Garrote E, Poggio T, Serre T (2011) Hmdb: a large video database for human motion recognition. In: Proceedings of the IEEE international conference on computer vision. IEEE, pp 2556\u20132563","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"2440_CR21","doi-asserted-by":"crossref","unstructured":"Larsson G, Maire M, Shakhnarovich G (2017) Colorization as a proxy task for visual understanding. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6874\u20136883","DOI":"10.1109\/CVPR.2017.96"},{"key":"2440_CR22","doi-asserted-by":"crossref","unstructured":"Lee HY, Huang JB, Singh M, Yang MH (2017) Unsupervised representation learning by sorting sequences. In: Proceedings of the IEEE international conference on computer vision, pp 667\u2013676","DOI":"10.1109\/ICCV.2017.79"},{"key":"2440_CR23","doi-asserted-by":"crossref","unstructured":"Long X, Gan C, De Melo G, Wu J, Liu X, Wen S (2018) Attention clusters: Purely attention based local feature integration for video classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7834\u20137843","DOI":"10.1109\/CVPR.2018.00817"},{"key":"2440_CR24","doi-asserted-by":"crossref","unstructured":"Luo Z, Peng B, Huang DA, Alahi A, Fei-Fei L (2017) Unsupervised learning of long-term motion dynamics for videos. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2203\u20132212","DOI":"10.1109\/CVPR.2017.751"},{"key":"2440_CR25","doi-asserted-by":"crossref","unstructured":"Lv TX, Pan X, Zhu YZ, Li LH (2020) Unsupervised medical images denoising via graph attention dual adversarial network. Appl Intell pp 1\u201310","DOI":"10.1007\/s10489-020-02016-4"},{"key":"2440_CR26","doi-asserted-by":"crossref","unstructured":"Mao Y, He Z (2020) Dual-y network: infrared-visible image patches matching via semi-supervised transfer learning. Appl Intell pp 1\u201310","DOI":"10.1007\/s10489-020-01996-7"},{"key":"2440_CR27","doi-asserted-by":"crossref","unstructured":"Misra I, Zitnick CL, Hebert M (2016) Shuffle and learn: unsupervised learning using temporal order verification. In: Proceedings of the European conference on computer vision (ECCV). Springer, pp 527\u2013544","DOI":"10.1007\/978-3-319-46448-0_32"},{"issue":"1","key":"2440_CR28","doi-asserted-by":"publisher","first-page":"110","DOI":"10.1109\/TGRS.2019.2933609","volume":"58","author":"L Mou","year":"2019","unstructured":"Mou L, Zhu XX (2019) Learning to pay attention on spectral domain: A spectral attention module-based convolutional network for hyperspectral image classification. IEEE Trans Geosci Remote Sens 58 (1):110\u2013122","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"2440_CR29","doi-asserted-by":"crossref","unstructured":"Nalepa J, Myller M, Imai Y, Honda KI, Takeda T, Antoniak M (2020) Unsupervised segmentation of hyperspectral images using 3-d convolutional autoencoders. IEEE Geosci Remote Sens Lett pp 1\u20135","DOI":"10.1109\/LGRS.2019.2960945"},{"key":"2440_CR30","doi-asserted-by":"crossref","unstructured":"Nathan Mundhenk T, Ho D, Chen BY (2018) Improvements to context based self-supervised learning. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 9339\u20139348","DOI":"10.1109\/CVPR.2018.00973"},{"key":"2440_CR31","doi-asserted-by":"crossref","unstructured":"Noroozi M, Favaro P (2016) Unsupervised learning of visual representations by solving jigsaw puzzles. In: Proceedings of the European conference on computer vision (ECCV). Springer, pp 69\u201384","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"2440_CR32","doi-asserted-by":"crossref","unstructured":"Noroozi M, Pirsiavash H, Favaro P (2017) Representation learning by learning to count. In: Proceedings of the IEEE international conference on computer vision, pp 5898\u20135906","DOI":"10.1109\/ICCV.2017.628"},{"key":"2440_CR33","unstructured":"Paszke A, Gross S, Massa F, Lerer A, Bradbury J, Chanan G, Killeen T, Lin Z, Gimelshein N, Antiga L et al (2019) Pytorch: An imperative style, high-performance deep learning library. In: Advances in neural information processing systems, pp 8026\u20138037"},{"key":"2440_CR34","doi-asserted-by":"crossref","unstructured":"Pathak D, Krahenbuhl P, Donahue J, Darrell T, Efros AA (2016) Context encoders: Feature learning by inpainting. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 2536\u20132544","DOI":"10.1109\/CVPR.2016.278"},{"issue":"3","key":"2440_CR35","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky O, Deng J, Su H, Krause J, Satheesh S, Ma S, Huang Z, Karpathy A, Khosla A, Bernstein M et al (2015) Imagenet large scale visual recognition challenge. Int J Comput Vision 115(3):211\u2013252","journal-title":"Int J Comput Vision"},{"key":"2440_CR36","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"2440_CR37","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv:1212.0402"},{"key":"2440_CR38","doi-asserted-by":"crossref","unstructured":"Sun P, Su X, Guo S, Chen F (2020) Cycle representation-disentangling network: learning to completely disentangle spatial-temporal features in video. Appl Intell pp 1\u201320","DOI":"10.1007\/s10489-020-01750-z"},{"key":"2440_CR39","doi-asserted-by":"publisher","first-page":"424","DOI":"10.1016\/j.neucom.2018.11.038","volume":"331","author":"H Tang","year":"2019","unstructured":"Tang H, Liu H, Xiao W, Sebe N (2019) Fast and robust dynamic hand gesture recognition via key frames extraction and feature fusion. Neurocomputing 331:424\u2013433","journal-title":"Neurocomputing"},{"key":"2440_CR40","doi-asserted-by":"crossref","unstructured":"Tran D, Bourdev L, Fergus R, Torresani L, Paluri M (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp 4489\u20134497","DOI":"10.1109\/ICCV.2015.510"},{"key":"2440_CR41","unstructured":"Vondrick C, Pirsiavash H, Torralba A (2016) Generating videos with scene dynamics. In: Advances in neural information processing systems, pp 613\u2013621"},{"key":"2440_CR42","doi-asserted-by":"crossref","unstructured":"Wang X, He K, Gupta A (2017) Transitive invariance for self-supervised visual representation learning. In: Proceedings of the IEEE international conference on computer vision, pp 1329\u20131338","DOI":"10.1109\/ICCV.2017.149"},{"key":"2440_CR43","doi-asserted-by":"crossref","unstructured":"Xu D, Xiao J, Zhao Z, Shao J, Xie D, Zhuang Y (2019) Self-supervised spatiotemporal learning via video clip order prediction. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 10334\u201310343","DOI":"10.1109\/CVPR.2019.01058"},{"key":"2440_CR44","unstructured":"Yang K, Liu Z, Lu Q, Xia GS (2019) Multi-scale weighted branch network for remote sensing image classification. In: Proceedings of the IEEE conference on computer vision and pattern recognition workshops, pp 1\u201310"},{"issue":"3","key":"2440_CR45","first-page":"1","volume":"15","author":"J Zhang","year":"2019","unstructured":"Zhang J, Hu H, Lu X (2019) Moving foreground-aware visual attention and key volume mining for human action recognition. ACM Trans Multimed Comput Commun Appl (TOMM) 15(3):1\u201316","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"key":"2440_CR46","doi-asserted-by":"crossref","unstructured":"Zhu W, Hu J, Sun G, Cao X, Qiao Y (2016) A key volume mining deep framework for action recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1991\u20131999","DOI":"10.1109\/CVPR.2016.219"}],"container-title":["Applied Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02440-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10489-021-02440-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10489-021-02440-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,2,10]],"date-time":"2022-02-10T05:27:52Z","timestamp":1644470872000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10489-021-02440-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,1]]},"references-count":46,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2022,2]]}},"alternative-id":["2440"],"URL":"https:\/\/doi.org\/10.1007\/s10489-021-02440-0","relation":{},"ISSN":["0924-669X","1573-7497"],"issn-type":[{"type":"print","value":"0924-669X"},{"type":"electronic","value":"1573-7497"}],"subject":[],"published":{"date-parts":[[2021,7,1]]},"assertion":[{"value":"15 April 2021","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 July 2021","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}