{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,6]],"date-time":"2024-09-06T00:53:12Z","timestamp":1725583992394},"reference-count":26,"publisher":"SPIE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,3,5]]},"DOI":"10.1117\/12.2623446","type":"proceedings-article","created":{"date-parts":[[2022,3,5]],"date-time":"2022-03-05T00:56:51Z","timestamp":1646441811000},"page":"40","source":"Crossref","is-referenced-by-count":0,"title":["Joint motion context and clip augmentation for spatio-temporal action detection"],"prefix":"10.1117","author":[{"given":"Xurui","family":"Ma","sequence":"first","affiliation":[]},{"given":"Xiang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Chengkun","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Chuanfu","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Jie","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Zhigang","family":"Luo","sequence":"additional","affiliation":[]}],"member":"189","reference":[{"key":"c1","first-page":"759","article-title":"Finding action tubes","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Gkioxari","year":"2015"},{"key":"c2","first-page":"744","article-title":"Multi-region two-stream r-cnn for action detection","volume-title":"European Conference on Computer Vision","author":"Peng","year":"2016"},{"key":"c3","doi-asserted-by":"crossref","DOI":"10.5244\/C.30.58","article-title":"Deep learning for detecting multiple space-time action tubes in videos","author":"Saha","year":"2016"},{"key":"c4","first-page":"3637","article-title":"Online real-time multiple spatiotemporal action localisation and prediction","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Singh","year":"2017"},{"key":"c5","first-page":"264","article-title":"Step: Spatio-temporal progressive learning for video action detection","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Yang","year":"2019"},{"key":"c6","first-page":"9935","article-title":"Dance with flow: Two-in-one stream action detection","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Zhao","year":"2019"},{"key":"c7","first-page":"68","article-title":"Actions as moving points","volume-title":"European Conference on Computer Vision","author":"Li","year":"2020"},{"year":"2020","author":"Wang","article-title":"Removing the background by adding the background: Towards background robust self-supervised video representation learning","key":"c8"},{"year":"2012","author":"Soomro","article-title":"Ucf101: A dataset of 101 human actions classes from videos in the wild","key":"c9"},{"key":"c10","first-page":"3192","article-title":"Towards understanding action recognition","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Jhuang","year":"2013"},{"doi-asserted-by":"publisher","key":"c11","DOI":"10.1007\/s11263-019-01247-4"},{"key":"c12","first-page":"3164","article-title":"Learning to track for spatio-temporal action localization","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Weinzaepfel","year":"2015"},{"key":"c13","first-page":"4405","article-title":"Action tubelet detector for spatio-temporal action localization","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Kalogeiton","year":"2017"},{"doi-asserted-by":"publisher","key":"c14","DOI":"10.1109\/TIP.2018.2843129"},{"key":"c15","first-page":"6047","article-title":"Ava: A video dataset of spatio-temporally localized atomic visual actions","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Gu","year":"2018"},{"key":"c16","first-page":"5822","article-title":"Tube convolutional neural network (t-cnn) for action detection in videos","volume-title":"Proceedings of the IEEE International Conference on Computer Vision","author":"Hou","year":"2017"},{"key":"c17","first-page":"11987","article-title":"Tacnet: Transition-aware context network for spatio-temporal action detection","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Song","year":"2019"},{"key":"c18","first-page":"510","article-title":"Cfad: Coarse-to-fine action detector for spatiotemporal action localization","volume-title":"European Conference on Computer Vision","author":"Li","year":"2020"},{"year":"2017","author":"Vaswani","article-title":"Attention is all you need","key":"c19"},{"key":"c20","first-page":"7794","article-title":"Non-local neural networks","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","author":"Wang","year":"2018"},{"key":"c21","first-page":"603","article-title":"Ccnet: Criss-cross attention for semantic segmentation","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Huang","year":"2019"},{"doi-asserted-by":"publisher","key":"c22","DOI":"10.1109\/ICCVW48693.2019"},{"key":"c23","first-page":"2403","article-title":"Deep layer aggregation","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Yu","year":"2018"},{"key":"c24","first-page":"318","article-title":"Actor-centric relation network","volume-title":"Proceedings of the European Conference on Computer Vision (ECCV)","author":"Sun","year":"2018"},{"key":"c25","first-page":"12016","article-title":"Improving action localization by progressive cross-stream cooperation","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Su","year":"2019"},{"key":"c26","first-page":"12056","article-title":"Learning spatio-temporal representation with local and global diffusion","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Qiu","year":"2019"}],"event":{"name":"Fourteenth International Conference on Machine Vision (ICMV 2021)","start":{"date-parts":[[2021,11,8]]},"location":"Rome, Italy","end":{"date-parts":[[2021,11,12]]}},"container-title":["Fourteenth International Conference on Machine Vision (ICMV 2021)"],"original-title":[],"deposited":{"date-parts":[[2022,6,21]],"date-time":"2022-06-21T22:32:16Z","timestamp":1655850736000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.spiedigitallibrary.org\/conference-proceedings-of-spie\/12084\/2623446\/Joint-motion-context-and-clip-augmentation-for-spatio-temporal-action\/10.1117\/12.2623446.full"}},"subtitle":[],"editor":[{"given":"Wolfgang","family":"Osten","sequence":"additional","affiliation":[]},{"given":"Dmitry","family":"Nikolaev","sequence":"additional","affiliation":[]},{"given":"Jianhong","family":"Zhou","sequence":"additional","affiliation":[]}],"short-title":[],"issued":{"date-parts":[[2022,3,5]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1117\/12.2623446","relation":{},"subject":[],"published":{"date-parts":[[2022,3,5]]}}}