{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T15:19:38Z","timestamp":1772119178767,"version":"3.50.1"},"reference-count":67,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,7,1]],"date-time":"2024-07-01T00:00:00Z","timestamp":1719792000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"},{"start":{"date-parts":[[2024,7,20]],"date-time":"2024-07-20T00:00:00Z","timestamp":1721433600000},"content-version":"vor","delay-in-days":19,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Machine Vision and Applications"],"published-print":{"date-parts":[[2024,7]]},"abstract":"<jats:title>Abstract<\/jats:title>\n                  <jats:p>\n                    In the realm of computer vision, Group Activity Recognition (GAR) plays a vital role, finding applications in sports video analysis, surveillance, and social scene understanding. This paper introduces\n                    <jats:bold>R<\/jats:bold>\n                    ecognize\n                    <jats:bold>E<\/jats:bold>\n                    very\n                    <jats:bold>Act<\/jats:bold>\n                    ion Everywhere All At Once (REACT), a novel architecture designed to model complex contextual relationships within videos. REACT leverages advanced transformer-based models for encoding intricate contextual relationships, enhancing understanding of group dynamics. Integrated Vision-Language Encoding facilitates efficient capture of spatiotemporal interactions and multi-modal information, enabling comprehensive scene understanding. The model\u2019s precise action localization refines joint understanding of text and video data, enabling precise bounding box retrieval and enhancing semantic links between textual descriptions and visual reality. Actor-Specific Fusion strikes a balance between actor-specific details and contextual information, improving model specificity and robustness in recognizing group activities. Experimental results demonstrate REACT\u2019s superiority over state-of-the-art GAR approaches, achieving higher accuracy in recognizing and understanding group activities across diverse datasets. This work significantly advances group activity recognition, offering a robust framework for nuanced scene comprehension.\n                  <\/jats:p>","DOI":"10.1007\/s00138-024-01561-z","type":"journal-article","created":{"date-parts":[[2024,7,20]],"date-time":"2024-07-20T13:01:41Z","timestamp":1721480501000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["React: recognize every action everywhere all at once"],"prefix":"10.1007","volume":"35","author":[{"given":"Naga V. S. Raviteja","family":"Chappa","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pha","family":"Nguyen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Page Daniel","family":"Dobbs","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Khoa","family":"Luu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,7,20]]},"reference":[{"key":"1561_CR1","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Van\u00a0Gool, L.: Temporal segment networks: Towards good practices for deep action recognition. In: ECCV, pp. 20\u201336. Springer (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"1561_CR2","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"1561_CR3","doi-asserted-by":"crossref","unstructured":"Wang, X., Girshick, R., Gupta, A., He, K.: Non-local neural networks. In: CVPR, pp. 7794\u20137803 (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"1561_CR4","doi-asserted-by":"crossref","unstructured":"Ranasinghe, K., Naseer, M., Khan, S., Khan, F.S., Ryoo, M.S.: Self-supervised video transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2874\u20132884 (2022)","DOI":"10.1109\/CVPR52688.2022.00289"},{"key":"1561_CR5","doi-asserted-by":"crossref","unstructured":"Nguyen, T.-T., Nguyen, P., Luu, K.: Hig: Hierarchical interlacement graph approach to scene graph generation in video understanding. arXiv preprint arXiv:2312.03050 (2023)","DOI":"10.1109\/CVPR52733.2024.01740"},{"key":"1561_CR6","unstructured":"Nguyen, P., Quach, K.G., Duong, C.N., Phung, S.L., Le, N., Luu, K.: Multi-camera multi-object tracking on the move via single-stage global association approach. arXiv preprint arXiv:2211.09663 (2022)"},{"key":"1561_CR7","unstructured":"Nguyen, P., Quach, K.G., Kitani, K., Luu, K.: Type-to-track: Retrieve any object via prompt-based tracking. In: Advances in Neural Information Processing Systems 36 (2024)"},{"key":"1561_CR8","doi-asserted-by":"publisher","first-page":"108646","DOI":"10.1016\/j.patcog.2022.108646","volume":"128","author":"KG Quach","year":"2022","unstructured":"Quach, K.G., Le, N., Duong, C.N., Jalata, I., Roy, K., Luu, K.: Non-volume preserving-based fusion to group-level emotion recognition on crowd videos. Pattern Recogn. 128, 108646 (2022)","journal-title":"Pattern Recogn."},{"key":"1561_CR9","doi-asserted-by":"crossref","unstructured":"Ibrahim, M.S., Muralidharan, S., Deng, Z., Vahdat, A., Mori, G.: A hierarchical deep temporal model for group activity recognition. In: CVPR, pp. 1971\u20131980 (2016)","DOI":"10.1109\/CVPR.2016.217"},{"key":"1561_CR10","doi-asserted-by":"crossref","unstructured":"Wu, J., Wang, L., Wang, L., Guo, J., Wu, G.: Learning actor relation graphs for group activity recognition. In: CVPR, pp. 9964\u20139974 (2019)","DOI":"10.1109\/CVPR.2019.01020"},{"key":"1561_CR11","doi-asserted-by":"crossref","unstructured":"Hu, G., Cui, B., He, Y., Yu, S.: Progressive relation learning for group activity recognition. In: CVPR, pp. 980\u2013989 (2020)","DOI":"10.1109\/CVPR42600.2020.00106"},{"key":"1561_CR12","doi-asserted-by":"crossref","unstructured":"Gavrilyuk, K., Sanford, R., Javan, M., Snoek, C.G.: Actor-transformers for group activity recognition. In: CVPR, pp. 839\u2013848 (2020)","DOI":"10.1109\/CVPR42600.2020.00092"},{"key":"1561_CR13","doi-asserted-by":"crossref","unstructured":"Pramono, R.R.A., Chen, Y.T., Fang, W.H.: Empowering relational network by self-attention augmented conditional random fields for group activity recognition. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, August 23\u201328, 2020, Proceedings, Part I 16, pp. 71\u201390. Springer (2020)","DOI":"10.1007\/978-3-030-58452-8_5"},{"key":"1561_CR14","doi-asserted-by":"crossref","unstructured":"Ehsanpour, M., Abedin, A., Saleh, F., Shi, J., Reid, I., Rezatofighi, H.: Joint learning of social groups, individuals action and sub-group activities in videos. In: ECCV, pp. 177\u2013195. Springer (2020)","DOI":"10.1007\/978-3-030-58545-7_11"},{"key":"1561_CR15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3034233","author":"R Yan","year":"2020","unstructured":"Yan, R., Xie, L., Tang, J., Shu, X., Tian, Q.: HIGCIN: hierarchical graph-based cross inference network for group activity recognition. IEEE Trans. Pattern Anal. Mach. Intell. (2020). https:\/\/doi.org\/10.1109\/TPAMI.2020.3034233","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1561_CR16","doi-asserted-by":"crossref","unstructured":"Yuan, H., Ni, D.: Learning visual context for group activity recognition. In: AAAI, vol. 35, pp. 3261\u20133269 (2021)","DOI":"10.1609\/aaai.v35i4.16437"},{"key":"1561_CR17","doi-asserted-by":"crossref","unstructured":"Li, S., Cao, Q., Liu, L., Yang, K., Liu, S., Hou, J., Yi, S.: Groupformer: Group activity recognition with clustered spatial-temporal transformer. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.01341"},{"key":"1561_CR18","doi-asserted-by":"crossref","unstructured":"Bagautdinov, T., Alahi, A., Fleuret, F., Fua, P., Savarese, S.: Social scene understanding: End-to-end multi-person action localization and collective activity recognition. In: CVPR, pp. 4315\u20134324 (2017)","DOI":"10.1109\/CVPR.2017.365"},{"key":"1561_CR19","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1109\/TIP.2019.2918725","volume":"29","author":"P Zhang","year":"2019","unstructured":"Zhang, P., Tang, Y., Hu, J.-F., Zheng, W.-S.: Fast collective activity recognition under weak supervision. IEEE Trans. Image Process. 29, 29\u201343 (2019)","journal-title":"IEEE Trans. Image Process."},{"key":"1561_CR20","doi-asserted-by":"crossref","unstructured":"Yan, R., Xie, L., Tang, J., Shu, X., Tian, Q.: Social adaptive module for weakly-supervised group activity recognition. In: ECCV, pp. 208\u2013224. Springer (2020)","DOI":"10.1007\/978-3-030-58598-3_13"},{"key":"1561_CR21","doi-asserted-by":"crossref","unstructured":"Kim, D., Lee, J., Cho, M., Kwak, S.: Detector-free weakly supervised group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20083\u201320093 (2022)","DOI":"10.1109\/CVPR52688.2022.01945"},{"key":"1561_CR22","doi-asserted-by":"crossref","unstructured":"Amer, M.R., Xie, D., Zhao, M., Todorovic, S., Zhu, S.-C.: Cost-sensitive top-down\/bottom-up inference for multiscale activity recognition. In: ECCV, pp. 187\u2013200. Springer (2012)","DOI":"10.1007\/978-3-642-33765-9_14"},{"key":"1561_CR23","doi-asserted-by":"crossref","unstructured":"Amer, M.R., Todorovic, S., Fern, A., Zhu, S.-C.: Monte carlo tree search for scheduling activity recognition. In: ICCV, pp. 1353\u20131360 (2013)","DOI":"10.1109\/ICCV.2013.171"},{"key":"1561_CR24","doi-asserted-by":"crossref","unstructured":"Amer, M.R., Lei, P., Todorovic, S.: HIRF: Hierarchical random field for collective activity recognition in videos. In: ECCV, pp. 572\u2013585. Springer (2014)","DOI":"10.1007\/978-3-319-10599-4_37"},{"issue":"4","key":"1561_CR25","doi-asserted-by":"publisher","first-page":"800","DOI":"10.1109\/TPAMI.2015.2465955","volume":"38","author":"MR Amer","year":"2015","unstructured":"Amer, M.R., Todorovic, S.: Sum product networks for activity recognition. IEEE Trans. Pattern Anal. Mach. Intell. 38(4), 800\u2013813 (2015)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"8","key":"1561_CR26","doi-asserted-by":"publisher","first-page":"1549","DOI":"10.1109\/TPAMI.2011.228","volume":"34","author":"T Lan","year":"2011","unstructured":"Lan, T., Wang, Y., Yang, W., Robinovitch, S.N., Mori, G.: Discriminative latent models for recognizing contextual group activities. IEEE Trans. Pattern Anal. Mach. Intell. 34(8), 1549\u20131562 (2011)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1561_CR27","doi-asserted-by":"crossref","unstructured":"Lan, T., Sigal, L., Mori, G.: Social roles in hierarchical models for human activity recognition. In: CVPR, pp. 1354\u20131361. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247821"},{"key":"1561_CR28","doi-asserted-by":"crossref","unstructured":"Shu, T., Xie, D., Rothrock, B., Todorovic, S., Chun\u00a0Zhu, S.: Joint inference of groups, events and human roles in aerial videos. In: CVPR, pp. 4576\u20134584 (2015)","DOI":"10.1109\/CVPR.2015.7299088"},{"key":"1561_CR29","doi-asserted-by":"crossref","unstructured":"Wang, Z., Shi, Q., Shen, C., Van Den\u00a0Hengel, A.: Bilinear programming for human activity recognition with unknown mrf graphs. In: CVPR, pp. 1690\u20131697 (2013)","DOI":"10.1109\/CVPR.2013.221"},{"key":"1561_CR30","doi-asserted-by":"crossref","unstructured":"Deng, Z., Vahdat, A., Hu, H., Mori, G.: Structure inference machines: Recurrent neural networks for analyzing relations in group activity recognition. In: CVPR, pp. 4772\u20134781 (2016)","DOI":"10.1109\/CVPR.2016.516"},{"key":"1561_CR31","doi-asserted-by":"crossref","unstructured":"Ibrahim, M.S., Mori, G.: Hierarchical relational networks for group activity recognition and retrieval. In: ECCV, pp. 721\u2013736 (2018)","DOI":"10.1007\/978-3-030-01219-9_44"},{"key":"1561_CR32","doi-asserted-by":"crossref","unstructured":"Li, X., Choo\u00a0Chuah, M.: Sbgar: Semantics based group activity recognition. In: ICCV, pp. 2876\u20132885 (2017)","DOI":"10.1109\/ICCV.2017.313"},{"key":"1561_CR33","doi-asserted-by":"crossref","unstructured":"Qi, M., Qin, J., Li, A., Wang, Y., Luo, J., Van\u00a0Gool, L.: stagnet: An attentive semantic RNN for group activity recognition. In: ECCV, pp. 101\u2013117 (2018)","DOI":"10.1007\/978-3-030-01249-6_7"},{"key":"1561_CR34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2942030","author":"X Shu","year":"2019","unstructured":"Shu, X., Tang, J., Qi, G., Liu, W., Yang, J.: Hierarchical long short-term concurrent memory for human interaction recognition. IEEE Trans. Pattern Anal. Mach. Intell. (2019). https:\/\/doi.org\/10.1109\/TPAMI.2019.2942030","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1561_CR35","doi-asserted-by":"crossref","unstructured":"Wang, M., Ni, B., Yang, X.: Recurrent modeling of interaction context for collective activity recognition. In: CVPR, pp. 3048\u20133056 (2017)","DOI":"10.1109\/CVPR.2017.783"},{"key":"1561_CR36","doi-asserted-by":"crossref","unstructured":"Yan, R., Tang, J., Shu, X., Li, Z., Tian, Q.: Participation-contributed temporal dynamic model for group activity recognition. In: Proceedings of the 26th ACM International Conference on Multimedia, pp. 1292\u20131300 (2018)","DOI":"10.1145\/3240508.3240572"},{"key":"1561_CR37","doi-asserted-by":"crossref","unstructured":"Yuan, H., Ni, D., Wang, M.: Spatio-temporal dynamic inference network for group activity recognition. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00738"},{"key":"1561_CR38","doi-asserted-by":"crossref","unstructured":"Han, M., Zhang, D.J., Wang, Y., Yan, R., Yao, L., Chang, X., Qiao, Y.: Dual-ai: Dual-path actor interaction learning for group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2990\u20132999 (2022)","DOI":"10.1109\/CVPR52688.2022.00300"},{"key":"1561_CR39","doi-asserted-by":"crossref","unstructured":"Tamura, M., Vishwakarma, R., Vennelakanti, R.: Hunting group clues with transformers for social group activity recognition. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part IV, pp. 19\u201335. Springer (2022)","DOI":"10.1007\/978-3-031-19772-7_2"},{"key":"1561_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Li, X., Marsic, I.: Multi-label activity recognition using activity-specific features and activity correlations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14625\u201314635 (2021)","DOI":"10.1109\/CVPR46437.2021.01439"},{"key":"1561_CR41","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. In: NIPS, pp. 5998\u20136008 (2017)"},{"key":"1561_CR42","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"1561_CR43","unstructured":"Li, M., Cai, W., Liu, R., Weng, Y., Zhao, X., Wang, C., Chen, X., Liu, Z., Pan, C., Li, M., et al. Ffa-ir: Towards an explainable and reliable medical report generation benchmark. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2) (2021)"},{"key":"1561_CR44","doi-asserted-by":"crossref","unstructured":"Yuan, L., Chen, Y., Wang, T., Yu, W., Shi, Y., Jiang, Z., Tay, F.E., Feng, J., Yan, S.: Tokens-to-token vit: Training vision transformers from scratch on imagenet. arXiv preprint arXiv:2101.11986 (2021)","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"1561_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., Guo, B.: Swin transformer: Hierarchical vision transformer using shifted windows. In: ICCV (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"1561_CR46","doi-asserted-by":"crossref","unstructured":"Wang, W., Xie, E., Li, X., Fan, D.-P., Song, K., Liang, D., Lu, T., Luo, P., Shao, L.: Pyramid vision transformer: A versatile backbone for dense prediction without convolutions. arXiv preprint arXiv:2102.12122 (2021)","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"1561_CR47","doi-asserted-by":"crossref","unstructured":"Han, M., Wang, Y., Chang, X., Qiao, Y.: Mining inter-video proposal relations for video object detection. In: European Conference on Computer Vision, pp. 431\u2013446 (2020). Springer","DOI":"10.1007\/978-3-030-58589-1_26"},{"key":"1561_CR48","doi-asserted-by":"crossref","unstructured":"Arnab, A., Dehghani, M., Heigold, G., Sun, C., Lucic, M., Schmid, C.: Vivit: A video vision transformer. arXiv preprint arXiv:2103.15691 (2021)","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"1561_CR49","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., Zhang, J., Gao, P., Song, G., Liu, Y., Li, H., Qiao, Y.: UniFormer: Unifying Convolution and Self-attention for Visual Recognition (2022)","DOI":"10.1109\/TPAMI.2023.3282631"},{"key":"1561_CR50","doi-asserted-by":"crossref","first-page":"4","DOI":"10.34142\/27091805.2021.2.01.01","volume":"2","author":"G Bertasius","year":"2021","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? ICML 2, 4 (2021)","journal-title":"ICML"},{"key":"1561_CR51","doi-asserted-by":"crossref","unstructured":"Fan, H., Xiong, B., Mangalam, K., Li, Y., Yan, Z., Malik, J., Feichtenhofer, C.: Multiscale vision transformers. arXiv preprint arXiv:2104.11227 (2021)","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"1561_CR52","unstructured":"Patrick, M., Campbell, D., Asano, Y.M., Metze, I.M.F., Feichtenhofer, C., Vedaldi, A., Henriques, J., et al.: Keeping your eye on the ball: Trajectory attention in video transformers. In: NeurIPS (2021)"},{"key":"1561_CR53","doi-asserted-by":"crossref","unstructured":"Yang, A., Miech, A., Sivic, J., Laptev, I., Schmid, C.: Tubedetr: Spatio-temporal video grounding with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16442\u201316453 (2022)","DOI":"10.1109\/CVPR52688.2022.01595"},{"key":"1561_CR54","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: A metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"1561_CR55","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: ECCV, pp. 213\u2013229 (2020). Springer","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1561_CR56","doi-asserted-by":"crossref","unstructured":"Yan, B., Peng, H., Fu, J., Wang, D., Lu, H.: Learning spatio-temporal transformer for visual tracking. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10448\u201310457 (2021)","DOI":"10.1109\/ICCV48922.2021.01028"},{"key":"1561_CR57","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"key":"1561_CR58","doi-asserted-by":"crossref","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: Imagenet large scale visual recognition challenge. In: IJCV (2015)","DOI":"10.1007\/s11263-015-0816-y"},{"key":"1561_CR59","doi-asserted-by":"crossref","first-page":"4","DOI":"10.34142\/27091805.2021.2.01.01","volume":"2","author":"G Bertasius","year":"2021","unstructured":"Bertasius, G., Wang, H., Torresani, L.: Is space-time attention all you need for video understanding? ICML 2, 4 (2021)","journal-title":"ICML"},{"key":"1561_CR60","unstructured":"Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. In: ICLR (2015)"},{"key":"1561_CR61","unstructured":"Steiner, A., Kolesnikov, A., Zhai, X., Wightman, R., Uszkoreit, J., Beyer, L.: How to train your vit? Data, augmentation, and regularization in vision transformers (2021)"},{"key":"1561_CR62","doi-asserted-by":"crossref","unstructured":"Chen*, X., Xie*, S., He, K.: An empirical study of training self-supervised vision transformers (2021)","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"1561_CR63","doi-asserted-by":"crossref","unstructured":"Han, R., Yan, H., Li, J., Wang, S., Feng, W., Wang, S.: Panoramic human activity recognition. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part IV, pp. 244\u2013261. Springer (2022)","DOI":"10.1007\/978-3-031-19772-7_15"},{"key":"1561_CR64","doi-asserted-by":"crossref","unstructured":"Azar, S.M., Atigh, M.G., Nickabadi, A., Alahi, A.: Convolutional relational machine for group activity recognition. In: CVPR, pp. 7892\u20137901 (2019)","DOI":"10.1109\/CVPR.2019.00808"},{"key":"1561_CR65","doi-asserted-by":"crossref","unstructured":"Chappa, N.V., Nguyen, P., Nelson, A.H., Seo, H.-S., Li, X., Dobbs, P.D., Luu, K.: Sogar: Self-supervised spatiotemporal attention-based social group activity recognition. arXiv preprint arXiv:2305.06310 (2023)","DOI":"10.2139\/ssrn.4504147"},{"key":"1561_CR66","doi-asserted-by":"crossref","unstructured":"Chappa, N.V., Nguyen, P., Nelson, A.H., Seo, H.-S., Li, X., Dobbs, P.D., Luu, K.: Spartan: Self-supervised spatiotemporal transformers approach to group activity recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5157\u20135167 (2023)","DOI":"10.1109\/CVPRW59228.2023.00544"},{"key":"1561_CR67","doi-asserted-by":"crossref","unstructured":"Ehsanpour, M., Saleh, F., Savarese, S., Reid, I., Rezatofighi, H.: Jrdb-act: A large-scale dataset for spatio-temporal action, social group and activity detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20983\u201320992 (2022)","DOI":"10.1109\/CVPR52688.2022.02031"}],"container-title":["Machine Vision and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-024-01561-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00138-024-01561-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00138-024-01561-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,24]],"date-time":"2024-11-24T09:19:20Z","timestamp":1732439960000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00138-024-01561-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,7]]},"references-count":67,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["1561"],"URL":"https:\/\/doi.org\/10.1007\/s00138-024-01561-z","relation":{"has-preprint":[{"id-type":"doi","id":"10.21203\/rs.3.rs-4109494\/v1","asserted-by":"object"}]},"ISSN":["0932-8092","1432-1769"],"issn-type":[{"value":"0932-8092","type":"print"},{"value":"1432-1769","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,7]]},"assertion":[{"value":"15 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 May 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 May 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 July 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no Conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"102"}}