{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T07:33:51Z","timestamp":1771054431773,"version":"3.50.1"},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2022,8,11]],"date-time":"2022-08-11T00:00:00Z","timestamp":1660176000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,8,11]],"date-time":"2022-08-11T00:00:00Z","timestamp":1660176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100007129","name":"Natural Science Foundation of Shandong Province","doi-asserted-by":"publisher","award":["No. ZR2020MF136"],"award-info":[{"award-number":["No. ZR2020MF136"]}],"id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100014103","name":"Key Research and Development Plan of Shandong Province","doi-asserted-by":"crossref","award":["No. 2019GGX101015"],"award-info":[{"award-number":["No. 2019GGX101015"]}],"id":[{"id":"10.13039\/100014103","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["No. 20CX05018A"],"award-info":[{"award-number":["No. 20CX05018A"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1007\/s11760-022-02324-x","type":"journal-article","created":{"date-parts":[[2022,8,11]],"date-time":"2022-08-11T06:04:52Z","timestamp":1660197892000},"page":"1173-1180","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Spatial\u2013temporal injection network: exploiting auxiliary losses for action recognition with apparent difference and self-attention"],"prefix":"10.1007","volume":"17","author":[{"given":"Haiwen","family":"Cao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0944-2564","authenticated-orcid":false,"given":"Chunlei","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jing","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Leiquan","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,8,11]]},"reference":[{"key":"2324_CR1","doi-asserted-by":"crossref","unstructured":"He, K., et al.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"2324_CR2","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"2324_CR3","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. Adv Neural Inf. Process. Syst. (2014)"},{"key":"2324_CR4","doi-asserted-by":"crossref","unstructured":"Tran, D., et al.: Learning spatiotemporal features with 3D convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"2324_CR5","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Temporal segment networks: towards good practices for deep action recognition. In: European Conference on Computer Vision (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"2324_CR6","doi-asserted-by":"crossref","unstructured":"Wang, H., et al.: Action recognition by dense trajectories (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"2324_CR7","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: Proceedings of the IEEE International Conference on Computer Vision (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"2324_CR8","unstructured":"Lan, Z., et al.: Beyond Gaussian pyramid: multi-skip feature stacking for action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)"},{"key":"2324_CR9","doi-asserted-by":"crossref","unstructured":"Wang, Y., et al.: Spatiotemporal pyramid network for video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.226"},{"key":"2324_CR10","doi-asserted-by":"crossref","unstructured":"Yang, K., et al.: IF-TTN: information fused temporal transformation network for video action recognition (2019)","DOI":"10.1109\/ICASSP40776.2020.9053394"},{"key":"2324_CR11","unstructured":"Wang, Y., et al.: Reversing two-stream networks with decoding discrepancy penalty for robust action recognition (2018)"},{"key":"2324_CR12","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Lin, D.: Recognize actions by disentangling components of dynamics. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00687"},{"key":"2324_CR13","doi-asserted-by":"crossref","unstructured":"Wang, X., et al.: Non-local neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2018)","DOI":"10.1109\/CVPR.2018.00813"},{"key":"2324_CR14","doi-asserted-by":"crossref","unstructured":"Shou, Z., et al.: Dmc-net: generating discriminative motion cues for fast compressed video action recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2019)","DOI":"10.1109\/CVPR.2019.00136"},{"key":"2324_CR15","unstructured":"Ioffe, S., Szegedy, C. Batch normalization: accelerating deep network training by reducing internal covariate shift (2015)"},{"key":"2324_CR16","unstructured":"Girdhar, R., Ramanan, D. Attentional pooling for action recognition. Adv. Neural Inf. Process. Syst. (2017)"},{"key":"2324_CR17","doi-asserted-by":"crossref","unstructured":"Tran, A., Cheong, L.-F.: Two-stream flow-guided convolutional attention networks for action recognition. In: Proceedings of the IEEE International Conference on Computer Vision (2017)","DOI":"10.1109\/ICCVW.2017.368"},{"key":"2324_CR18","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild (2012)"},{"key":"2324_CR19","doi-asserted-by":"crossref","unstructured":"Kuehne, H., et al.: HMDB: a large video database for human motion recognition. In: 2011 International Conference on Computer Vision (2011)","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"2324_CR20","unstructured":"Zach, C., Pock, T., Bischof, H.: A duality based approach for realtime TV-L 1 optical flow. In: Joint Pattern Recognition Symposium (2007)"},{"issue":"3","key":"2324_CR21","doi-asserted-by":"publisher","first-page":"254","DOI":"10.1007\/s11263-015-0859-0","volume":"119","author":"L Wang","year":"2016","unstructured":"Wang, L., Qiao, Y., Tang, X.: MoFAP: a multi-level representation for action recognition. Int. J. Comput. Vis. 119(3), 254\u2013271 (2016)","journal-title":"Int. J. Comput. Vis."},{"key":"2324_CR22","doi-asserted-by":"crossref","unstructured":"Wang, L., Qiao, Y., Tang, X.: Action recognition with trajectory-pooled deep-convolutional descriptors. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (2015)","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"2324_CR23","unstructured":"Sharma, S., Kiros, R., Salakhutdinov, R. Action recognition using visual attention (2015)"},{"issue":"3","key":"2324_CR24","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"2324_CR25","doi-asserted-by":"publisher","first-page":"109","DOI":"10.1016\/j.cviu.2016.03.013","volume":"150","author":"X Peng","year":"2016","unstructured":"Peng, X., et al.: Bag of visual words and fusion methods for action recognition: comprehensive study and good practice. Comput. Vis. Image Underst. 150, 109\u2013125 (2016)","journal-title":"Comput. Vis. Image Underst."},{"issue":"6","key":"2324_CR26","doi-asserted-by":"publisher","first-page":"1510","DOI":"10.1109\/TPAMI.2017.2712608","volume":"40","author":"G Varol","year":"2017","unstructured":"Varol, G., Laptev, I., Schmid, C.: Long-term temporal convolutions for action recognition. IEEE Trans. Pattern Anal. Mach. Intell. 40(6), 1510\u20131517 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"2324_CR27","unstructured":"Huang, S., Lin, X., Karaman, S., et al.: Flow-distilled IP two-stream networks for compressed video action recognition. In: Computer Vision and Pattern Recognition (2019)"},{"key":"2324_CR28","doi-asserted-by":"crossref","unstructured":"Cao, D., Xu, L., Chen, H., et al.: Action recognition in untrimmed videos with composite self-attention two-stream framework. In: Computer Vision and Pattern Recognition (2019)","DOI":"10.1007\/978-3-030-41299-9_3"},{"key":"2324_CR29","doi-asserted-by":"crossref","unstructured":"Du, Y., Yuan, C., Li, B., et al.: Interaction-aware spatio-temporal pyramid attention networks for action classification. In: European Conference on Computer Vision, pp. 388\u2013404 (2018)","DOI":"10.1007\/978-3-030-01270-0_23"},{"key":"2324_CR30","unstructured":"Deep draw. https:\/\/github.com\/auduno\/deepdraw (2016)"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-022-02324-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-022-02324-x\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-022-02324-x.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,4,24]],"date-time":"2023-04-24T16:19:08Z","timestamp":1682353148000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-022-02324-x"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,11]]},"references-count":30,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,6]]}},"alternative-id":["2324"],"URL":"https:\/\/doi.org\/10.1007\/s11760-022-02324-x","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,8,11]]},"assertion":[{"value":"11 January 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"30 August 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 July 2022","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 August 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}