{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T03:05:28Z","timestamp":1740107128952,"version":"3.37.3"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"11","license":[{"start":{"date-parts":[[2022,9,22]],"date-time":"2022-09-22T00:00:00Z","timestamp":1663804800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,9,22]],"date-time":"2022-09-22T00:00:00Z","timestamp":1663804800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Vis Comput"],"published-print":{"date-parts":[[2023,11]]},"DOI":"10.1007\/s00371-022-02673-1","type":"journal-article","created":{"date-parts":[[2022,9,22]],"date-time":"2022-09-22T18:04:03Z","timestamp":1663869843000},"page":"5469-5483","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Weakly supervised graph learning for action recognition in untrimmed video"],"prefix":"10.1007","volume":"39","author":[{"given":"Xiao","family":"Yao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9239-6721","authenticated-orcid":false,"given":"Jia","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruixuan","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dan","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifeng","family":"Zeng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,9,22]]},"reference":[{"key":"2673_CR1","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. arXiv preprint arXiv:1406.2199 (2014)"},{"key":"2673_CR2","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Van Gool, L.: Temporal segment networks: Towards good practices for deep action recognition. In: European Conference on Computer Vision, pp. 20\u201336 (2016). Springer","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"2673_CR3","doi-asserted-by":"crossref","unstructured":"Fan, L., Huang, W., Gan, C., Ermon, S., Gong, B., Huang, J.: End-to-end learning of motion representation for video understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6016\u20136025 (2018)","DOI":"10.1109\/CVPR.2018.00630"},{"issue":"10","key":"2673_CR4","doi-asserted-by":"publisher","first-page":"983","DOI":"10.1007\/s00371-012-0752-6","volume":"29","author":"S Vishwakarma","year":"2013","unstructured":"Vishwakarma, S., Agrawal, A.: A survey on activity recognition and behavior understanding in video surveillance. Vis. Comput. 29(10), 983\u20131009 (2013)","journal-title":"Vis. Comput."},{"key":"2673_CR5","doi-asserted-by":"crossref","unstructured":"Shou, Z., Chan, J., Zareian, A., Miyazawa, K., Chang, S.-F.: Cdc: Convolutional-de-convolutional networks for precise temporal action localization in untrimmed videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5734\u20135743 (2017)","DOI":"10.1109\/CVPR.2017.155"},{"key":"2673_CR6","doi-asserted-by":"crossref","unstructured":"Shou, Z., Wang, D., Chang, S.-F.: Temporal action localization in untrimmed videos via multi-stage cnns. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1049\u20131058 (2016)","DOI":"10.1109\/CVPR.2016.119"},{"key":"2673_CR7","doi-asserted-by":"crossref","unstructured":"Alwassel, H., Heilbron, F.C., Ghanem, B.: Action search: Spotting actions in videos and its application to temporal action localization. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 251\u2013266 (2018)","DOI":"10.1007\/978-3-030-01240-3_16"},{"key":"2673_CR8","doi-asserted-by":"crossref","unstructured":"Chao, Y.-W., Vijayanarasimhan, S., Seybold, B., Ross, D.A., Deng, J., Sukthankar, R.: Rethinking the faster r-cnn architecture for temporal action localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1130\u20131139 (2018)","DOI":"10.1109\/CVPR.2018.00124"},{"key":"2673_CR9","doi-asserted-by":"crossref","unstructured":"Dai, X., Singh, B., Zhang, G., Davis, L.S., Qiu Chen, Y.: Temporal context network for activity localization in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5793\u20135802 (2017)","DOI":"10.1109\/ICCV.2017.610"},{"key":"2673_CR10","first-page":"1097","volume":"25","author":"A Krizhevsky","year":"2012","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. Adv. Neural. Inf. Process. Syst. 25, 1097\u20131105 (2012)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"2673_CR11","unstructured":"Duan, X., Huang, W., Gan, C., Wang, J., Zhu, W., Huang, J.: Weakly supervised dense event captioning in videos. arXiv preprint arXiv:1812.03849 (2018)"},{"key":"2673_CR12","doi-asserted-by":"crossref","unstructured":"Gan, C., Gong, B., Liu, K., Su, H., Guibas, L.J.: Geometry guided convolutional neural networks for self-supervised video representation learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5589\u20135597 (2018)","DOI":"10.1109\/CVPR.2018.00586"},{"key":"2673_CR13","doi-asserted-by":"crossref","unstructured":"Liu, D., Jiang, T., Wang, Y.: Completeness modeling and context separation for weakly supervised temporal action localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1298\u20131307 (2019)","DOI":"10.1109\/CVPR.2019.00139"},{"key":"2673_CR14","doi-asserted-by":"crossref","unstructured":"Narayan, S., Cholakkal, H., Khan, F.S., Shao, L.: 3c-net: Category count and center loss for weakly-supervised action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8679\u20138687 (2019)","DOI":"10.1109\/ICCV.2019.00877"},{"key":"2673_CR15","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Nevatia, R.: Cascaded boundary regression for temporal action detection. arXiv preprint arXiv:1705.01180 (2017)","DOI":"10.5244\/C.31.52"},{"key":"2673_CR16","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 988\u2013996 (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"2673_CR17","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: Advances in Neural Information Processing Systems, pp. 5998\u20136008 (2017)"},{"key":"2673_CR18","doi-asserted-by":"crossref","unstructured":"Hu, H., Gu, J., Zhang, Z., Dai, J., Wei, Y.: Relation networks for object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3588\u20133597 (2018)","DOI":"10.1109\/CVPR.2018.00378"},{"key":"2673_CR19","unstructured":"Kipf, T.N., Welling, M.: Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)"},{"key":"2673_CR20","doi-asserted-by":"crossref","unstructured":"Yan, S., Xiong, Y., Lin, D.: Spatial temporal graph convolutional networks for skeleton-based action recognition. In: Thirty-second AAAI Conference on Artificial Intelligence (2018)","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"2673_CR21","doi-asserted-by":"crossref","unstructured":"Zeng, R., Huang, W., Tan, M., Rong, Y., Zhao, P., Huang, J., Gan, C.: Graph convolutional networks for temporal action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7094\u20137103 (2019)","DOI":"10.1109\/ICCV.2019.00719"},{"key":"2673_CR22","doi-asserted-by":"crossref","unstructured":"Heilbron, F.C., Niebles, J.C., Ghanem, B.: Fast temporal activity proposals for efficient detection of human actions in untrimmed videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1914\u20131923 (2016)","DOI":"10.1109\/CVPR.2016.211"},{"key":"2673_CR23","unstructured":"Ren S, He K, Girshick R, et al. Faster r-cnn: Towards real-time object detection with region proposal networks[J]. Advances in neural information processing systems, 2015, 28."},{"key":"2673_CR24","doi-asserted-by":"crossref","unstructured":"Gao, J., Yang, Z., Chen, K., Sun, C., Nevatia, R.: Turn tap: Temporal unit regression network for temporal action proposals. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 3628\u20133636 (2017)","DOI":"10.1109\/ICCV.2017.392"},{"key":"2673_CR25","unstructured":"Xiong, Y., Zhao, Y., Wang, L., Lin, D., Tang, X.: A pursuit of temporal accuracy in general activity detection. arXiv preprint arXiv:1703.02716 (2017)"},{"key":"2673_CR26","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2914\u20132923 (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"2673_CR27","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-c3d: Region convolutional 3d network for temporal activity detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5783\u20135792 (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"2673_CR28","doi-asserted-by":"crossref","unstructured":"Yang, K., Qiao, P., Li, D., Lv, S., Dou, Y.: Exploring temporal preservation networks for precise temporal action localization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32 (2018)","DOI":"10.1609\/aaai.v32i1.12234"},{"key":"2673_CR29","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Lin, D., Van Gool, L.: Untrimmednets for weakly supervised action recognition and detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4325\u20134334 (2017)","DOI":"10.1109\/CVPR.2017.678"},{"key":"2673_CR30","doi-asserted-by":"crossref","unstructured":"Shou, Z., Gao, H., Zhang, L., Miyazawa, K., Chang, S.-F.: Autoloc: Weakly-supervised temporal action localization in untrimmed videos. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 154\u2013171 (2018)","DOI":"10.1007\/978-3-030-01270-0_10"},{"key":"2673_CR31","doi-asserted-by":"crossref","unstructured":"Xu, Y., Zhang, C., Cheng, Z., Xie, J., Niu, Y., Pu, S., Wu, F.: Segregated temporal assembly recurrent networks for weakly supervised multiple action detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 33, pp. 9070\u20139078 (2019)","DOI":"10.1609\/aaai.v33i01.33019070"},{"key":"2673_CR32","doi-asserted-by":"crossref","unstructured":"Tan, M., Shi, Q., van den Hengel, A., Shen, C., Gao, J., Hu, F., Zhang, Z.: Learning graph structure for multi-label image classification via clique generation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4100\u20134109 (2015)","DOI":"10.1109\/CVPR.2015.7299037"},{"key":"2673_CR33","doi-asserted-by":"crossref","unstructured":"Shen, Y., Li, H., Yi, S., Chen, D., Wang, X.: Person re-identification with deep similarity-guided graph neural network. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 486\u2013504 (2018)","DOI":"10.1007\/978-3-030-01267-0_30"},{"key":"2673_CR34","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Videos as space-time region graphs. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 399\u2013417 (2018)","DOI":"10.1007\/978-3-030-01228-1_25"},{"key":"2673_CR35","unstructured":"Hamilton, W.L., Ying, R., Leskovec, J.: Inductive representation learning on large graphs. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp. 1025\u20131035 (2017)"},{"key":"2673_CR36","unstructured":"Chen, J., Ma, T., Xiao, C.: Fastgcn: fast learning with graph convolutional networks via importance sampling. arXiv preprint arXiv:1801.10247 (2018)"},{"key":"2673_CR37","unstructured":"Huang, W., Zhang, T., Rong, Y., Huang, J.: Adaptive sampling towards fast graph representation learning. arXiv preprint arXiv:1809.05343 (2018)"},{"key":"2673_CR38","doi-asserted-by":"crossref","unstructured":"Shi, B., Dai, Q., Mu, Y., Wang, J.: Weakly-supervised action localization by generative attention modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1009\u20131019 (2020)","DOI":"10.1109\/CVPR42600.2020.00109"},{"key":"2673_CR39","doi-asserted-by":"crossref","unstructured":"Luo, Z., Guillory, D., Shi, B., Ke, W., Wan, F., Darrell, T., Xu, H.: Weakly-supervised action localization with expectation-maximization multi-instance learning. In: European Conference on Computer Vision, pp. 729\u2013745 (2020). Springer","DOI":"10.1007\/978-3-030-58526-6_43"},{"key":"2673_CR40","doi-asserted-by":"crossref","unstructured":"Min, K., Corso, J.J.: Adversarial background-aware loss for weakly-supervised temporal activity localization. In: European Conference on Computer Vision, pp. 283\u2013299 (2020). Springer","DOI":"10.1007\/978-3-030-58568-6_17"},{"key":"2673_CR41","doi-asserted-by":"crossref","unstructured":"Zhang, C., Cao, M., Yang, D., Chen, J., Zou, Y.: Cola: Weakly-supervised temporal action localization with snippet contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16010\u201316019 (2021)","DOI":"10.1109\/CVPR46437.2021.01575"},{"key":"2673_CR42","unstructured":"Yuan, Y., Lyu, Y., Shen, X., Tsang, I.W., Yeung, D.-Y.: Marginalized average attentional network for weakly-supervised learning. arXiv preprint arXiv:1905.08586 (2019)"},{"key":"2673_CR43","doi-asserted-by":"crossref","unstructured":"Yu, T., Ren, Z., Li, Y., Yan, E., Xu, N., Yuan, J.: Temporal structure mining for weakly supervised action detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5522\u20135531 (2019)","DOI":"10.1109\/ICCV.2019.00562"},{"key":"2673_CR44","doi-asserted-by":"crossref","unstructured":"Lee, P., Uh, Y., Byun, H.: Background suppression network for weakly-supervised temporal action localization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 34, pp. 11320\u201311327 (2020)","DOI":"10.1609\/aaai.v34i07.6793"},{"key":"2673_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Z., Wang, L., Zhang, Q., Tang, W., Yuan, J., Zheng, N., Hua, G.: Acsnet: Action-context separation network for weakly supervised temporal action localization. arXiv preprint arXiv:2103.15088 (2021)","DOI":"10.1609\/aaai.v35i3.16322"},{"key":"2673_CR46","unstructured":"Jiang, Y.-G., Liu, J., Roshan Zamir, A., Toderici, G., Laptev, I., Shah, M., Sukthankar, R.: THUMOS Challenge: Action Recognition with a Large Number of Classes. http:\/\/crcv.ucf.edu\/THUMOS14\/ (2014)"},{"key":"2673_CR47","unstructured":"Ghanem, B., Niebles, J.C., Snoek, C., Heilbron, F.C., Alwassel, H., Escorcia, V., Krishna, R., Buch, S., Dao, C.D.: The activitynet large-scale activity recognition challenge 2018 summary. arXiv preprint arXiv:1808.03766 (2018)"},{"key":"2673_CR48","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Su, H., Wang, C., Yang, M.: Bsn: Boundary sensitive network for temporal action proposal generation. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01225-0_1"},{"key":"2673_CR49","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? a new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"}],"container-title":["The Visual Computer"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-022-02673-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00371-022-02673-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00371-022-02673-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T15:05:00Z","timestamp":1698419100000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00371-022-02673-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,22]]},"references-count":49,"journal-issue":{"issue":"11","published-print":{"date-parts":[[2023,11]]}},"alternative-id":["2673"],"URL":"https:\/\/doi.org\/10.1007\/s00371-022-02673-1","relation":{},"ISSN":["0178-2789","1432-2315"],"issn-type":[{"type":"print","value":"0178-2789"},{"type":"electronic","value":"1432-2315"}],"subject":[],"published":{"date-parts":[[2022,9,22]]},"assertion":[{"value":"8 September 2022","order":1,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 September 2022","order":2,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declared that they had no conflicts of interest with respect to their authorship or the publication of this article.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}