{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T20:02:28Z","timestamp":1770148948178,"version":"3.49.0"},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,4,28]],"date-time":"2023-04-28T00:00:00Z","timestamp":1682640000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,4,28]],"date-time":"2023-04-28T00:00:00Z","timestamp":1682640000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100004608","name":"Natural Science Foundation of Jiangsu Province","doi-asserted-by":"publisher","award":["Grants No.BK20210595"],"award-info":[{"award-number":["Grants No.BK20210595"]}],"id":[{"id":"10.13039\/501100004608","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Open Project of Anhui Provincial Key Laboratory of Multimodal Cognitive Computation,  Anhui University","award":["No MMC202010"],"award-info":[{"award-number":["No MMC202010"]}]},{"name":"Open Project of Anhui Provincial Key Laboratory of Multimodal Cognitive Computation,  Anhui University","award":["No MMC202010"],"award-info":[{"award-number":["No MMC202010"]}]},{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["No.2020AAA0106200"],"award-info":[{"award-number":["No.2020AAA0106200"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Nature Science Foundation of China under Grants","award":["No.61872424"],"award-info":[{"award-number":["No.61872424"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2023,8]]},"DOI":"10.1007\/s00530-023-01091-0","type":"journal-article","created":{"date-parts":[[2023,4,28]],"date-time":"2023-04-28T16:02:12Z","timestamp":1682697732000},"page":"2181-2191","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Centralized sub-critic based hierarchical-structured reinforcement learning for temporal sentence grounding"],"prefix":"10.1007","volume":"29","author":[{"given":"Yingyuan","family":"Zhao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhiyi","family":"Tan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bing-Kun","family":"Bao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhengzheng","family":"Tu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,4,28]]},"reference":[{"key":"1091_CR1","unstructured":"Andrychowicz, M., Wolski, F., Ray, A., Schneider, J., Fong, R., Welinder, P., McGrew, B., Tobin, J., Pieter\u00a0Abbeel, O., Zaremba, W.: Hindsight experience replay. Advances in neural information processing systems 30 (2017)"},{"key":"1091_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE international conference on computer vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"1091_CR3","doi-asserted-by":"crossref","unstructured":"Bacon, P.L., Harb, J., Precup, D.: The option-critic architecture. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a031 (2017)","DOI":"10.1609\/aaai.v31i1.10916"},{"key":"1091_CR4","doi-asserted-by":"crossref","unstructured":"Chaplot, D.S., Sathyendra, K.M., Pasumarthi, R.K., Rajagopal, D., Salakhutdinov, R.: Gated-attention architectures for task-oriented language grounding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11832"},{"key":"1091_CR5","first-page":"8199","volume":"33","author":"S Chen","year":"2019","unstructured":"Chen, S., Jiang, Y.G.: Semantic proposal for activity localization in videos via sentence query. Proc AAAI Conf Artif Intell 33, 8199\u20138206 (2019)","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"1091_CR6","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using rnn encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"1091_CR7","doi-asserted-by":"crossref","unstructured":"Foerster, J., Farquhar, G., Afouras, T., Nardelli, N., Whiteson, S.: Counterfactual multi-agent policy gradients. In: Proceedings of the AAAI conference on artificial intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11794"},{"key":"1091_CR8","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: Temporal activity localization via language query. In: Proceedings of the IEEE international conference on computer vision, pp. 5267\u20135275 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"1091_CR9","doi-asserted-by":"crossref","unstructured":"Gao, J., Xu, C.: Fast video moment retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1523\u20131532 (2021)","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"1091_CR10","doi-asserted-by":"crossref","unstructured":"Ge, R., Gao, J., Chen, K., Nevatia, R.: Mac: Mining activity concepts for language-based temporal localization. In: 2019 IEEE winter conference on applications of computer vision (WACV), pp. 245\u2013253. IEEE (2019)","DOI":"10.1109\/WACV.2019.00032"},{"key":"1091_CR11","unstructured":"Hahn, M., Kadav, A., Rehg, J.M., Graf, H.P.: Tripping through time: Efficient localization of activities in videos. arXiv preprint arXiv:1904.09936 (2019)"},{"key":"1091_CR12","first-page":"8393","volume":"33","author":"D He","year":"2019","unstructured":"He, D., Zhao, X., Huang, J., Li, F., Liu, X., Wen, S.: Read, watch, and move: Reinforcement learning for temporally grounding natural language descriptions in videos. Proc AAAI Conf Artif Intell 33, 8393\u20138400 (2019)","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"1091_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, B., Huang, X., Yang, C., Yuan, J.: Cross-modal video moment retrieval with spatial and language-temporal attention. In: Proceedings of the 2019 on international conference on multimedia retrieval, pp. 217\u2013225 (2019)","DOI":"10.1145\/3323873.3325019"},{"key":"1091_CR14","unstructured":"Kiros, R., Zhu, Y., Salakhutdinov, R.R., Zemel, R., Urtasun, R., Torralba, A., Fidler, S.: Skip-thought vectors. Advances in neural information processing systems 28 (2015)"},{"key":"1091_CR15","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE international conference on computer vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"1091_CR16","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., He, X., Chen, B., Chua, T.S.: Attentive moment retrieval in videos. In: The 41st international ACM SIGIR conference on research & development in information retrieval, pp. 15\u201324 (2018)","DOI":"10.1145\/3209978.3210003"},{"key":"1091_CR17","doi-asserted-by":"crossref","unstructured":"Liu, M., Wang, X., Nie, L., Tian, Q., Chen, B., Chua, T.S.: Cross-modal moment localization in videos. In: Proceedings of the 26th ACM international conference on Multimedia, pp. 843\u2013851 (2018)","DOI":"10.1145\/3240508.3240549"},{"key":"1091_CR18","unstructured":"Lowe, R., Wu, Y.I., Tamar, A., Harb, J., Pieter\u00a0Abbeel, O., Mordatch, I.: Multi-agent actor-critic for mixed cooperative-competitive environments. Advances in neural information processing systems 30 (2017)"},{"key":"1091_CR19","unstructured":"Mnih, V., Badia, A.P., Mirza, M., Graves, A., Lillicrap, T., Harley, T., Silver, D., Kavukcuoglu, K.: Asynchronous methods for deep reinforcement learning. In: International conference on machine learning, pp. 1928\u20131937. PMLR (2016)"},{"issue":"9","key":"1091_CR20","doi-asserted-by":"publisher","first-page":"2434","DOI":"10.1109\/TMM.2019.2957854","volume":"22","author":"K Ning","year":"2019","unstructured":"Ning, K., Cai, M., Xie, D., Wu, F.: An attentive sequence to sequence translator for localizing video clips by natural language. IEEE Transact Multimedia 22(9), 2434\u20132443 (2019)","journal-title":"IEEE Transact Multimedia"},{"key":"1091_CR21","unstructured":"Rodriguez, C., Marrese-Taylor, E., Saleh, F.S., Li, H., Gould, S.: Proposal-free temporal moment localization of a natural-language query in video using guided attention. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2464\u20132473 (2020)"},{"key":"1091_CR22","first-page":"2514","volume":"35","author":"H Ryu","year":"2021","unstructured":"Ryu, H., Kang, S., Kang, H., Yoo, C.D.: Semantic grouping network for video captioning. Proc AAAI Conf Artificial Intell 35, 2514\u20132522 (2021)","journal-title":"Proc AAAI Conf Artificial Intell"},{"key":"1091_CR23","first-page":"11352","volume":"35","author":"J Su","year":"2021","unstructured":"Su, J., Adams, S., Beling, P.: Value-decomposition multi-agent actor-critics. Proc AAAI Conf Artif Intell 35, 11352\u201311360 (2021)","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"1091_CR24","doi-asserted-by":"publisher","first-page":"5589","DOI":"10.1109\/TIP.2021.3086591","volume":"30","author":"X Sun","year":"2021","unstructured":"Sun, X., Wang, H., He, B.: Maban: Multi-agent boundary-aware network for natural language moment retrieval. IEEE Transact Image Proc 30, 5589\u20135599 (2021)","journal-title":"IEEE Transact Image Proc"},{"key":"1091_CR25","volume-title":"Reinforcement learning: an introduction","author":"RS Sutton","year":"2018","unstructured":"Sutton, R.S., Barto, A.G.: Reinforcement learning: an introduction. MIT press (2018)"},{"key":"1091_CR26","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"1091_CR27","unstructured":"Vezhnevets, A.S., Osindero, S., Schaul, T., Heess, N., Jaderberg, M., Silver, D., Kavukcuoglu, K.: Feudal networks for hierarchical reinforcement learning. In: International Conference on Machine Learning, pp. 3540\u20133549. PMLR (2017)"},{"key":"1091_CR28","doi-asserted-by":"crossref","unstructured":"Wang, J., Ma, L., Jiang, W.: Temporally grounding language queries in videos by contextual boundary-aware prediction. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12168\u201312175 (2020)","DOI":"10.1609\/aaai.v34i07.6897"},{"key":"1091_CR29","doi-asserted-by":"crossref","unstructured":"Wang, L., Xiong, Y., Wang, Z., Qiao, Y., Lin, D., Tang, X., Gool, L.V.: Temporal segment networks: Towards good practices for deep action recognition. In: European conference on computer vision, pp. 20\u201336. Springer (2016)","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"1091_CR30","doi-asserted-by":"crossref","unstructured":"Wang, W., Huang, Y., Wang, L.: Language-driven temporal activity localization: A semantic matching reinforcement learning model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 334\u2013343 (2019)","DOI":"10.1109\/CVPR.2019.00042"},{"key":"1091_CR31","doi-asserted-by":"crossref","unstructured":"Wang, X., Chen, W., Wu, J., Wang, Y.F., Wang, W.Y.: Video captioning via hierarchical reinforcement learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4213\u20134222 (2018)","DOI":"10.1109\/CVPR.2018.00443"},{"key":"1091_CR32","doi-asserted-by":"crossref","unstructured":"Wu, J., Li, G., Liu, S., Lin, L.: Tree-structured policy based progressive reinforcement learning for temporally language grounding in video. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12386\u201312393 (2020)","DOI":"10.1609\/aaai.v34i07.6924"},{"key":"1091_CR33","doi-asserted-by":"crossref","unstructured":"Wu, W., He, D., Tan, X., Chen, S., Wen, S.: Multi-agent reinforcement learning based frame sampling for effective untrimmed video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6222\u20136231 (2019)","DOI":"10.1109\/ICCV.2019.00632"},{"key":"1091_CR34","doi-asserted-by":"crossref","unstructured":"Xiao, S., Chen, L., Shao, J., Zhuang, Y., Xiao, J.: Natural language video localization with learnable moment proposals. arXiv preprint arXiv:2109.10678 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.327"},{"key":"1091_CR35","doi-asserted-by":"crossref","unstructured":"Xu, H., He, K., Plummer, B.A., Sigal, L., Sclaroff, S., Saenko, K.: Multilevel language and vision integration for text-to-clip retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9062\u20139069 (2019)","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"1091_CR36","doi-asserted-by":"crossref","unstructured":"Xu, H., He, K., Plummer, B.A., Sigal, L., Sclaroff, S., Saenko, K.: Multilevel language and vision integration for text-to-clip retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9062\u20139069 (2019)","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"1091_CR37","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Ma, L., Wang, J., Liu, W., Zhu, W.: Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Advances in Neural Information Processing Systems 32 (2019)","DOI":"10.1109\/TPAMI.2020.3038993"},{"key":"1091_CR38","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Mei, T., Zhu, W.: To find where you talk: Temporal sentence localization in video with attention based location regression. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9159\u20139166 (2019)","DOI":"10.1609\/aaai.v33i01.33019159"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01091-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-023-01091-0\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01091-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,7,14]],"date-time":"2023-07-14T10:28:51Z","timestamp":1689330531000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-023-01091-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,28]]},"references-count":38,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,8]]}},"alternative-id":["1091"],"URL":"https:\/\/doi.org\/10.1007\/s00530-023-01091-0","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"value":"0942-4962","type":"print"},{"value":"1432-1882","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,4,28]]},"assertion":[{"value":"14 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 April 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 April 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}