{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T18:42:08Z","timestamp":1743014528607,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":46,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819786190"},{"type":"electronic","value":"9789819786206"}],"license":[{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,20]],"date-time":"2024-10-20T00:00:00Z","timestamp":1729382400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-8620-6_38","type":"book-chapter","created":{"date-parts":[[2024,10,19]],"date-time":"2024-10-19T21:02:10Z","timestamp":1729371730000},"page":"555-568","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Efficient Language-Driven Action Localization by Feature Aggregation and Prediction Adjustment"],"prefix":"10.1007","author":[{"given":"Zirui","family":"Shang","sequence":"first","affiliation":[]},{"given":"Shuo","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Xinxiao","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,20]]},"reference":[{"key":"38_CR1","doi-asserted-by":"crossref","unstructured":"Cui, R., Qian, T., Peng, P., Daskalaki, E., Chen, J., Guo, X., Sun, H., Jiang, Y.G.: Video moment retrieval from text queries via single frame annotation. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1033\u20131043 (2022)","DOI":"10.1145\/3477495.3532078"},{"key":"38_CR2","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: Tall: Temporal activity localization via language query. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 5277\u20135285 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"38_CR3","doi-asserted-by":"crossref","unstructured":"Gao, J., Xu, C.: Fast video moment retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 1523\u20131532 (2021)","DOI":"10.1109\/ICCV48922.2021.00155"},{"key":"38_CR4","unstructured":"Ghosh, S., Agarwal, A., Parekh, Z., Hauptmann, A.G.: Excl: Extractive clip localization using natural language descriptions. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 1984\u20131990 (2019)"},{"key":"38_CR5","doi-asserted-by":"crossref","unstructured":"Hendricks, L.A., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 5804\u20135813 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"38_CR6","doi-asserted-by":"publisher","first-page":"5933","DOI":"10.1109\/TIP.2021.3090521","volume":"30","author":"Y Hu","year":"2021","unstructured":"Hu, Y., Nie, L., Liu, M., Wang, K., Wang, Y., Hua, X.S.: Coarse-to-fine semantic alignment for cross-modal moment localization. IEEE Trans. Image Process. 30, 5933\u20135943 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"38_CR7","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"38_CR8","doi-asserted-by":"crossref","unstructured":"Liu, D., Hu, W.: Skimming, locating, then perusing: a human-like framework for natural language video localization. In: Proceedings of the 30th ACM International Conference on Multimedia (ACM MM), pp. 4536\u20134545 (2022)","DOI":"10.1145\/3503161.3547782"},{"key":"38_CR9","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Di, X., Cheng, Y., Xu, Z., Zhou, P.: Memory-guided semantic learning network for temporal sentence grounding. In: The Thirty-Sixth AAAI Conference on Artificial Intelligence (AAAI), pp. 1665\u20131673 (2022). https:\/\/ojs.aaai.org\/index.php\/AAAI\/article\/view\/20058","DOI":"10.1609\/aaai.v36i2.20058"},{"key":"38_CR10","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Dong, J., Zhou, P., Cheng, Y., Wei, W., Xu, Z., Xie, Y.: Context-aware biaffine localizing network for temporal sentence grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 11235\u201311244 (2021)","DOI":"10.1109\/CVPR46437.2021.01108"},{"key":"38_CR11","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Hu, W.: Reducing the vision and language bias for temporal sentence grounding. In: Proceedings of the 30th ACM International Conference on Multimedia (ACM MM), pp. 4092\u20134101 (2022)","DOI":"10.1145\/3503161.3547969"},{"key":"38_CR12","doi-asserted-by":"crossref","unstructured":"Liu, D., Qu, X., Liu, X.Y., Dong, J., et\u00a0al.: Jointly cross- and self-modal graph attention network for query-based moment localization. In: Proceedings of the 28th ACM International Conference on Multimedia (ACM MM), pp. 4070\u20134078 (2020)","DOI":"10.1145\/3394171.3414026"},{"key":"38_CR13","doi-asserted-by":"crossref","unstructured":"Liu, D., Zhou, P.: Jointly visual-and semantic-aware graph memory networks for temporal sentence localization in videos. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096382"},{"key":"38_CR14","doi-asserted-by":"crossref","unstructured":"Luo, D., Huang, J., Gong, S., Jin, H., Liu, Y.: Towards generalisable video moment retrieval: visual-dynamic injection to image-text pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23045\u201323055 (2023)","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"38_CR15","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.P.: Query-dependent video representation for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23023\u201323033 (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"38_CR16","doi-asserted-by":"crossref","unstructured":"Mun, J., Cho, M., Han, B.: Local-global video-text interactions for temporal grounding. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10810\u201310819 (2020)","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"38_CR17","doi-asserted-by":"crossref","unstructured":"Nan, G., Qiao, R., Xiao, Y., Liu, J., Leng, S., et\u00a0al.: Interventional video grounding with dual contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2765\u20132775 (2021)","DOI":"10.1109\/CVPR46437.2021.00279"},{"key":"38_CR18","doi-asserted-by":"crossref","unstructured":"Nguyen, T., Wu, X., Dong, X., Nguyen, C.D., Ng, S.K., Luu, A.: Demaformer: damped exponential moving average transformer with energy-based modeling for temporal language grounding. In: Findings of the Association for Computational Linguistics: EMNLP 2023, pp. 3635\u20133649 (2023)","DOI":"10.18653\/v1\/2023.findings-emnlp.235"},{"key":"38_CR19","doi-asserted-by":"publisher","first-page":"2538","DOI":"10.1109\/TIP.2021.3052086","volume":"30","author":"K Ning","year":"2021","unstructured":"Ning, K., Xie, L., Liu, J., Wu, F., Tian, Q.: Interaction-integrated network for natural language moment localization. IEEE Trans. Image Process. 30, 2538\u20132548 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"38_CR20","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"38_CR21","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Regneri, M., Andriluka, M., Amin, S., Pinkal, M., Schiele, B.: Script data for attribute-based recognition of composite activities. In: European Conference on Computer Vision (ECCV), pp. 144\u2013157. Springer (2012)","DOI":"10.1007\/978-3-642-33718-5_11"},{"key":"38_CR22","doi-asserted-by":"crossref","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: European Conference on Computer Vision (ECCV), pp. 510\u2013526. Springer (2016)","DOI":"10.1007\/978-3-319-46448-0_31"},{"key":"38_CR23","doi-asserted-by":"crossref","unstructured":"Sun, X., Wang, X., Gao, J., Liu, Q., Zhou, X.: You need to read again: multi-granularity perception network for moment retrieval in videos. In: Proceedings of the 45th International ACM SIGIR Conference on Research and Development in Information Retrieval (ACM SIGIR) (2022)","DOI":"10.1145\/3477495.3532083"},{"key":"38_CR24","doi-asserted-by":"publisher","first-page":"1338","DOI":"10.1109\/TMM.2021.3063631","volume":"24","author":"H Tang","year":"2021","unstructured":"Tang, H., Zhu, J., Liu, M., Gao, Z., Cheng, Z.: Frame-wise cross-modal matching for video moment retrieval. IEEE Trans. Multimedia 24, 1338\u20131349 (2021)","journal-title":"IEEE Trans. Multimedia"},{"key":"38_CR25","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE\/CVF international conference on computer vision (ICCV), pp. 4489\u20134497 (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"38_CR26","doi-asserted-by":"publisher","first-page":"1221","DOI":"10.1109\/TMM.2022.3142420","volume":"24","author":"G Wang","year":"2022","unstructured":"Wang, G., Xu, X., Shen, F., Lu, H., Ji, Y., Shen, H.T.: Cross-modal dynamic networks for video moment retrieval with text query. IEEE Trans. Multimedia 24, 1221\u20131232 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"38_CR27","doi-asserted-by":"crossref","unstructured":"Wang, H., Zha, Z.J., Chen, X., Xiong, Z., Luo, J.: Dual path interaction network for video moment localization. In: Proceedings of the 28th ACM International Conference on Multimedia (ACM MM), pp. 4116\u20134124 (2020)","DOI":"10.1145\/3394171.3413975"},{"key":"38_CR28","doi-asserted-by":"crossref","unstructured":"Wu, Z., Gao, J., Huang, S., Xu, C.: Diving into the relations: Leveraging semantic and visual structures for video moment retrieval. In: 2021 IEEE International Conference on Multimedia and Expo (ICME), pp.\u00a01\u20136. IEEE (2021)","DOI":"10.1109\/ICME51207.2021.9428369"},{"key":"38_CR29","doi-asserted-by":"crossref","unstructured":"Xiao, S., Chen, L., Shao, J., Zhuang, Y., Xiao, J.: Natural language video localization with learnable moment proposals. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP) (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.327"},{"key":"38_CR30","doi-asserted-by":"crossref","unstructured":"Xiao, S., Chen, L., Zhang, S., Ji, W., Shao, J., et\u00a0al.: Boundary proposal network for two-stage natural language video localization. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), pp. 2986\u20132994. AAAI Press (2021)","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"38_CR31","doi-asserted-by":"crossref","unstructured":"Xiong, Z., Liu, D., Zhou, P., Zhu, J.: Tracking objects and activities with attention for temporal sentence grounding. In: IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp.\u00a01\u20135. IEEE (2023)","DOI":"10.1109\/ICASSP49357.2023.10096206"},{"key":"38_CR32","doi-asserted-by":"crossref","unstructured":"Xu, H., Das, A., Saenko, K.: R-c3d: region convolutional 3d network for temporal activity detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5783\u20135792 (2017)","DOI":"10.1109\/ICCV.2017.617"},{"key":"38_CR33","doi-asserted-by":"crossref","unstructured":"Xu, H., He, K., Plummer, B.A., Sigal, L., Sclaroff, S., Saenko, K.: Multilevel language and vision integration for text-to-clip retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol.\u00a033, pp. 9062\u20139069 (2019)","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"38_CR34","doi-asserted-by":"crossref","unstructured":"Xu, M., P\u00e9rez-R\u00faa, J.M., Escorcia, V., Martinez, B., et\u00a0al.: Boundary-sensitive pre-training for temporal localization in videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7220\u20137230 (2021)","DOI":"10.1109\/ICCV48922.2021.00713"},{"key":"38_CR35","doi-asserted-by":"crossref","unstructured":"Xu, Z., Wei, K., Yang, X., Deng, C.: Point-supervised video temporal grounding. IEEE Trans. Multimedia (2022)","DOI":"10.1109\/TMM.2022.3205404"},{"key":"38_CR36","doi-asserted-by":"crossref","unstructured":"Yang, S., Shang, Z., Wu, X.: Probability distribution based frame-supervised language-driven action localization. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 5164\u20135173 (2023)","DOI":"10.1145\/3581783.3612512"},{"key":"38_CR37","doi-asserted-by":"crossref","unstructured":"Yang, S., Wu, X.: Entity-aware and motion-aware transformers for language-driven action localization. In: Raedt, L.D. (ed.) Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence (IJCAI), pp. 1552\u20131558. ijcai.org (2022)","DOI":"10.24963\/ijcai.2022\/216"},{"key":"38_CR38","doi-asserted-by":"crossref","unstructured":"Yang, S., Wu, X., Shang, Z., Luo, J.: Dynamic pathway for query-aware feature learning in language-driven action localization. IEEE Trans. Multimedia (2024)","DOI":"10.1109\/TMM.2024.3368919"},{"key":"38_CR39","doi-asserted-by":"publisher","first-page":"1204","DOI":"10.1109\/TIP.2022.3140611","volume":"31","author":"X Yang","year":"2022","unstructured":"Yang, X., Wang, S., Dong, J., Dong, J., Wang, M., Chua, T.S.: Video moment retrieval with cross-modal neural architecture search. IEEE Trans. Image Process. 31, 1204\u20131216 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"38_CR40","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Ma, L., Wang, J., Liu, W., Zhu, W.: Semantic conditioned dynamic modulation for temporal sentence grounding in videos. Adv. Neural Inf. Process. Syst. (NeurIPS) 32 (2019)","DOI":"10.1109\/TPAMI.2020.3038993"},{"key":"38_CR41","doi-asserted-by":"crossref","unstructured":"Zeng, Y., Cao, D., Wei, X., Liu, M., et\u00a0al.: Multi-modal relational graph for cross-modal video moment retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2215\u20132224 (2021)","DOI":"10.1109\/CVPR46437.2021.00225"},{"key":"38_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhen, L., Zhou, J.T., Goh, R.S.M.: Natural language video localization: a revisit in span-based question answering framework. IEEE Trans. Pattern Anal. Mach. Intell. (2021)","DOI":"10.1109\/TPAMI.2021.3060449"},{"key":"38_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Span-based localizing network for natural language video localization. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), pp. 6543\u20136554 (2020)","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"38_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, L., Radke, R.J.: Natural language video moment localization through query-controlled temporal convolution. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 682\u2013690 (2022)","DOI":"10.1109\/WACV51458.2022.00258"},{"key":"38_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2d temporal adjacent networks for moment localization with natural language. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol.\u00a034, pp. 12870\u201312877 (2020)","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"38_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Chen, X., Jia, J., Liu, S., Ding, K.: Text-visual prompting for efficient 2d temporal video grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14794\u201314804 (2023)","DOI":"10.1109\/CVPR52729.2023.01421"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-8620-6_38","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,14]],"date-time":"2025-01-14T20:18:21Z","timestamp":1736885901000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-8620-6_38"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,20]]},"ISBN":["9789819786190","9789819786206"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-8620-6_38","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,20]]},"assertion":[{"value":"20 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2024.prcv.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}