{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:26:41Z","timestamp":1777656401725,"version":"3.51.4"},"publisher-location":"Cham","reference-count":67,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729515","type":"print"},{"value":"9783031729522","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72952-2_17","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T05:02:02Z","timestamp":1727672522000},"page":"286-304","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":12,"title":["UniMD: Towards Unifying Moment Retrieval and\u00a0Temporal Action Detection"],"prefix":"10.1007","author":[{"given":"Yingsen","family":"Zeng","sequence":"first","affiliation":[]},{"given":"Yujie","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Chengjian","family":"Feng","sequence":"additional","affiliation":[]},{"given":"Lin","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Alwassel, H., Giancola, S., Ghanem, B.: TSP: temporally-sensitive pretraining of video encoders for localization tasks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3173\u20133183 (2021)","DOI":"10.1109\/ICCVW54120.2021.00356"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Bodla, N., Singh, B., Chellappa, R., Davis, L.S.: Soft-NMS\u2013improving object detection with one line of code. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5561\u20135569 (2017)","DOI":"10.1109\/ICCV.2017.593"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Caba\u00a0Heilbron, F., Escorcia, V., Ghanem, B., Carlos\u00a0Niebles, J.: ActivityNet: a large-scale video benchmark for human activity understanding. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 961\u2013970 (2015)","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6299\u20136308 (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"17_CR5","unstructured":"Chen, G., et al.: InternVideo-Ego4D: a pack of champion solutions to Ego4D challenges. arXiv preprint arXiv:2211.09529 (2022)"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Chen, L., et al.: Rethinking the bottom-up framework for query-based video localization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 10551\u201310558 (2020)","DOI":"10.1609\/aaai.v34i07.6627"},{"key":"17_CR7","unstructured":"Dai, R., Das, S., Bremond, F.: CTRN: class-temporal relational network for action detection. arXiv preprint arXiv:2110.13473 (2021)"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Dai, R., Das, S., Kahatapitiya, K., Ryoo, M.S., Br\u00e9mond, F.: MS-TCT: multi-scale temporal convtransformer for action detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20041\u201320051 (2022)","DOI":"10.1109\/CVPR52688.2022.01941"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Dai, R., Das, S., Minciullo, L., Garattoni, L., Francesca, G., Bremond, F.: PDAN: pyramid dilated attention network for action detection. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2970\u20132979 (2021)","DOI":"10.1109\/WACV48630.2021.00301"},{"key":"17_CR10","unstructured":"Dosovitskiy, A., et al.: An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C.: X3D: expanding architectures for efficient video recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 203\u2013213 (2020)","DOI":"10.1109\/CVPR42600.2020.00028"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Feichtenhofer, C., Fan, H., Malik, J., He, K.: SlowFast networks for video recognition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6202\u20136211 (2019)","DOI":"10.1109\/ICCV.2019.00630"},{"key":"17_CR13","doi-asserted-by":"publisher","first-page":"701","DOI":"10.1007\/978-3-031-20077-9_41","volume-title":"European Conference on Computer Vision","author":"C Feng","year":"2022","unstructured":"Feng, C., et al.: PromptDet: towards open-vocabulary detection using uncurated images. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 701\u2013717. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_41"},{"key":"17_CR14","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: TALL: temporal activity localization via language query. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5267\u20135275 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"17_CR15","unstructured":"Grauman, K., et al.: Ego4D: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"17_CR16","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"17_CR17","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/978-3-031-19833-5_7","volume-title":"European Conference on Computer Vision 2022","author":"C Ju","year":"2022","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13695, pp. 105\u2013124. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19833-5_7"},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Kahatapitiya, K., Ren, Z., Li, H., Wu, Z., Ryoo, M.S., Hua, G.: Weakly-guided self-supervised pretraining for temporal activity detection. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a037, pp. 1078\u20131086 (2023)","DOI":"10.1609\/aaai.v37i1.25189"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Kahatapitiya, K., Ryoo, M.S.: Coarse-fine networks for temporal activity detection in videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8385\u20138394 (2021)","DOI":"10.1109\/CVPR46437.2021.00828"},{"key":"17_CR20","unstructured":"Kay, W., et al.: The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. arXiv:2304.02643 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"17_CR23","unstructured":"Lei, J., Berg, T.L., Bansal, M.: Detecting moments and highlights in videos via natural language queries. In: Advances in Neural Information Processing Systems, vol. 34, pp. 11846\u201311858 (2021)"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Li, Y., Wang, C., Jia, J.: LLaMA-VID: an image is worth 2 tokens in large language models. arXiv preprint arXiv:2311.17043 (2023)","DOI":"10.1007\/978-3-031-72952-2_19"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Li, Z., Zhong, Y., Song, R., Li, T., Ma, L., Zhang, W.: DeTAL: open-vocabulary temporal action localization with decoupled networks. IEEE Trans. Pattern Anal. Mach. Intell. (2024)","DOI":"10.1109\/TPAMI.2024.3395778"},{"key":"17_CR26","doi-asserted-by":"crossref","unstructured":"Lin, C., et al.: Learning salient boundary feature for anchor-free temporal action localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3320\u20133329 (2021)","DOI":"10.1109\/CVPR46437.2021.00333"},{"key":"17_CR27","unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. In: Advances in Neural Information Processing Systems, vol. 35, pp. 7575\u20137586 (2022)"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Lin, T., Zhao, X., Shou, Z.: Single shot temporal action detection. In: Proceedings of the 25th ACM International Conference on Multimedia, pp. 988\u2013996 (2017)","DOI":"10.1145\/3123266.3123343"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"17_CR30","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1109\/TIP.2022.3195321","volume":"31","author":"X Liu","year":"2022","unstructured":"Liu, X., et al.: End-to-end temporal action detection with transformer. IEEE Trans. Image Process. 31, 5427\u20135441 (2022)","journal-title":"IEEE Trans. Image Process."},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Wu, Y., Chen, C.W., Shan, Y., Qie, X.: UMT: unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3042\u20133051 (2022)","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Liu, Z., Mao, H., Wu, C.Y., Feichtenhofer, C., Darrell, T., Xie, S.: A convnet for the 2020s. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"17_CR33","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1007\/978-3-030-58526-6_5","volume-title":"Computer Vision \u2013 ECCV 2020","author":"E Mavroudi","year":"2020","unstructured":"Mavroudi, E., Haro, B.B., Vidal, R.: Representation learning on visual-symbolic graphs for video understanding. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12374, pp. 71\u201390. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58526-6_5"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Moon, W., Hyun, S., Park, S., Park, D., Heo, J.P.: Query-dependent video representation for moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23023\u201323033 (2023)","DOI":"10.1109\/CVPR52729.2023.02205"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Mun, J., Cho, M., Han, B.: Local-global video-text interactions for temporal grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10810\u201310819 (2020)","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"17_CR36","doi-asserted-by":"publisher","first-page":"681","DOI":"10.1007\/978-3-031-20062-5_39","volume-title":"European Conference on Computer Vision 2022","author":"S Nag","year":"2022","unstructured":"Nag, S., Zhu, X., Song, Y.Z., Xiang, T.: Zero-shot temporal action detection via vision-language prompting. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13663, pp. 681\u2013697. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20062-5_39"},{"key":"17_CR37","unstructured":"Piergiovanni, A., Ryoo, M.: Temporal Gaussian mixture layer for videos. In: International Conference on Machine Learning, pp. 5152\u20135161. PMLR (2019)"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Qing, Z., et al.: Temporal context aggregation network for temporal action proposal refinement. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 485\u2013494 (2021)","DOI":"10.1109\/CVPR46437.2021.00055"},{"key":"17_CR39","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Sardari, F., Mustafa, A., Jackson, P.J., Hilton, A.: PAT: position-aware transformer for dense multi-label action detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2988\u20132997 (2023)","DOI":"10.1109\/ICCVW60793.2023.00321"},{"key":"17_CR41","doi-asserted-by":"crossref","unstructured":"Shi, D., Zhong, Y., Cao, Q., Ma, L., Li, J., Tao, D.: TriDet: temporal action detection with relative boundary modeling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18857\u201318866 (2023)","DOI":"10.1109\/CVPR52729.2023.01808"},{"key":"17_CR42","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016, Part I. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"17_CR43","doi-asserted-by":"crossref","unstructured":"Soldan, M., Xu, M., Qu, S., Tegner, J., Ghanem, B.: VLG-Net: video-language graph matching network for video grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3224\u20133234 (2021)","DOI":"10.1109\/ICCVW54120.2021.00361"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Tan, M., Pang, R., Le, Q.V.: EfficientDet: scalable and efficient object detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10781\u201310790 (2020)","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"17_CR45","doi-asserted-by":"crossref","unstructured":"Tian, Z., Shen, C., Chen, H., He, T.: FCOS: fully convolutional one-stage object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9627\u20139636 (2019)","DOI":"10.1109\/ICCV.2019.00972"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Tirupattur, P., Duarte, K., Rawat, Y.S., Shah, M.: Modeling multi-label action dependencies for temporal action localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1460\u20131470 (2021)","DOI":"10.1109\/CVPR46437.2021.00151"},{"key":"17_CR47","unstructured":"Tong, Z., Song, Y., Wang, J., Wang, L.: VideoMAE: masked autoencoders are data-efficient learners for self-supervised video pre-training. In: Advances in Neural Information Processing Systems, vol. 35, pp. 10078\u201310093 (2022)"},{"key":"17_CR48","unstructured":"Wang, M., Xing, J., Liu, Y.: ActionCLIP: a new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)"},{"key":"17_CR49","unstructured":"Wang, X., et al.: Proposal relation network for temporal action detection. arXiv preprint arXiv:2106.11812 (2021)"},{"key":"17_CR50","unstructured":"Wang, Y., et al.: InternVid: a large-scale video-text dataset for multimodal understanding and generation. arXiv preprint arXiv:2307.06942 (2023)"},{"key":"17_CR51","unstructured":"Wang, Y., et al.: InternVideo: general video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191 (2022)"},{"key":"17_CR52","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15325\u201315336 (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"17_CR53","doi-asserted-by":"crossref","unstructured":"Yan, S., et al.: UnLoc: a unified framework for video localization tasks. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13623\u201313633 (2023)","DOI":"10.1109\/ICCV51070.2023.01253"},{"key":"17_CR54","unstructured":"Yao, L., et al.: FILIP: fine-grained interactive language-image pre-training. arXiv preprint arXiv:2111.07783 (2021)"},{"key":"17_CR55","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Ma, L., Wang, J., Liu, W., Zhu, W.: Semantic conditioned dynamic modulation for temporal sentence grounding in videos. In: Advances in Neural Information Processing Systems, vol. 32 (2019)","DOI":"10.1109\/TPAMI.2020.3038993"},{"key":"17_CR56","doi-asserted-by":"crossref","unstructured":"Yuan, Y., Mei, T., Zhu, W.: To find where you talk: temporal sentence localization in video with attention based location regression. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a033, pp. 9159\u20139166 (2019)","DOI":"10.1609\/aaai.v33i01.33019159"},{"key":"17_CR57","doi-asserted-by":"crossref","unstructured":"Zeng, R., et al.: Graph convolutional networks for temporal action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7094\u20137103 (2019)","DOI":"10.1109\/ICCV.2019.00719"},{"key":"17_CR58","doi-asserted-by":"crossref","unstructured":"Zeng, R., Xu, H., Huang, W., Chen, P., Tan, M., Gan, C.: Dense regression network for video grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10287\u201310296 (2020)","DOI":"10.1109\/CVPR42600.2020.01030"},{"key":"17_CR59","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1007\/978-3-031-19772-7_29","volume-title":"European Conference on Computer Vision 2022","author":"CL Zhang","year":"2022","unstructured":"Zhang, C.L., Wu, J., Li, Y.: ActionFormer: localizing moments of actions with transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13664, pp. 492\u2013510. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19772-7_29"},{"key":"17_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, D., Dai, X., Wang, X., Wang, Y.F., Davis, L.S.: MAN: moment alignment network for natural language moment retrieval via iterative graph adjustment. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1247\u20131257 (2019)","DOI":"10.1109\/CVPR.2019.00134"},{"key":"17_CR61","doi-asserted-by":"crossref","unstructured":"Zhang, H., Sun, A., Jing, W., Zhou, J.T.: Span-based localizing network for natural language video localization. arXiv preprint arXiv:2004.13931 (2020)","DOI":"10.18653\/v1\/2020.acl-main.585"},{"key":"17_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, S., Peng, H., Fu, J., Luo, J.: Learning 2D temporal adjacent networks for moment localization with natural language. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 12870\u201312877 (2020)","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"17_CR63","doi-asserted-by":"crossref","unstructured":"Zhao, C., Thabet, A.K., Ghanem, B.: Video self-stitching graph network for temporal action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13658\u201313667 (2021)","DOI":"10.1109\/ICCV48922.2021.01340"},{"key":"17_CR64","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Xiong, Y., Wang, L., Wu, Z., Tang, X., Lin, D.: Temporal action detection with structured segment networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2914\u20132923 (2017)","DOI":"10.1109\/ICCV.2017.317"},{"key":"17_CR65","doi-asserted-by":"crossref","unstructured":"Zhong, Y., et al.: RegionCLIP: region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"17_CR66","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Tang, W., Wang, L., Zheng, N., Hua, G.: Enriching local and global contexts for temporal action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13516\u201313525 (2021)","DOI":"10.1109\/ICCV48922.2021.01326"},{"key":"17_CR67","doi-asserted-by":"crossref","unstructured":"Zou, X., et al.: Generalized decoding for pixel, image, and language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023)","DOI":"10.1109\/CVPR52729.2023.01451"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72952-2_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:39:10Z","timestamp":1732829950000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72952-2_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729515","9783031729522"],"references-count":67,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72952-2_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}