{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,9]],"date-time":"2025-09-09T22:01:27Z","timestamp":1757455287651,"version":"3.41.0"},"publisher-location":"Cham","reference-count":49,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031925900","type":"print"},{"value":"9783031925917","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-92591-7_15","type":"book-chapter","created":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T07:25:22Z","timestamp":1747985122000},"page":"236-252","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Video Editing for\u00a0Video Retrieval"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9213-2611","authenticated-orcid":false,"given":"Bin","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Kevin","family":"Flanagan","sequence":"additional","affiliation":[]},{"given":"Adriano","family":"Fragomeni","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5918-9029","authenticated-orcid":false,"given":"Michael","family":"Wray","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8804-6238","authenticated-orcid":false,"given":"Dima","family":"Damen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Alwassel, H., Heilbron, F.C., Escorcia, V., Ghanem, B.: Diagnosing error in temporal action detectors. In: Proceedings of the European Conference on Computer Vision, pp. 256\u2013272 (2018)","DOI":"10.1007\/978-3-030-01219-9_16"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Anne\u00a0Hendricks, L., Wang, O., Shechtman, E., Sivic, J., Darrell, T., Russell, B.: Localizing moments in video with natural language. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5803\u20135812 (2017)","DOI":"10.1109\/ICCV.2017.618"},{"key":"15_CR4","unstructured":"Ashutosh, K., Girdhar, R., Torresani, L., Grauman, K.: What you say is what you show: Visual narration detection in instructional videos. arXiv preprint arXiv:2301.02307 (2023)"},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Bucilu\u01ce, C., Caruana, R., Niculescu-Mizil, A.: Model compression. In: Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 535\u2013541 (2006)","DOI":"10.1145\/1150402.1150464"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., Crandall, D., Bansal, M., Bertasius, G.: VindLU: a recipe for effective video-and-language pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10739\u201310750 (2023)","DOI":"10.1109\/CVPR52729.2023.01034"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Cui, R., et al.: Video moment retrieval from text queries via single frame annotation. In: Proceedings of the International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1033\u20131043 (2022)","DOI":"10.1145\/3477495.3532078"},{"key":"15_CR9","unstructured":"Flanagan, K., Damen, D., Wray, M.: Learning temporal sentence grounding from narrated egovideos. arXiv preprint arXiv:2310.17395 (2023)"},{"key":"15_CR10","doi-asserted-by":"crossref","unstructured":"Fragomeni, A., Wray, M., Damen, D.: ConTra:(Con)text (Tra)nsformer for cross-modal video retrieval. In: Proceedings of the Asian Conference on Computer Vision, pp. 3481\u20133499 (2022)","DOI":"10.1007\/978-3-031-26316-3_27"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Sun, C., Alahari, K., Schmid, C.: Multi-modal transformer for video retrieval. In: Proceedings of the European Conference on Computer Vision, pp. 214\u2013229 (2020)","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"Gao, J., Sun, C., Yang, Z., Nevatia, R.: TALL: temporal activity localization via language query. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 5267\u20135275 (2017)","DOI":"10.1109\/ICCV.2017.563"},{"key":"15_CR13","unstructured":"Ging, S., Zolfaghari, M., Pirsiavash, H., Brox, T.: COOT: cooperative hierarchical transformer for video-text representation learning. In: Advances in Neural Information Processing Systems, pp. 22605\u201322618 (2020)"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Gorti, S.K., et al.: X-Pool: cross-modal language-video attention for text-video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5006\u20135015 (2022)","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"15_CR15","unstructured":"Grill, J.B., et\u00a0al.: Bootstrap your own latent-a new approach to self-supervised learning. In: Advances in Neural Information Processing Systems, pp. 21271\u201321284 (2020)"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Han, T., Xie, W., Zisserman, A.: Temporal alignment networks for long-term video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2906\u20132916 (2022)","DOI":"10.1109\/CVPR52688.2022.00292"},{"key":"15_CR17","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Hu, F., Chen, A., Wang, Z., Zhou, F., Dong, J., Li, X.: Lightweight attentional feature fusion: a new baseline for text-to-video retrieval. In: Proceedings of the European Conference on Computer Vision, pp. 444\u2013461 (2022)","DOI":"10.1007\/978-3-031-19781-9_26"},{"key":"15_CR19","unstructured":"Huang, Z., et al.: Learning with noisy correspondence for cross-modal matching. In: Advances in Neural Information Processing Systems, pp. 29406\u201329419 (2021)"},{"key":"15_CR20","unstructured":"Jiang, L., Zhou, Z., Leung, T., Li, L.J., Fei-Fei, L.: MentorNet: learning data-driven curriculum for very deep neural networks on corrupted labels. In: International Conference on Machine Learning, pp. 2304\u20132313 (2018)"},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Carlos\u00a0Niebles, J.: Dense-captioning events in videos. In: Proceedings of the International Conference on Computer Vision, pp. 706\u2013715 (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Lee, P., Byun, H.: Learning action completeness from points for weakly-supervised temporal action localization. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13648\u201313657 (2021)","DOI":"10.1109\/ICCV48922.2021.01339"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Lei, J., Li, L., Zhou, L., Gan, Z., Berg, T.L., Bansal, M., Liu, J.: Less is more: ClipBERT for video-and-language learning via sparse sampling. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7331\u20137341 (2021)","DOI":"10.1109\/CVPR46437.2021.00725"},{"key":"15_CR24","unstructured":"Lin, K.Q., et\u00a0al.: Egocentric video-language pretraining. In: Advances in Neural Information Processing Systems, pp. 7575\u20137586 (2022)"},{"key":"15_CR25","unstructured":"Liu, Y., Albanie, S., Nagrani, A., Zisserman, A.: Use what you have: video retrieval using representations from collaborative experts. In: Proceedings of the British Machine Vision Conference, pp. 279\u2013295 (2019)"},{"key":"15_CR26","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: CLIP4Clip: an empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Ma, F., et al.: SF-Net: single-frame supervision for temporal action localization. In: Proceedings of the European Conference on Computer Vision, pp. 420\u2013437 (2020)","DOI":"10.1007\/978-3-030-58548-8_25"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"15_CR30","doi-asserted-by":"crossref","unstructured":"Mithun, N.C., Paul, S., Roy-Chowdhury, A.K.: Weakly supervised video moment retrieval from text queries. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11592\u201311601 (2019)","DOI":"10.1109\/CVPR.2019.01186"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Moltisanti, D., Wray, M., Mayol-Cuevas, W., Damen, D.: Trespassing the boundaries: labeling temporal bounds for object interactions in egocentric video. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2886\u20132894 (2017)","DOI":"10.1109\/ICCV.2017.314"},{"key":"15_CR32","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"15_CR33","doi-asserted-by":"crossref","unstructured":"Pramanick, S., et al.: EgoVLPv2: egocentric video-language pre-training with fusion in the backbone. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5285\u20135297 (2023)","DOI":"10.1109\/ICCV51070.2023.00487"},{"key":"15_CR34","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"15_CR35","unstructured":"Reed, S., Lee, H., Anguelov, D., Szegedy, C., Erhan, D., Rabinovich, A.: Training deep neural networks on noisy labels with bootstrapping. arXiv preprint arXiv:1412.6596 (2014)"},{"key":"15_CR36","unstructured":"Ren, M., Zeng, W., Yang, B., Urtasun, R.: Learning to reweight examples for robust deep learning. In: International Conference on Machine Learning, pp. 4334\u20134343 (2018)"},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Shvetsova, N., et al.: Everything at once-multi-modal fusion transformer for video retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 20020\u201320029 (2022)","DOI":"10.1109\/CVPR52688.2022.01939"},{"key":"15_CR38","unstructured":"Song, H., Kim, M., Park, D., Shin, Y., Lee, J.G.: Learning from noisy labels with deep neural networks: a survey. IEEE Transactions on Neural Networks and Learning Systems, pp. 1\u201319 (2022)"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Tanaka, D., Ikami, D., Yamasaki, T., Aizawa, K.: Joint optimization framework for learning with noisy labels. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5552\u20135560 (2018)","DOI":"10.1109\/CVPR.2018.00582"},{"key":"15_CR40","doi-asserted-by":"crossref","unstructured":"Wray, M., Larlus, D., Csurka, G., Damen, D.: Fine-grained action retrieval through multiple parts-of-speech embeddings. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 450\u2013459 (2019)","DOI":"10.1109\/ICCV.2019.00054"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Wu, W., Luo, H., Fang, B., Wang, J., Ouyang, W.: Cap4Video: what can auxiliary captions do for text-video retrieval? In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10704\u201310713 (2023)","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"15_CR42","unstructured":"Xiao, T., Xia, T., Yang, Y., Huang, C., Wang, X.: Learning from massive noisy labeled data for image classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2691\u20132699 (2015)"},{"key":"15_CR43","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: VideoCLIP: contrastive pre-training for zero-shot video-text understanding. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing, pp. 6787\u20136800 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"15_CR44","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"15_CR45","unstructured":"Xue, H., et al.: CLIP-ViP: adapting pre-trained image-text model to video-language alignment. In: International Conference on Learning Representations (2023)"},{"key":"15_CR46","doi-asserted-by":"crossref","unstructured":"Yi, K., Wu, J.: Probabilistic end-to-end noise correction for learning with noisy labels. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7017\u20137025 (2019)","DOI":"10.1109\/CVPR.2019.00718"},{"key":"15_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, B., Hu, H., Sha, F.: Cross-modal and hierarchical modeling of video and text. In: Proceedings of the European Conference on Computer Vision, pp. 374\u2013390 (2018)","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Misra, I., Kr\u00e4henb\u00fchl, P., Girdhar, R.: Learning video representations from large language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6586\u20136597 (2023)","DOI":"10.1109\/CVPR52729.2023.00637"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.J.: Towards automatic learning of procedures from web instructional videos. In: AAAI Conference on Artificial Intelligence, pp. 7590\u20137598 (2018)","DOI":"10.1609\/aaai.v32i1.12342"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-92591-7_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,23]],"date-time":"2025-05-23T07:25:54Z","timestamp":1747985154000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-92591-7_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031925900","9783031925917"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-92591-7_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}