{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T21:19:43Z","timestamp":1778275183533,"version":"3.51.4"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729324","type":"print"},{"value":"9783031729331","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,3]],"date-time":"2024-10-03T00:00:00Z","timestamp":1727913600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,3]],"date-time":"2024-10-03T00:00:00Z","timestamp":1727913600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72933-1_20","type":"book-chapter","created":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T12:02:53Z","timestamp":1727870573000},"page":"347-363","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Efficient Pre-training for\u00a0Localized Instruction Generation of\u00a0Procedural Videos"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-1919-9712","authenticated-orcid":false,"given":"Anil","family":"Batra","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4265-8882","authenticated-orcid":false,"given":"Davide","family":"Moltisanti","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8276-0094","authenticated-orcid":false,"given":"Laura","family":"Sevilla-Lara","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5908-7751","authenticated-orcid":false,"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8242-4362","authenticated-orcid":false,"given":"Frank","family":"Keller","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,3]]},"reference":[{"key":"20_CR1","unstructured":"wikihow. https:\/\/www.wikiHow.com\/"},{"key":"20_CR2","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372. Association for Computational Linguistics, Ann Arbor, Michigan, June 2005. https:\/\/aclanthology.org\/W05-0909"},{"key":"20_CR3","unstructured":"Batra, A., Gowda, S.N., Keller, F., Sevilla-Lara, L.: A closer look at temporal ordering in the segmentation of instructional videos. In: British Machine Vision Conference (BMVC) (2022)"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Bie\u0144, M., Gilski, M., Maciejewska, M., Taisner, W., Wisniewski, D., Lawrynowicz, A.: RecipeNLG: a cooking recipes dataset for semi-structured text generation. In: Proceedings of the 13th International Conference on Natural Language Generation, pp. 22\u201328 (2020)","DOI":"10.18653\/v1\/2020.inlg-1.4"},{"key":"20_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"issue":"1","key":"20_CR6","doi-asserted-by":"publisher","first-page":"383","DOI":"10.1146\/annurev.an.16.100187.002123","volume":"16","author":"W Chafe","year":"1987","unstructured":"Chafe, W., Tannen, D.: The relation between written and spoken language. Annu. Rev. Anthropol. 16(1), 383\u2013407 (1987)","journal-title":"Annu. Rev. Anthropol."},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, F., Wang, X., Lei, J., Crandall, D., Bansal, M., Bertasius, G.: VindLU: a recipe for effective video-and-language pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10739\u201310750 (2023)","DOI":"10.1109\/CVPR52729.2023.01034"},{"issue":"3","key":"20_CR8","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1080\/10417947809372388","volume":"43","author":"L Einhorn","year":"1978","unstructured":"Einhorn, L.: Oral and written style: an examination of differences. Southern J. Commun. 43(3), 302\u2013311 (1978)","journal-title":"Southern J. Commun."},{"key":"20_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1007\/978-3-030-58539-6_31","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Fujita","year":"2020","unstructured":"Fujita, S., Hirao, T., Kamigaito, H., Okumura, M., Nagata, M.: SODA: story oriented dense video captioning evaluation framework. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 517\u2013531. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_31"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Jiang, B., Luo, R., Mao, J., Xiao, T., Jiang, Y.: Acquisition of localization confidence for accurate object detection. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 784\u2013799 (2018)","DOI":"10.1007\/978-3-030-01264-9_48"},{"key":"20_CR11","unstructured":"Koupaee, M., Wang, W.Y.: WikiHow: a large scale text summarization dataset. arXiv preprint arXiv:1810.09305 (2018)"},{"key":"20_CR12","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, J.C.: Dense-captioning events in videos. In: International Conference on Computer Vision (ICCV) (2017)","DOI":"10.1109\/ICCV.2017.83"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Li, F., Zeng, A., Liu, S., Zhang, H., Li, H., Zhang, L., Ni, L.M.: Lite DETR: an interleaved multi-scale encoder for efficient detr. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18558\u201318567 (2023)","DOI":"10.1109\/CVPR52729.2023.01780"},{"key":"20_CR14","unstructured":"Li, J., et al.: Gain: On the generalization of instructional action understanding. In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"20_CR16","doi-asserted-by":"crossref","unstructured":"Lin, X., Petroni, F., Bertasius, G., Rohrbach, M., Chang, S.F., Torresani, L.: Learning to recognize procedural activities with distant supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13853\u201313863 (2022)","DOI":"10.1109\/CVPR52688.2022.01348"},{"key":"20_CR17","doi-asserted-by":"publisher","first-page":"5427","DOI":"10.1109\/TIP.2022.3195321","volume":"31","author":"X Liu","year":"2022","unstructured":"Liu, X., et al.: End-to-end temporal action detection with transformer. IEEE Trans. Image Process. (TIP) 31, 5427\u20135441 (2022)","journal-title":"IEEE Trans. Image Process. (TIP)"},{"key":"20_CR18","unstructured":"Luo, H., et al.: UniVL: a unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9879\u20139889 (2020)","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: Howto100 m: Learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-bert: sentence embeddings using siamese bert-networks. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing. Association for Computational Linguistics, November 2019. http:\/\/arxiv.org\/abs\/1908.10084","DOI":"10.18653\/v1\/D19-1410"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Sener, F., Yao, A.: Zero-shot anticipation for instructional activities. In: The IEEE International Conference on Computer Vision (ICCV) (2019)","DOI":"10.1109\/ICCV.2019.00095"},{"key":"20_CR25","doi-asserted-by":"publisher","unstructured":"Shi, D., et al.: ReAct: temporal action detection with relational queries. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision - ECCV 2022, ECCV 2022, LNCS, vol. 13670. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20080-9_7","DOI":"10.1007\/978-3-031-20080-9_7"},{"key":"20_CR26","first-page":"16857","volume":"33","author":"K Song","year":"2020","unstructured":"Song, K., Tan, X., Qin, T., Lu, J., Liu, T.Y.: Mpnet: masked and permuted pre-training for language understanding. Adv. Neural. Inf. Process. Syst. 33, 16857\u201316867 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Song, X., Salcianu, A., Song, Y., Dopson, D., Zhou, D.: Fast wordpiece tokenization. arXiv preprint arXiv:2012.15524 (2020)","DOI":"10.18653\/v1\/2021.emnlp-main.160"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: Coin: a large-scale dataset for comprehensive instructional video analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1207\u20131216 (2019)","DOI":"10.1109\/CVPR.2019.00130"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Wang, T., Zhang, R., Lu, Z., Zheng, F., Cheng, R., Luo, P.: End-to-end dense video captioning with parallel decoding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 6847\u20136857 (2021)","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"20_CR31","unstructured":"Wolf, T., et\u00a0al.: Huggingface\u2019s transformers: state-of-the-art natural language processing. arXiv preprint arXiv:1910.03771 (2019)"},{"key":"20_CR32","unstructured":"Yang, A., Nagrani, A., Laptev, I., Sivic, J., Schmid, C.: Vidchapters-7m: Video chapters at scale. In: NeurIPS (2023)"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2seq: large-scale pretraining of a visual language model for dense video captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10714\u201310726 (2023)","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"20_CR34","unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: Merlot reserve: neural script knowledge through vision and language and sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16375\u201316387 (2022)","DOI":"10.1109\/CVPR52688.2022.01589"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, H., Wang, Y., Dayoub, F., Sunderhauf, N.: Varifocalnet: An IOU-aware dense object detector. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8514\u20138523 (2021)","DOI":"10.1109\/CVPR46437.2021.00841"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Zhou, H., Mart\u00edn-Mart\u00edn, R., Kapadia, M., Savarese, S., Niebles, J.C.: Procedure-aware pretraining for instructional video understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10727\u201310738 (2023)","DOI":"10.1109\/CVPR52729.2023.01033"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.J.: Towards automatic learning of procedures from web instructional videos. In: AAAI Conference on Artificial Intelligence (2018). https:\/\/www.aaai.org\/ocs\/index.php\/AAAI\/AAAI18\/paper\/view\/17344","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"20_CR39","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable detr: deformable transformers for end-to-end object detection. In: International Conference on Learning Representations (2021)"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Zhukov, D., Alayrac, J.B., Cinbis, R.G., Fouhey, D., Laptev, I., Sivic, J.: Cross-task weakly supervised learning from instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3537\u20133545 (2019)","DOI":"10.1109\/CVPR.2019.00365"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72933-1_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,2]],"date-time":"2024-10-02T12:38:01Z","timestamp":1727872681000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72933-1_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,3]]},"ISBN":["9783031729324","9783031729331"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72933-1_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,3]]},"assertion":[{"value":"3 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}