{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T00:38:54Z","timestamp":1743035934613,"version":"3.40.3"},"publisher-location":"Cham","reference-count":58,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031736490"},{"type":"electronic","value":"9783031736506"}],"license":[{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,21]],"date-time":"2024-11-21T00:00:00Z","timestamp":1732147200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73650-6_8","type":"book-chapter","created":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T18:17:30Z","timestamp":1732126650000},"page":"123-140","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["COM Kitchens: An Unedited Overhead-View Video Dataset as\u00a0a\u00a0Vision-Language Benchmark"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0529-3152","authenticated-orcid":false,"given":"Koki","family":"Maeda","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4657-8214","authenticated-orcid":false,"given":"Tosho","family":"Hirasawa","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0799-4269","authenticated-orcid":false,"given":"Atsushi","family":"Hashimoto","sequence":"additional","affiliation":[]},{"given":"Jun","family":"Harashima","sequence":"additional","affiliation":[]},{"given":"Leszek","family":"Rybicki","sequence":"additional","affiliation":[]},{"given":"Yusuke","family":"Fukasawa","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9014-1389","authenticated-orcid":false,"given":"Yoshitaka","family":"Ushiku","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,21]]},"reference":[{"doi-asserted-by":"crossref","unstructured":"Ashutosh, K., Girdhar, R., Torresani, L., Grauman, K.: HierVL: learning hierarchical video-language embeddings. In: CVPR, pp. 23066\u201323078 (2023)","key":"8_CR1","DOI":"10.1109\/CVPR52729.2023.02209"},{"unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop IEEMMTS, pp. 65\u201372 (2005)","key":"8_CR2"},{"doi-asserted-by":"crossref","unstructured":"Ben-Shabat, Y., et al.: The IKEA ASM dataset: Understanding people assembling furniture through actions, objects and pose. In: WACV, pp. 847\u2013859 (2021)","key":"8_CR3","DOI":"10.1109\/WACV48630.2021.00089"},{"doi-asserted-by":"crossref","unstructured":"Chandu, K., Nyberg, E., Black, A.W.: Storyboarding of recipes: grounded contextual generation. In: Annual Meeting of the Association for Computational Linguistics, pp. 6040\u20136046 (2019)","key":"8_CR4","DOI":"10.18653\/v1\/P19-1606"},{"doi-asserted-by":"crossref","unstructured":"Chen, S., Zhao, Y., Jin, Q., Wu, Q.: Fine-grained video-text retrieval with hierarchical graph reasoning. 
In: CVPR (2020)","key":"8_CR5","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"8_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1007\/978-3-030-01225-0_44","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Damen","year":"2018","unstructured":"Damen, D., et al.: Scaling egocentric vision: the dataset. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11208, pp. 753\u2013771. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01225-0_44"},{"key":"8_CR7","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/s11263-021-01531-2","volume":"130","author":"D Damen","year":"2022","unstructured":"Damen, D., et al.: Rescaling egocentric vision: collection, pipeline and challenges for EPIC-KITCHENS-100. IJCV 130, 33\u201355 (2022)","journal-title":"IJCV"},{"doi-asserted-by":"crossref","unstructured":"Deng, C., Chen, S., Chen, D., He, Y., Wu, Q.: Sketch, ground, and refine: top-down dense video captioning. In: CVPR, pp. 234\u2013243 (2021)","key":"8_CR8","DOI":"10.1109\/CVPR46437.2021.00030"},{"doi-asserted-by":"crossref","unstructured":"Dvornik, N., Hadji, I., Zhang, R., Derpanis, K.G., Wildes, R.P., Jepson, A.D.: StepFormer: self-supervised step discovery and localization in instructional videos. In: CVPR, pp. 18952\u201318961 (2023)","key":"8_CR9","DOI":"10.1109\/CVPR52729.2023.01817"},{"doi-asserted-by":"crossref","unstructured":"Elhamifar, E., Naing, Z.: Unsupervised procedure learning via joint dynamic summarization. In: ICCV (2019)","key":"8_CR10","DOI":"10.1109\/ICCV.2019.00644"},{"key":"8_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"314","DOI":"10.1007\/978-3-642-33718-5_23","volume-title":"Computer Vision \u2013 ECCV 2012","author":"A Fathi","year":"2012","unstructured":"Fathi, A., Li, Y., Rehg, J.M.: Learning to recognize daily actions using gaze. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012. LNCS, vol. 7572, pp. 314\u2013327. Springer, Heidelberg (2012). https:\/\/doi.org\/10.1007\/978-3-642-33718-5_23"},{"key":"8_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"517","DOI":"10.1007\/978-3-030-58539-6_31","volume-title":"Computer Vision \u2013 ECCV 2020","author":"S Fujita","year":"2020","unstructured":"Fujita, S., Hirao, T., Kamigaito, H., Okumura, M., Nagata, M.: SODA: story oriented dense video captioning evaluation framework. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12351, pp. 517\u2013531. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58539-6_31"},{"doi-asserted-by":"crossref","unstructured":"Garg, S., Peitz, S., Nallasamy, U., Paulik, M.: Jointly learning to align and translate with transformer models. In: Conference on Empirical Methods in Natural Language Processing and International Joint Conference on Natural Language Processing, pp. 4453\u20134462. Hong Kong, China (2019)","key":"8_CR13","DOI":"10.18653\/v1\/D19-1453"},{"unstructured":"Grauman, K., et al.: Ego4D: around the world in 3,000 hours of egocentric video. In: CVPR, pp. 
18995\u201319012 (2022)","key":"8_CR14"},{"unstructured":"Grauman, K., et al.: Ego-Exo4D: understanding skilled human activity from first- and third-person perspectives (2023)","key":"8_CR15"},{"unstructured":"Harashima, J., Ariga, M., Murata, K., Ioki, M.: A large-scale recipe and meal data collection as infrastructure for food research. In: International Conference on Language Resources and Evaluation, pp. 2455\u20132459 (2016)","key":"8_CR16"},{"doi-asserted-by":"crossref","unstructured":"Huang, D.A., Lim, J.J., Fei-Fei, L., Niebles, J.C.: Unsupervised visual-linguistic reference resolution in instructional videos. In: CVPR, pp. 2183\u20132192 (2017)","key":"8_CR17","DOI":"10.1109\/CVPR.2017.116"},{"doi-asserted-by":"crossref","unstructured":"Jermsurawong, J., Habash, N.: Predicting the structure of cooking recipes. In: Conference on Empirical Methods in Natural Language Processing, pp. 781\u2013786 (2015)","key":"8_CR18","DOI":"10.18653\/v1\/D15-1090"},{"doi-asserted-by":"crossref","unstructured":"Ji, W., et al.: VidVRD 2021: The third grand challenge on video relation detection. In: ACM MM, pp. 4779\u20134783 (2021)","key":"8_CR19","DOI":"10.1145\/3474085.3479232"},{"doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: CVPR, pp. 4565\u20134574 (2016)","key":"8_CR20","DOI":"10.1109\/CVPR.2016.494"},{"doi-asserted-by":"crossref","unstructured":"Kiddon, C., Ponnuraj, G.T., Zettlemoyer, L., Choi, Y.: Mise en place: unsupervised interpretation of instructional recipes. In: Conference on Empirical Methods in Natural Language Processing, pp. 982\u2013992 (2015)","key":"8_CR21","DOI":"10.18653\/v1\/D15-1114"},{"doi-asserted-by":"crossref","unstructured":"Kuehne, H., Arslan, A., Serre, T.: The language of actions: recovering the syntax and semantics of goal-directed human activities. In: CVPR, pp. 780\u2013787 (2014)","key":"8_CR22","DOI":"10.1109\/CVPR.2014.105"},{"doi-asserted-by":"crossref","unstructured":"Lei, J., Wang, L., Shen, Y., Yu, D., Berg, T., Bansal, M.: MART: memory-augmented recurrent transformer for coherent video paragraph captioning. In: Annual Meeting of the Association for Computational Linguistics, pp. 2603\u20132614 (2020)","key":"8_CR23","DOI":"10.18653\/v1\/2020.acl-main.233"},{"unstructured":"Lin, K.Q., et al.: Egocentric video-language pretraining. In: NeurIPS (2022)","key":"8_CR24"},{"unstructured":"Luo, H., et al.: UniVL: a unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353 (2020)","key":"8_CR25"},{"key":"8_CR26","doi-asserted-by":"publisher","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","volume":"508","author":"H Luo","year":"2022","unstructured":"Luo, H., et al.: CLIP4Clip: an empirical study of CLIP for end to end video clip retrieval. Neurocomputing 508, 293\u2013304 (2022)","journal-title":"Neurocomputing"},{"doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., Ji, R.: X-CLIP: end-to-end multi-grained contrastive learning for video-text retrieval. In: ACM MM, pp. 638\u2013647 (2022)","key":"8_CR27","DOI":"10.1145\/3503161.3547910"},{"doi-asserted-by":"crossref","unstructured":"Ma, Y., Hiraoka, T., Okazaki, N.: Joint entity and relation extraction based on table labeling using convolutional neural networks. In: Workshop on Structured Prediction for NLP, pp. 
11\u201321 (2022)","key":"8_CR28","DOI":"10.18653\/v1\/2022.spnlp-1.2"},{"doi-asserted-by":"crossref","unstructured":"Miech, A., Alayrac, J.B., Smaira, L., Laptev, I., Sivic, J., Zisserman, A.: End-to-end learning of visual representations from uncurated instructional videos. In: CVPR (2020)","key":"8_CR29","DOI":"10.1109\/CVPR42600.2020.00990"},{"doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: ICCV (2019)","key":"8_CR30","DOI":"10.1109\/ICCV.2019.00272"},{"doi-asserted-by":"crossref","unstructured":"Momouchi, Y.: Control structures for actions in procedural texts and PT-chart. In: International Conference on Computational Linguistics, pp. 108\u2013114 (1980)","key":"8_CR31","DOI":"10.3115\/990174.990192"},{"unstructured":"Mori, S., Maeta, H., Yamakata, Y., Sasada, T.: Flow graph corpus from recipe texts. In: International Conference on Language Resources and Evaluation, pp. 2370\u20132377 (2014)","key":"8_CR32"},{"doi-asserted-by":"crossref","unstructured":"Nakamura, K., Ohashi, H., Okada, M.: Sensor-augmented egocentric-video captioning with dynamic modal attention. In: ACM MM, pp. 4220\u20134229 (2021)","key":"8_CR33","DOI":"10.1145\/3474085.3475557"},{"doi-asserted-by":"crossref","unstructured":"Nishimura, T., Hashimoto, A., Mori, S.: Procedural text generation from a photo sequence. In: International Natural Language Generation Conference, pp. 409\u2013414 (2019)","key":"8_CR34","DOI":"10.18653\/v1\/W19-8650"},{"doi-asserted-by":"crossref","unstructured":"Nishimura, T., Hashimoto, A., Ushiku, Y., Kameko, H., Mori, S.: State-aware video procedural captioning. In: ACM MM, pp. 1766\u20131774 (2021)","key":"8_CR35","DOI":"10.1145\/3474085.3475322"},{"key":"8_CR36","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/ACCESS.2020.3043452","volume":"9","author":"T Nishimura","year":"2020","unstructured":"Nishimura, T., Hashimoto, A., Ushiku, Y., Kameko, H., Yamakata, Y., Mori, S.: Structure-aware procedural text generation from an image sequence. IEEE Access 9, 2125\u20132141 (2020)","journal-title":"IEEE Access"},{"unstructured":"Nishimura, T., et al.: Egocentric biochemical video-and-language dataset. In: ICCV Workshop, pp. 3129\u20133133 (2021)","key":"8_CR37"},{"doi-asserted-by":"crossref","unstructured":"Nishimura, T., et al.: BioVL2 dataset: egocentric biochemical video-and-language dataset. J. Nat. Lang. Process. (2022)","key":"8_CR38","DOI":"10.5715\/jnlp.29.1106"},{"unstructured":"Schiele, B.: A database for fine grained activity detection of cooking activities. In: CVPR, pp. 1194\u20131201 (2012)","key":"8_CR39"},{"doi-asserted-by":"crossref","unstructured":"Sener, F., et al.: Assembly101: a large-scale multi-view video dataset for understanding procedural activities. In: CVPR, pp. 21064\u201321074 (2022)","key":"8_CR40","DOI":"10.1109\/CVPR52688.2022.02042"},{"doi-asserted-by":"crossref","unstructured":"Shi, B., et al.: Dense procedure captioning in narrated instructional videos. In: Annual Meeting of the Association for Computational Linguistics, pp. 6382\u20136391 (2019)","key":"8_CR41","DOI":"10.18653\/v1\/P19-1641"},{"doi-asserted-by":"crossref","unstructured":"Shi, B., Ji, L., Niu, Z., Duan, N., Zhou, M., Chen, X.: Learning semantic concepts and temporal alignment for narrated video procedural captioning. In: ACM MM, pp. 
4355\u20134363 (2020)","key":"8_CR42","DOI":"10.1145\/3394171.3413498"},{"key":"8_CR43","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1007\/978-3-642-40303-3_18","volume-title":"Advances in Depth Image Analysis and Applications","author":"A Shimada","year":"2013","unstructured":"Shimada, A., Kondo, K., Deguchi, D., Morin, G., Stern, H.: Kitchen scene context based gesture recognition: a contest in ICPR2012. In: Jiang, X., Bellon, O.R.P., Goldgof, D., Oishi, T. (eds.) WDIA 2012. LNCS, vol. 7854, pp. 168\u2013185. Springer, Heidelberg (2013). https:\/\/doi.org\/10.1007\/978-3-642-40303-3_18"},{"unstructured":"Shirai, K., et al.: Visual recipe flow: a dataset for learning visual state changes of objects with recipe flows. In: International Conference on Computational Linguistics, pp. 3570\u20133577 (2022)","key":"8_CR44"},{"doi-asserted-by":"crossref","unstructured":"Spriggs, E.H., De\u00a0La\u00a0Torre, F., Hebert, M.: Temporal segmentation and activity classification from first-person sensing. In: CVPRW, pp. 17\u201324 (2009)","key":"8_CR45","DOI":"10.1109\/CVPRW.2009.5204354"},{"doi-asserted-by":"crossref","unstructured":"Stein, S., McKenna, S.J.: Combining embedded accelerometers with computer vision for recognizing food preparation activities. In: ACM International Joint Conference on Pervasive and Ubiquitous Computing, pp. 729\u2013738 (2013)","key":"8_CR46","DOI":"10.1145\/2493432.2493482"},{"doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: COIN: a large-scale dataset for comprehensive instructional video analysis. In: CVPR (2019)","key":"8_CR47","DOI":"10.1109\/CVPR.2019.00130"},{"doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","key":"8_CR48","DOI":"10.1109\/CVPR.2015.7299087"},{"doi-asserted-by":"crossref","unstructured":"Wang, T., Zhang, R., Lu, Z., Zheng, F., Cheng, R., Luo, P.: End-to-end dense video captioning with parallel decoding. In: ICCV, pp. 6847\u20136857 (2021)","key":"8_CR49","DOI":"10.1109\/ICCV48922.2021.00677"},{"doi-asserted-by":"crossref","unstructured":"Wu, J., Pan, L., Chen, J., Jiang, Y.G.: Ingredient-enriched recipe generation from cooking videos. In: ACM International Conference on Multimedia Retrieval, pp. 249\u2013257 (2022)","key":"8_CR50","DOI":"10.1145\/3512527.3531388"},{"unstructured":"Yagi, T., Ohashi, M., Huang, Y., Furuta, R., Adachi, S., Mitsuyama, T., Sato, Y.: FineBio: a fine-grained video dataset of biological experiments with hierarchical annotation. arXiv preprint arXiv:2402.00293 (2024)","key":"8_CR51"},{"unstructured":"Yamakata, Y., Mori, S., Carroll, J.: English recipe flow graph corpus. In: International Conference on Language Resources and Evaluation, pp. 5187\u20135194 (2020)","key":"8_CR52"},{"doi-asserted-by":"crossref","unstructured":"Yang, A., et al.: Vid2Seq: large-scale pretraining of a visual language model for dense video captioning. In: CVPR (2023)","key":"8_CR53","DOI":"10.1109\/CVPR52729.2023.01032"},{"doi-asserted-by":"crossref","unstructured":"Yang, J., et al.: Panoptic video scene graph generation. In: CVPR, pp. 18675\u201318685 (2023)","key":"8_CR54","DOI":"10.1109\/CVPR52729.2023.01791"},{"doi-asserted-by":"crossref","unstructured":"Zellers, R., et al.: MERLOT reserve: neural script knowledge through vision and language and sound. In: CVPR, pp. 
16375\u201316387 (2022)","key":"8_CR55","DOI":"10.1109\/CVPR52688.2022.01589"},{"doi-asserted-by":"crossref","unstructured":"Zhou, L., Kalantidis, Y., Chen, X., Corso, J.J., Rohrbach, M.: Grounded video description. In: CVPR, pp. 6578\u20136587 (2019)","key":"8_CR56","DOI":"10.1109\/CVPR.2019.00674"},{"doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.J.: Towards automatic learning of procedures from web instructional videos. In: AAAI (2018)","key":"8_CR57","DOI":"10.1609\/aaai.v32i1.12342"},{"doi-asserted-by":"crossref","unstructured":"Zhukov, D., Alayrac, J.B., Cinbis, R.G., Fouhey, D., Laptev, I., Sivic, J.: Cross-task weakly supervised learning from instructional videos. In: CVPR, pp. 3537\u20133545 (2019)","key":"8_CR58","DOI":"10.1109\/CVPR.2019.00365"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73650-6_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,20]],"date-time":"2024-11-20T19:03:53Z","timestamp":1732129433000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73650-6_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,21]]},"ISBN":["9783031736490","9783031736506"],"references-count":58,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73650-6_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,21]]},"assertion":[{"value":"21 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
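
A minimal sketch of how a work record like the one above can be retrieved and unpacked, assuming the public Crossref REST API at https://api.crossref.org/works/<DOI> returns the same "message" envelope and field names shown in this record; the script is illustrative only and is not part of the original metadata.

import json
import urllib.request

DOI = "10.1007/978-3-031-73650-6_8"  # DOI taken from the record above

# Fetch the work record; the payload mirrors the JSON shown above,
# i.e. {"status": "ok", "message-type": "work", "message": {...}}.
with urllib.request.urlopen(f"https://api.crossref.org/works/{DOI}") as resp:
    work = json.load(resp)["message"]

title = work["title"][0]                                    # chapter title
authors = [f'{a["given"]} {a["family"]}' for a in work["author"]]
pages = work["page"]                                        # "123-140"
references = work.get("reference", [])                      # 58 entries in this record

print(title)
print(", ".join(authors))
print(f"pp. {pages}; {len(references)} references")

For example, running this against the record above should print the chapter title ("COM Kitchens: ..."), the author list beginning with Koki Maeda, and "pp. 123-140; 58 references", matching the "title", "author", "page", and "references-count" fields shown.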