{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:49:56Z","timestamp":1761061796559,"version":"3.37.3"},"reference-count":54,"publisher":"Springer Science and Business Media LLC","issue":"24","license":[{"start":{"date-parts":[[2023,3,20]],"date-time":"2023-03-20T00:00:00Z","timestamp":1679270400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,3,20]],"date-time":"2023-03-20T00:00:00Z","timestamp":1679270400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["JP21J20250"],"award-info":[{"award-number":["JP21J20250"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["JP20H04210"],"award-info":[{"award-number":["JP20H04210"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["JP21H04910"],"award-info":[{"award-number":["JP21H04910"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001691","name":"Japan Society for the Promotion of Science","doi-asserted-by":"publisher","award":["JP17H06100"],"award-info":[{"award-number":["JP17H06100"]}],"id":[{"id":"10.13039\/501100001691","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100020959","name":"JST-Mirai Program","doi-asserted-by":"crossref","award":["JPMJMI21G2"],"award-info":[{"award-number":["JPMJMI21G2"]}],"id":[{"id":"10.13039\/501100020959","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100020958","name":"ACT-I","doi-asserted-by":"crossref","award":["JPMJPR17U5"],"award-info":[{"award-number":["JPMJPR17U5"]}],"id":[{"id":"10.13039\/501100020958","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1007\/s11042-023-14774-7","type":"journal-article","created":{"date-parts":[[2023,3,20]],"date-time":"2023-03-20T10:02:58Z","timestamp":1679306578000},"page":"37273-37301","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["State-aware video procedural captioning"],"prefix":"10.1007","volume":"82","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8725-7164","authenticated-orcid":false,"given":"Taichi","family":"Nishimura","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Atsushi","family":"Hashimoto","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yoshitaka","family":"Ushiku","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hirotaka","family":"Kameko","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shinsuke","family":"Mori","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,3,20]]},"reference":[{"key":"14774_CR1","unstructured":"Akbik A, Blythe D, Vollgraf R (2018) Contextual string embeddings for sequence labeling. In: Proc COLING, pp 1638\u20131649"},{"key":"14774_CR2","doi-asserted-by":"crossref","unstructured":"Alayrac J-B, Bojanowski P, Agrawal N, Sivic J, Laptev I, Lacoste-Julien S (2016) Unsupervised learning from narrated instruction videos. In: Proc CVPR, pp 4575\u20134583","DOI":"10.1109\/CVPR.2016.495"},{"key":"14774_CR3","doi-asserted-by":"crossref","unstructured":"Alayrac J-B, Sivic J, Laptev I, Lacoste-Julien S (2017) Joint discovery of object states and manipulation actions. In: Proc ICCV, pp 2127\u20132136","DOI":"10.1109\/ICCV.2017.234"},{"key":"14774_CR4","doi-asserted-by":"crossref","unstructured":"Amac MS, Yagcioglu S, Erdem A, Erdem E (2019) Procedural reasoning networks for understanding multimodal procedures. In: Proc coNLL, pp 441\u2013451","DOI":"10.18653\/v1\/K19-1041"},{"key":"14774_CR5","unstructured":"Banerjee S, Lavie A (2005) METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In: Proc ACL workshop IEEMMTS, pp 65\u201372"},{"key":"14774_CR6","unstructured":"Bosselut A, Levy O, Holtzman A, Ennis C, Fox D, Choi Y (2018) Simulating action dynamics with neural process networks. In: Proc ICLR"},{"key":"14774_CR7","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: Proc ECCV, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"14774_CR8","doi-asserted-by":"crossref","unstructured":"Chen J, Ngo C-W (2016) Deep-based ingredient recognition for cooking recipe retrieval. In: Proc ACMMM, pp 32\u201341","DOI":"10.1145\/2964284.2964315"},{"key":"14774_CR9","doi-asserted-by":"crossref","unstructured":"Dai Z, Yang Z, Yang Y, Carbonell J, Le Q, Salakhutdinov R (2019) Transformer-xl: attentive language models beyond a fixed-length context. In: Proc ACL, pp 2978\u20132988","DOI":"10.18653\/v1\/P19-1285"},{"key":"14774_CR10","doi-asserted-by":"crossref","unstructured":"Dalvi B, Huang L, Tandon N, Yih W-t, Clark P (2018) Tracking state changes in procedural text: a challenge dataset and models for process paragraph comprehension. In: Proc NAACL, pp 1595\u20131604","DOI":"10.18653\/v1\/N18-1144"},{"key":"14774_CR11","doi-asserted-by":"crossref","unstructured":"Damen D, Doughty H, Farinella GM, Fidler S, Furnari A, Kazakos E, Moltisanti D, Munro J, Perrett T, Price W, Wray M (2018) Scaling egocentric vision: The EPIC-KITCHENS dataset. In: Proc ECCV, pp 720\u2013736","DOI":"10.1007\/978-3-030-01225-0_44"},{"key":"14774_CR12","unstructured":"Devlin J, Chang M -W, Lee K, Toutanova K (2019) Bert: pre-training of deep bidirectional transformers for language understanding. In: Proc NAACL, pp 4171\u20134186"},{"key":"14774_CR13","doi-asserted-by":"crossref","unstructured":"Donahue J, Hendricks LA, Rohrbach M, Venugopalan S, Guadarrama S, Saenko K, Darrell T (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proc CVPR, pp 2625\u20132634","DOI":"10.21236\/ADA623249"},{"key":"14774_CR14","doi-asserted-by":"crossref","unstructured":"Escorcia V, Heilbron FC, Niebles JC, Ghanem B (2016) DAPS: deep action proposals for action understanding. In: Proc ECCV, pp 768\u2013784","DOI":"10.1007\/978-3-319-46487-9_47"},{"key":"14774_CR15","doi-asserted-by":"crossref","unstructured":"Gupta A, Durrett G (2019) Tracking discrete and continuous entity state for process understanding. In: Proc NAACL workshop SPNLP, pp 7\u201312","DOI":"10.18653\/v1\/W19-1502"},{"key":"14774_CR16","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proc CVPR, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"14774_CR17","unstructured":"Ioffe S, Szegedy C (2015) Batch normalization: accelerating deep network training by reducing internal covariate shift. In: Proc ICML, pp 448\u2013456"},{"key":"14774_CR18","unstructured":"Jang E, Gu S, Poole B (2017) Categorical reparametrization with gumble-softmax. In: Proc ICLR"},{"key":"14774_CR19","doi-asserted-by":"crossref","unstructured":"Jermsurawong J, Habash N (2015) Predicting the structure of cooking recipes. In: Proc EMNLP, pp 781\u2013786","DOI":"10.18653\/v1\/D15-1090"},{"key":"14774_CR20","doi-asserted-by":"crossref","unstructured":"Kiddon C, Ponnuraj GT, Zettlemoyer L, Choi Y (2015) Mise en Place: unsupervised interpretation of instructional recipes. In: Proc EMNLP, pp 982\u2013992","DOI":"10.18653\/v1\/D15-1114"},{"key":"14774_CR21","unstructured":"Kingma DP, Ba J (2015) Adam: a method for stochastic optimization. In: Proc ICLR, USA"},{"key":"14774_CR22","doi-asserted-by":"crossref","unstructured":"Lei J, Wang L, Shen Y, Yu D, Berg T, Bansal M (2020) Mart: memory-augmented recurrent transformer for coherent video paragraph captioning. In: Proc ACL, pp 2603\u20132614","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"14774_CR23","doi-asserted-by":"crossref","unstructured":"Lin C-Y, Och FJ (2004) Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics. In: Proc ACL, pp 605\u2013612","DOI":"10.3115\/1218955.1219032"},{"key":"14774_CR24","doi-asserted-by":"crossref","unstructured":"Maeta H, Sasada T, Mori S (2015) A framework for procedural text understanding. In: Proc IWPT, pp 50\u201360","DOI":"10.18653\/v1\/W15-2206"},{"key":"14774_CR25","doi-asserted-by":"crossref","unstructured":"Miech A, Alayrac J-B, Smaira L, Laptev I, Sivic J, Zisserman A (2020) End-to-end learning of visual representations from uncurated instructional videos. In: Proc CVPR, pp 9879\u20139889","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"14774_CR26","doi-asserted-by":"crossref","unstructured":"Miech A, Zhukov D, Alayrac J-B, Tapaswi M, Laptev I, Sivic J (2019) HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: Proc ICCV, pp 2630\u20132640","DOI":"10.1109\/ICCV.2019.00272"},{"key":"14774_CR27","unstructured":"Mikolov T, Sutskever I, Chen K, Corrado GS, Dean J (2013) Distributed representations of words and phrases and their compositionality. In: NeurIPS, pp 3111\u20133119"},{"key":"14774_CR28","doi-asserted-by":"crossref","unstructured":"Mintz M, Bills S, Snow R, Jurafsky D (2009) Distant supervision for relation extraction without labeled data. In: Proc ACL-IJCNLP, pp 1003\u20131011","DOI":"10.3115\/1690219.1690287"},{"key":"14774_CR29","doi-asserted-by":"crossref","unstructured":"Nishimura T, Hashimoto A, Ushiku Y, Kameko H, Mori S (2021) State-aware video procedural captioning. In: Proc ACMMM","DOI":"10.1145\/3474085.3475322"},{"key":"14774_CR30","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/ACCESS.2020.3043452","volume":"9","author":"T Nishimura","year":"2020","unstructured":"Nishimura T, Hashimoto A, Ushiku Y, Kameko H, Yamakata Y, Mori S (2020) Structure-aware procedural text generation from an image sequence. IEEE Access 9:2125\u20132141","journal-title":"IEEE Access"},{"key":"14774_CR31","doi-asserted-by":"crossref","unstructured":"Nishimura T, Sakoda K, Hashimoto A, Ushiku Y, Tanaka N, Ono F, Kameko H, Mori S (2021) Egocentric biochemical video-and-language dataset. In: Proc CLVL, pp 3129\u20133133","DOI":"10.1109\/ICCVW54120.2021.00348"},{"key":"14774_CR32","doi-asserted-by":"crossref","unstructured":"Pan L, Chen J, Wu J, Liu S, Ngo C-W, Kan M-Y, Jiang Y-G, Chua T-S (2020) Multi-modal cooking workflow construction for food recipes. In: Proc ACMMM, pp 1132\u20131141","DOI":"10.1145\/3394171.3413765"},{"key":"14774_CR33","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu W-J (2002) BLEU: a method for automatic evaluation of machine translation. In: Proc ACL, pp 311\u2013318","DOI":"10.3115\/1073083.1073135"},{"key":"14774_CR34","doi-asserted-by":"crossref","unstructured":"Park JS, Rohrbach M, Darrell T, Rohrbach A (2019) Adversarial inference for multi-sentence video description. In: Proc CVPR, pp 6598\u20136608","DOI":"10.1109\/CVPR.2019.00676"},{"key":"14774_CR35","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning C (2014) Glove: global vectors for word representation. In: Proc EMNLP, pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"14774_CR36","unstructured":"Radford A, Metz L, Chintala S (2015) Unsupervised representation learning with deep convolutional generative adversarial networks arXiv"},{"key":"14774_CR37","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-CNN: towards real-time object detection with region proposal networks. In: Proc NeurIPS, pp 91\u201399"},{"key":"14774_CR38","doi-asserted-by":"crossref","unstructured":"Salvador A, Hynes N, Aytar Y, Marin J, Ofli F, Weber I, Torralba A (2017) Learning cross-modal embeddings for cooking recipes and food images. In: Proc CVPR, pp 3020\u20133028","DOI":"10.1109\/CVPR.2017.327"},{"key":"14774_CR39","unstructured":"Santoro A, Faulkner R, Raposo D, Rae J, Chrzanowski M, Weber T, Wierstra D, Vinyals O, Pascanu R, Lillicrap T (2019) Relational recurrent neural networks. In: Proc NeurIPS, pp 7299\u20137310"},{"key":"14774_CR40","doi-asserted-by":"crossref","unstructured":"See A, Liu PJ, Manning CD (2017) Get to the point: summarization with pointer-generator networks. In: Proc ACL, pp 1073\u20131083","DOI":"10.18653\/v1\/P17-1099"},{"key":"14774_CR41","doi-asserted-by":"crossref","unstructured":"Shi B, Ji L, Liang Y, Duan N, Chen P, Niu Z, Zhou M (2019) Dense procedure captioning in narrated instructional videos. In: Proc ACL, pp 6382\u20136391","DOI":"10.18653\/v1\/P19-1641"},{"key":"14774_CR42","doi-asserted-by":"crossref","unstructured":"Shi B, Ji L, Niu Z, Duan N, Zhou M, Chen X (2020) Learning semantic concepts and temporal alignment for narrated video procedural captioning. In: Proc ACMMM, pp 4355\u20134363","DOI":"10.1145\/3394171.3413498"},{"key":"14774_CR43","doi-asserted-by":"crossref","unstructured":"Sun C, Myers A, Vondrick C, Murphy K, Schmid C (2019) Videobert: a joint model for video and language representation learning. In: Proc ICCV, pp 7464\u20137473","DOI":"10.1109\/ICCV.2019.00756"},{"key":"14774_CR44","doi-asserted-by":"crossref","unstructured":"Tan G, Liu D, Wang M, Zha Z-J (2020) Learning to discretely compose reasoning module networks for video captioning. In: Proc IJCAI, pp 745\u2013752","DOI":"10.24963\/ijcai.2020\/104"},{"key":"14774_CR45","first-page":"2579","volume":"9","author":"L van der Maaten","year":"2008","unstructured":"van der Maaten L, Hinton G (2008) Visualizing data using t-sne. J Mach Learn Res 9:2579\u20132605","journal-title":"J Mach Learn Res"},{"key":"14774_CR46","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. In: Proc NeurIPS, pp 5998\u20136008"},{"key":"14774_CR47","doi-asserted-by":"crossref","unstructured":"Vedantam R, Zitnick CL, Parikh D (2015) CIDER: consensus-based image description evaluation. In: Proc CVPR, pp 4566\u20134575","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"14774_CR48","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1162\/neco.1989.1.2.270","volume":"1","author":"RJ Williams","year":"1989","unstructured":"Williams RJ, Zipser D (1989) A learning algorithm for continually running fully recurrent neural networks. Neural Comput 1:270\u2013280","journal-title":"Neural Comput"},{"key":"14774_CR49","doi-asserted-by":"crossref","unstructured":"Xiong Y, Dai B, Lin D (2018) Move forward and tell: a progressive generator of video descriptions. In: Proc ECCV, pp 489\u2013505","DOI":"10.1007\/978-3-030-01252-6_29"},{"key":"14774_CR50","unstructured":"Yamakata Y, Mori S, Carroll J (2020) English recipe flow graph corpus. In: Proc LREC, pp 5187\u20135194"},{"key":"14774_CR51","unstructured":"Zamir N, Noy A, Friedman I, Protter M, Zelnik-Manor L (2020) Asymmetric loss for multi-label classification"},{"key":"14774_CR52","doi-asserted-by":"crossref","unstructured":"Zhou L, Kalantidis Y, Chen X, Corso JJ, Rohrbach M (2019) Grounded video description. In: Proc CVPR, pp 6578\u20136587","DOI":"10.1109\/CVPR.2019.00674"},{"key":"14774_CR53","doi-asserted-by":"crossref","unstructured":"Zhou L, Xu C, Corso JJ (2018) Towards automatic learning of procedures from web instructional videos. In: Proc AAAI, pp 7590\u20137598","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"14774_CR54","doi-asserted-by":"crossref","unstructured":"Zhou L, Zhou Y, Corso JJ, Socher R, Xiong C (2018) End-to-end dense video captioning with masked transformer. In: Proc CVPR, pp 8739\u20138748","DOI":"10.1109\/CVPR.2018.00911"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-14774-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-023-14774-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-14774-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,3]],"date-time":"2023-10-03T09:25:57Z","timestamp":1696325157000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-023-14774-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,20]]},"references-count":54,"journal-issue":{"issue":"24","published-print":{"date-parts":[[2023,10]]}},"alternative-id":["14774"],"URL":"https:\/\/doi.org\/10.1007\/s11042-023-14774-7","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"type":"print","value":"1380-7501"},{"type":"electronic","value":"1573-7721"}],"subject":[],"published":{"date-parts":[[2023,3,20]]},"assertion":[{"value":"5 November 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 June 2022","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 February 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 March 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"All of them are research grants from the Japanese government.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Competing interests"}},{"value":"All authors state that no financial\/non-financial support has been received from any organization that may have an interest in this work.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"<!--Emphasis Type='Bold' removed-->Conflict of Interests"}}]}}