{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T01:53:55Z","timestamp":1781574835394,"version":"3.54.5"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783031060465","type":"print"},{"value":"9783031060472","type":"electronic"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-06047-2_4","type":"book-chapter","created":{"date-parts":[[2022,6,15]],"date-time":"2022-06-15T19:03:42Z","timestamp":1655319822000},"page":"44-57","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Cross-modal Representation Learning for\u00a0Understanding Manufacturing Procedure"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0799-4269","authenticated-orcid":false,"given":"Atsushi","family":"Hashimoto","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Taichi","family":"Nishimura","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9014-1389","authenticated-orcid":false,"given":"Yoshitaka","family":"Ushiku","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9844-6198","authenticated-orcid":false,"given":"Hirotaka","family":"Kameko","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shinsuke","family":"Mori","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,6,16]]},"reference":[{"key":"4_CR1","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings ACL Workshop IEEMMTS, pp. 65\u201372 (2005)"},{"key":"4_CR2","unstructured":"Bosselut, A., Levy, O., Holtzman, A., Ennis, C., Fox, D., Choi, Y.: Simulating action dynamics with neural process networks. In: Proceedings ICLR (2018)"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Chandu, K., Nyberg, E., Black, A.W.: Storyboarding of recipes: grounded contextual generation. In: Proceedings ACL, pp. 6040\u20136046 (2019)","DOI":"10.18653\/v1\/P19-1606"},{"key":"4_CR4","doi-asserted-by":"crossref","unstructured":"Dai, Z., Yang, Z., Yang, Y., Carbonell, J., Le, Q., Salakhutdinov, R.: Transformer-XL: attentive language models beyond a fixed-length context. In: Proceedings ACL, pp. 2978\u20132988 (2019)","DOI":"10.18653\/v1\/P19-1285"},{"key":"4_CR5","doi-asserted-by":"crossref","unstructured":"Gu, J., Im, D.J., Li., V.O.: Neural machine translation with Gumbel-greedy decoding. In: Proceedings AAAI, pp. 5125\u20135132 (2018)","DOI":"10.1609\/aaai.v32i1.12016"},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Harashima, J., Someya, Y., Kikuta, Y.: Cookpad image dataset: an image collection as infrastructure for food research. In: SIGIR (2017)","DOI":"10.1145\/3077136.3080686"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"4_CR8","unstructured":"Huang, T.K., et al.: Visual storytelling. In: Proceedings NAACL-HLT, pp. 1233\u20131239 (2016)"},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Jermsurawong, J., Habash, N.: Predicting the structure of cooking recipes. In: Proceedings EMNLP (2015)","DOI":"10.18653\/v1\/D15-1090"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Kiddon, C., Ponnuraj, G.T., Zettlemoyer, L., Choi, Y.: Mise EN place: unsupervised interpretation of instructional recipes. In: EMNLP (2015)","DOI":"10.18653\/v1\/D15-1114"},{"key":"4_CR11","unstructured":"Kim, T., Heo, M., Son, S., Park, K., Zhang, B.: GLAC Net: glocal attention cascading networks for multi-image cued story generation. arXiv (2018)"},{"key":"4_CR12","unstructured":"Koehn, P.: Statistical significance tests for machine translation evaluation. In: Proceedings EMNLP, pp. 388\u2013395 (2004)"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Lei, J., Wang, L., Shen, Y., Yu, D., Berg, T., Bansal, M.: MART: memory-augmented recurrent transformer for coherent video paragraph captioning. In: Proceedings ACL, pp. 2603\u20132614 (2020)","DOI":"10.18653\/v1\/2020.acl-main.233"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Li, J., Galley, M., Brockett, C., Gao, J., Dolan, B.: A diversity-promoting objective function for neural conversation models. In: Proceedings NAACL-HLT, pp. 110\u2013119 (2016)","DOI":"10.18653\/v1\/N16-1014"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Lin, C.Y., Och, F.J.: Automatic evaluation of machine translation quality using longest common subsequence and skip-bigram statistics. In: Proceedings ACL, pp. 605\u2013612 (2004)","DOI":"10.3115\/1218955.1219032"},{"key":"4_CR16","first-page":"2579","volume":"9","author":"L van der Maaten","year":"2008","unstructured":"van der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9, 2579\u20132605 (2008)","journal-title":"J. Mach. Learn. Res."},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Miech, A., Zhukov, D., Alayrac, J.B., Tapaswi, M., Laptev, I., Sivic, J.: HowTo100M: learning a text-video embedding by watching hundred million narrated video clips. In: Proceedings ICCV, pp. 2630\u20132640 (2019)","DOI":"10.1109\/ICCV.2019.00272"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Mori, S., Maeta, H., Yamakata, Y., Sasada, T.: Flow graph corpus from recipe texts. In: Proceedings LREC (2014)","DOI":"10.63317\/5gpnteuutj4x"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Nishimura, T., Hashimoto, A., Mori, S.: Procedural text generation from a photo sequence. In: Proceedings INLG, pp. 409\u2013414 (2019)","DOI":"10.18653\/v1\/W19-8650"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Nishimura, T., Hashimoto, A., Ushiku, Y., Kameko, H., Mori, S.: State-aware video procedural captioning. In: ACMMM, pp. 1766\u20131774 (2021)","DOI":"10.1145\/3474085.3475322"},{"key":"4_CR21","doi-asserted-by":"publisher","first-page":"2125","DOI":"10.1109\/ACCESS.2020.3043452","volume":"9","author":"T Nishimura","year":"2020","unstructured":"Nishimura, T., Hashimoto, A., Ushiku, Y., Kameko, H., Yamakata, Y., Mori, S.: Structure-aware procedural text generation from an image sequence. IEEE Access 9, 2125\u20132141 (2020)","journal-title":"IEEE Access"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Nishimura, T., et al.: Egocentric biochemical video-and-language dataset. In: Proceedings ICCVW, pp. 3122\u20133126 (2021)","DOI":"10.1109\/ICCVW54120.2021.00348"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: A method for automatic evaluation of machine translation. In: Proceedings ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Park, J.S., Rohrbach, M., Darrell, T., Rohrbach, A.: Adversarial inference for multi-sentence video description. In: Proceedings CVPR, pp. 6598\u20136608 (2019)","DOI":"10.1109\/CVPR.2019.00676"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Shi, B., et al.: Dense procedure captioning in narrated instructional videos. In: Proceedings ACL, pp. 6382\u20136391 (2019)","DOI":"10.18653\/v1\/P19-1641"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Shi, B., Ji, L., Niu, Z., Duan, N., Zhou, M., Chen, X.: Learning semantic concepts and temporal alignment for narrated video procedural captioning. In: Proceedings ACMMM, pp. 4355\u20134363 (2020)","DOI":"10.1145\/3394171.3413498"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Tai, K.S., Socher, R., Manning, C.D.: Improved semantic representations from tree-structured long short-term memory networks. In: Proceedings ACL-IJCNLP, pp. 1556\u20131566 (2015)","DOI":"10.3115\/v1\/P15-1150"},{"key":"4_CR28","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR, June 2015","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Wang, W., Wang, Y., Chen, S., Jin, Q.: YouMakeup: a large-scale domain-specific multimodal dataset for fine-grained semantic comprehension. In: Proceedings EMNLP-IJCNLP, pp. 5133\u20135143 (2019)","DOI":"10.18653\/v1\/D19-1517"},{"key":"4_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"489","DOI":"10.1007\/978-3-030-01252-6_29","volume-title":"Computer Vision \u2013 ECCV 2018","author":"Y Xiong","year":"2018","unstructured":"Xiong, Y., Dai, B., Lin, D.: Move forward and tell: a progressive generator of video descriptions. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11215, pp. 489\u2013505. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01252-6_29"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Yagcioglu, S., Erdem, A., Erdem, E., Ikizler-Cinbis, N.: RecipeQA: a challenge dataset for multimodal comprehension of cooking recipes. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp. 1358\u20131368 (2018)","DOI":"10.18653\/v1\/D18-1166"},{"key":"4_CR33","unstructured":"Yamakata, Y., Mori, S., Carroll, J.: English recipe flow graph corpus. In: Proceedings LREC, pp. 5187\u20135194 (2020)"},{"key":"4_CR34","doi-asserted-by":"crossref","unstructured":"Zhou, L., Xu, C., Corso, J.J.: Towards automatic learning of procedures from web instructional videos. In: Proceedings AAAI, pp. 7590\u20137598 (2018)","DOI":"10.1609\/aaai.v32i1.12342"},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Zhou, L., Zhou, Y., Corso, J.J., Socher, R., Xiong, C.: End-to-end dense video captioning with masked transformer. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, CVPR, June 2018","DOI":"10.1109\/CVPR.2018.00911"}],"container-title":["Lecture Notes in Computer Science","Cross-Cultural Design. Applications in Learning, Arts, Cultural Heritage, Creative Industries, and Virtual Reality"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-06047-2_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T01:27:15Z","timestamp":1781573235000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-06047-2_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031060465","9783031060472"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-06047-2_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"16 June 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"HCII","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Human-Computer Interaction","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 June 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 July 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"hcii2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2022.hci.international\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}