{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T16:24:06Z","timestamp":1778084646472,"version":"3.51.4"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726545","type":"print"},{"value":"9783031726552","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,6]],"date-time":"2024-12-06T00:00:00Z","timestamp":1733443200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72655-2_25","type":"book-chapter","created":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T10:10:57Z","timestamp":1733393457000},"page":"436-452","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Propose, Assess, Search: Harnessing LLMs for\u00a0Goal-Oriented Planning in\u00a0Instructional Videos"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3337-0000","authenticated-orcid":false,"given":"Md Mohaiminul","family":"Islam","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1627-3842","authenticated-orcid":false,"given":"Tushar","family":"Nagarajan","sequence":"additional","affiliation":[]},{"given":"Huiyu","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3290-8094","authenticated-orcid":false,"given":"Fu-Jen","family":"Chu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9389-4060","authenticated-orcid":false,"given":"Kris","family":"Kitani","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1800-4790","authenticated-orcid":false,"given":"Gedas","family":"Bertasius","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4372-241X","authenticated-orcid":false,"given":"Xitong","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,6]]},"reference":[{"key":"25_CR1","unstructured":"wikihow. https:\/\/www.wikiHow.com\/"},{"key":"25_CR2","doi-asserted-by":"crossref","unstructured":"Abdelsalam, M.A., Rangrej, S.B., Hadji, I., Dvornik, N., Derpanis, K.G., Fazly, A.: Gepsan: generative procedure step anticipation in cooking videos. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2988\u20132997 (2023)","DOI":"10.1109\/ICCV51070.2023.00279"},{"key":"25_CR3","unstructured":"Ahn, M., et\u00a0al.: Do as i can, not as i say: grounding language in robotic affordances. arXiv preprint arXiv:2204.01691 (2022)"},{"key":"25_CR4","doi-asserted-by":"crossref","unstructured":"Bellman, R.: A Markovian decision process. J. Math. Mech. 679\u2013684 (1957)","DOI":"10.1512\/iumj.1957.6.56038"},{"key":"25_CR5","doi-asserted-by":"crossref","unstructured":"Bi, J., Luo, J., Xu, C.: Procedure planning in instructional videos via contextual modeling and model-based policy learning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15611\u201315620 (2021)","DOI":"10.1109\/ICCV48922.2021.01532"},{"key":"25_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"334","DOI":"10.1007\/978-3-030-58621-8_20","volume-title":"Computer Vision \u2013 ECCV 2020","author":"C-Y Chang","year":"2020","unstructured":"Chang, C.-Y., Huang, D.-A., Xu, D., Adeli, E., Fei-Fei, L., Niebles, J.C.: Procedure planning in instructional videos. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 334\u2013350. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_20"},{"issue":"240","key":"25_CR7","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"25_CR8","unstructured":"Finn, C., Yu, T., Zhang, T., Abbeel, P., Levine, S.: One-shot visual imitation learning via meta-learning. In: Conference on Robot Learning, pp. 357\u2013368. PMLR (2017)"},{"key":"25_CR9","unstructured":"Grauman, K., et\u00a0al.: Ego4d: around the world in 3,000 hours of egocentric video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18995\u201319012 (2022)"},{"key":"25_CR10","unstructured":"Huang, W., Abbeel, P., Pathak, D., Mordatch, I.: Language models as zero-shot planners: extracting actionable knowledge for embodied agents. In: International Conference on Machine Learning, pp. 9118\u20139147. PMLR (2022)"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Islam, M.M., Ho, N., Yang, X., Nagarajan, T., Torresani, L., Bertasius, G.: Video recap: recursive captioning of hour-long videos. arXiv preprint arXiv:2402.13250 (2024)","DOI":"10.1109\/CVPR52733.2024.01723"},{"key":"25_CR12","unstructured":"Kahatapitiya, K., Ranasinghe, K., Park, J., Ryoo, M.S.: Language repository for long video understanding. arXiv preprint arXiv:2403.14622 (2024)"},{"key":"25_CR13","unstructured":"Kahneman, D.: Thinking, Fast and Slow. Macmillan (2011)"},{"key":"25_CR14","doi-asserted-by":"crossref","unstructured":"Liu, J., Li, S., Wang, Z., Li, M., Ji, H.: A language-first approach for procedure planning. In: Findings of the Association for Computational Linguistics: ACL 2023, pp. 1941\u20131954 (2023)","DOI":"10.18653\/v1\/2023.findings-acl.122"},{"key":"25_CR15","unstructured":"Newell, A., Shaw, J.C., Simon, H.A.: Report on a general problem solving program. In: IFIP Congress, Pittsburgh, PA, vol.\u00a0256, p.\u00a064 (1959)"},{"key":"25_CR16","doi-asserted-by":"crossref","unstructured":"Patel, D., Eghbalzadeh, H., Kamra, N., Iuzzolino, M.L., Jain, U., Desai, R.: Pretrained language models as visual planners for human assistance. arXiv preprint arXiv:2304.09179 (2023)","DOI":"10.1109\/ICCV51070.2023.01404"},{"key":"25_CR17","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Reimers, N., Gurevych, I.: Sentence-bert: sentence embeddings using siamese bert-networks. arXiv preprint arXiv:1908.10084 (2019)","DOI":"10.18653\/v1\/D19-1410"},{"issue":"2","key":"25_CR19","doi-asserted-by":"publisher","first-page":"145","DOI":"10.1037\/h0030806","volume":"26","author":"HA Simon","year":"1971","unstructured":"Simon, H.A., Newell, A.: Human problem solving: the state of the theory in 1970. Am. Psychol. 26(2), 145 (1971)","journal-title":"Am. Psychol."},{"issue":"1","key":"25_CR20","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1037\/0033-2909.119.1.3","volume":"119","author":"SA Sloman","year":"1996","unstructured":"Sloman, S.A.: The empirical case for two systems of reasoning. Psychol. Bull. 119(1), 3 (1996)","journal-title":"Psychol. Bull."},{"key":"25_CR21","doi-asserted-by":"crossref","unstructured":"Song, C.H., Wu, J., Washington, C., Sadler, B.M., Chao, W.L., Su, Y.: Llm-planner: few-shot grounded planning for embodied agents with large language models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2998\u20133009 (2023)","DOI":"10.1109\/ICCV51070.2023.00280"},{"issue":"2","key":"25_CR22","doi-asserted-by":"publisher","first-page":"4924","DOI":"10.1109\/LRA.2022.3150855","volume":"7","author":"J Sun","year":"2022","unstructured":"Sun, J., Huang, D.A., Lu, B., Liu, Y.H., Zhou, B., Garg, A.: Plate: visually-grounded planning with transformers in procedural tasks. IEEE Rob. Autom. Lett. 7(2), 4924\u20134930 (2022)","journal-title":"IEEE Rob. Autom. Lett."},{"issue":"4\u20135","key":"25_CR23","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1177\/0278364918770733","volume":"37","author":"N S\u00fcnderhauf","year":"2018","unstructured":"S\u00fcnderhauf, N., et al.: The limits and potentials of deep learning for robotics. Int. J. Rob. Res. 37(4\u20135), 405\u2013420 (2018)","journal-title":"Int. J. Rob. Res."},{"key":"25_CR24","doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: Coin: a large-scale dataset for comprehensive instructional video analysis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1207\u20131216 (2019)","DOI":"10.1109\/CVPR.2019.00130"},{"key":"25_CR25","unstructured":"Touvron, H., et\u00a0al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Wang, A.L., Lin, K.Y., Du, J.R., Meng, J., Zheng, W.S.: Event-guided procedure planning from instructional videos with text supervision. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 13565\u201313575 (2023)","DOI":"10.1109\/ICCV51070.2023.01248"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Wang, H., Wu, Y., Guo, S., Wang, L.: Pdpp: projected diffusion for procedure planning in instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14836\u201314845 (2023)","DOI":"10.1109\/CVPR52729.2023.01425"},{"key":"25_CR28","doi-asserted-by":"crossref","unstructured":"Xu, H., et al.: Videoclip: contrastive pre-training for zero-shot video-text understanding. arXiv preprint arXiv:2109.14084 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"25_CR29","unstructured":"Yao, S., et al.: Tree of thoughts: deliberate problem solving with large language models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"25_CR30","unstructured":"Yao, S., et al.: React: synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629 (2022)"},{"key":"25_CR31","unstructured":"Zeng, A., et\u00a0al.: Socratic models: composing zero-shot multimodal reasoning with language. arXiv preprint arXiv:2204.00598 (2022)"},{"key":"25_CR32","doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: A simple llm framework for long-range video question-answering. arXiv preprint arXiv:2312.17235 (2023)","DOI":"10.18653\/v1\/2024.emnlp-main.1209"},{"key":"25_CR33","doi-asserted-by":"crossref","unstructured":"Zhao, H., Hadji, I., Dvornik, N., Derpanis, K.G., Wildes, R.P., Jepson, A.D.: P3iv: probabilistic procedure planning from instructional videos with weak supervision. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2938\u20132948 (2022)","DOI":"10.1109\/CVPR52688.2022.00295"},{"key":"25_CR34","unstructured":"Zhao, Q., et al.: Antgpt: can large language models help long-term action anticipation from videos? arXiv preprint arXiv:2307.16368 (2023)"},{"key":"25_CR35","doi-asserted-by":"crossref","unstructured":"Zhukov, D., Alayrac, J.B., Cinbis, R.G., Fouhey, D., Laptev, I., Sivic, J.: Cross-task weakly supervised learning from instructional videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3537\u20133545 (2019)","DOI":"10.1109\/CVPR.2019.00365"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72655-2_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,5]],"date-time":"2024-12-05T11:33:45Z","timestamp":1733398425000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72655-2_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,6]]},"ISBN":["9783031726545","9783031726552"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72655-2_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,6]]},"assertion":[{"value":"6 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}