{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:14:16Z","timestamp":1777655656562,"version":"3.51.4"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726835","type":"print"},{"value":"9783031726842","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72684-2_2","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:02:45Z","timestamp":1730574165000},"page":"18-34","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["m &amp;m\u2019s: A Benchmark to\u00a0Evaluate Tool-Use for\u00a0multi-step multi-modal Tasks"],"prefix":"10.1007","author":[{"given":"Zixian","family":"Ma","sequence":"first","affiliation":[]},{"given":"Weikai","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Jieyu","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Tanmay","family":"Gupta","sequence":"additional","affiliation":[]},{"given":"Ranjay","family":"Krishna","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"issue":"47","key":"2_CR1","doi-asserted-by":"publisher","first-page":"29302","DOI":"10.1073\/pnas.1912341117","volume":"117","author":"KR Allen","year":"2020","unstructured":"Allen, K.R., Smith, K.A., Tenenbaum, J.B.: Rapid trial-and-error learning with simulation supports flexible tool use and physical reasoning. Proc. Nat. Acad. Sci. 117(47), 29302\u201329310 (2020)","journal-title":"Proc. Nat. Acad. Sci."},{"key":"2_CR2","unstructured":"Chen, P.L., Chang, C.S.: Interact: exploring the potentials of ChatGPT as a cooperative agent. arXiv preprint arXiv:2308.01552 (2023)"},{"issue":"6","key":"2_CR3","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pcbi.1011087","volume":"19","author":"CG Correa","year":"2023","unstructured":"Correa, C.G., Ho, M.K., Callaway, F., Daw, N.D., Griffiths, T.L.: Humans decompose tasks by trading off utility and computational cost. PLoS Comput. Biol. 19(6), e1011087 (2023)","journal-title":"PLoS Comput. Biol."},{"key":"2_CR4","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"2_CR5","unstructured":"Gao, D., et al.: AssistGPT: a general multi-modal assistant that can plan, execute, inspect, and learn. arXiv preprint arXiv:2306.08640 (2023)"},{"key":"2_CR6","unstructured":"Grunde-McLaughlin, M., Lam, M.S., Krishna, R., Weld, D.S., Heer, J.: Designing LLM chains by adapting techniques from crowdsourcing workflows. arXiv preprint arXiv:2312.11681 (2023)"},{"key":"2_CR7","doi-asserted-by":"crossref","unstructured":"Gupta, T., Kembhavi, A.: Visual programming: compositional visual reasoning without training (2022)","DOI":"10.1109\/CVPR52729.2023.01436"},{"key":"2_CR8","unstructured":"Huang, W., et\u00a0al.: Inner monologue: embodied reasoning through planning with language models. arXiv preprint arXiv:2207.05608 (2022)"},{"key":"2_CR9","unstructured":"Huang, Y., et\u00a0al.: Metatool benchmark for large language models: deciding whether to use tools and which to use. arXiv preprint arXiv:2310.03128 (2023)"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2_CR11","doi-asserted-by":"publisher","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.: ReferItGame: referring to objects in photographs of natural scenes. In: Moschitti, A., Pang, B., Daelemans, W. (eds.) Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP), Doha, Qatar, pp. 787\u2013798. Association for Computational Linguistics (2014). https:\/\/doi.org\/10.3115\/v1\/D14-1086. https:\/\/aclanthology.org\/D14-1086","DOI":"10.3115\/v1\/D14-1086"},{"key":"2_CR12","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"2_CR13","unstructured":"Madaan, A., et\u00a0al.: Self-refine: iterative refinement with self-feedback. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"2_CR14","unstructured":"Mialon, G., Fourrier, C., Swift, C., Wolf, T., LeCun, Y., Scialom, T.: Gaia: a benchmark for general AI assistants. arXiv preprint arXiv:2311.12983 (2023)"},{"key":"2_CR15","unstructured":"Miao, N., Teh, Y.W., Rainforth, T.: SelfCheck: using LLMs to zero-shot check their own step-by-step reasoning. arXiv preprint arXiv:2308.00436 (2023)"},{"key":"2_CR16","doi-asserted-by":"publisher","unstructured":"Panayotov, V., Chen, G., Povey, D., Khudanpur, S.: LibriSpeech: an ASR corpus based on public domain audio books. In: 2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 5206\u20135210 (2015). https:\/\/doi.org\/10.1109\/ICASSP.2015.7178964","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"2_CR17","unstructured":"Qin, Y., et al.: ToolLLM: facilitating large language models to master 16000+ real-world APIs (2023)"},{"key":"2_CR18","doi-asserted-by":"publisher","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: SQuAD: 100,000+ questions for machine comprehension of text. In: Su, J., Duh, K., Carreras, X. (eds.) Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, Austin, Texas, pp. 2383\u20132392. Association for Computational Linguistics (2016). https:\/\/doi.org\/10.18653\/v1\/D16-1264. https:\/\/aclanthology.org\/D16-1264","DOI":"10.18653\/v1\/D16-1264"},{"key":"2_CR19","unstructured":"Rana, K., Haviland, J., Garg, S., Abou-Chakra, J., Reid, I., Suenderhauf, N.: SayPlan: grounding large language models using 3D scene graphs for scalable task planning. arXiv preprint arXiv:2307.06135 (2023)"},{"key":"2_CR20","unstructured":"Ruan, Y., et al.: Identifying the risks of LM agents with an LM-emulated sandbox (2023)"},{"key":"2_CR21","unstructured":"Schick, T., et al.: ToolFormer: language models can teach themselves to use tools. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"2_CR22","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: HuggingGPT: solving AI tasks with ChatGPT and its friends in hugging face (2023)"},{"key":"2_CR23","unstructured":"Shen, Y., et al.: TaskBench: benchmarking large language models for task automation. arXiv preprint arXiv:2311.18760 (2023)"},{"key":"2_CR24","unstructured":"Shinn, N., Labash, B., Gopinath, A.: Reflexion: an autonomous agent with dynamic memory and self-reflection. arXiv preprint arXiv:2303.11366 (2023)"},{"key":"2_CR25","doi-asserted-by":"publisher","unstructured":"Suhr, A., Lewis, M., Yeh, J., Artzi, Y.: A corpus of natural language for visual reasoning. In: Barzilay, R., Kan, M.Y. (eds.) Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics, Vancouver, Canada (Volume 2: Short Papers), pp. 217\u2013223. Association for Computational Linguistics (2017). https:\/\/doi.org\/10.18653\/v1\/P17-2034. https:\/\/aclanthology.org\/P17-2034","DOI":"10.18653\/v1\/P17-2034"},{"key":"2_CR26","unstructured":"Sun, H., Zhuang, Y., Kong, L., Dai, B., Zhang, C.: Adaplanner: adaptive planning from feedback with language models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"2_CR27","doi-asserted-by":"crossref","unstructured":"Sur\u00eds, D., Menon, S., Vondrick, C.: ViperGPT: visual inference via Python execution for reasoning. arXiv preprint arXiv:2303.08128 (2023)","DOI":"10.1109\/ICCV51070.2023.01092"},{"key":"2_CR28","unstructured":"Wang, G., et al.: Voyager: an open-ended embodied agent with large language models. arXiv preprint arXiv:2305.16291 (2023)"},{"key":"2_CR29","unstructured":"Wang, L., et\u00a0al.: A survey on large language model based autonomous agents. arXiv preprint arXiv:2308.11432 (2023)"},{"key":"2_CR30","unstructured":"Wang, X., et al.: Mint: evaluating LLMs in multi-turn interaction with tools and language feedback. arXiv preprint arXiv:2309.10691 (2023)"},{"key":"2_CR31","unstructured":"Wang, X., et al.: Self-consistency improves chain of thought reasoning in language models (2023)"},{"key":"2_CR32","unstructured":"Wu, Q., et al.: AutoGen: enabling next-gen LLM applications via multi-agent conversation framework. arXiv preprint arXiv:2308.08155 (2023)"},{"key":"2_CR33","unstructured":"Wu, Z., et al.: Fine-grained human feedback gives better rewards for language model training (2023)"},{"key":"2_CR34","doi-asserted-by":"publisher","unstructured":"Yang, Z., et al.: HotpotQA: a dataset for diverse, explainable multi-hop question answering. In: Riloff, E., Chiang, D., Hockenmaier, J., Tsujii, J. (eds.) Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, Brussels, Belgium, pp. 2369\u20132380. Association for Computational Linguistics (2018). https:\/\/doi.org\/10.18653\/v1\/D18-1259. https:\/\/aclanthology.org\/D18-1259","DOI":"10.18653\/v1\/D18-1259"},{"key":"2_CR35","unstructured":"Yao, S., Chen, H., Yang, J., Narasimhan, K.: WebShop: towards scalable real-world web interaction with grounded language agents. In: Advances in Neural Information Processing Systems, vol. 35, pp. 20744\u201320757 (2022)"},{"key":"2_CR36","unstructured":"Yao, S., et al.: Tree of thoughts: deliberate problem solving with large language models (2023)"},{"key":"2_CR37","unstructured":"Yao, S., et al.: React: synergizing reasoning and acting in language models (2023)"},{"key":"2_CR38","unstructured":"Zhang, J., Krishna, R., Awadallah, A.H., Wang, C.: EcoAssistant: using LLM assistant more affordably and accurately. arXiv preprint arXiv:2310.03046 (2023)"},{"key":"2_CR39","unstructured":"Zhang, K., Mo, L., Chen, W., Sun, H., Su, Y.: MagicBrush: a manually annotated dataset for instruction-guided image editing. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"2_CR40","unstructured":"Zhu, X., et\u00a0al.: Ghost in the minecraft: generally capable agents for open-world enviroments via large language models with text-based knowledge and memory. arXiv preprint arXiv:2305.17144 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72684-2_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T19:02:52Z","timestamp":1730574172000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72684-2_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031726835","9783031726842"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72684-2_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}