{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T16:41:40Z","timestamp":1769186500654,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":30,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819555666","type":"print"},{"value":"9789819555673","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5567-3_19","type":"book-chapter","created":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:14:28Z","timestamp":1769116468000},"page":"271-283","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Improving Spatio-Temporal Awareness of\u00a0Multimodal Large Language Models via\u00a0Reinforcement Fine-Tuning"],"prefix":"10.1007","author":[{"given":"Jingyi","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Hefeng","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Liang","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"19_CR1","unstructured":"Awadalla, A., et al.: OpenFlamingo: an open-source framework for training large autoregressive vision-language models. arXiv preprint arXiv:2308.01390 (2023)"},{"key":"19_CR2","unstructured":"Bai, S., et al: Qwen2.5-VL technical report. arXiv preprint arXiv:2502.13923 (2025)"},{"key":"19_CR3","unstructured":"Belkhale, S., et al.: Rt-1: robotics transformer for real-world control at scale. arXiv preprint arXiv:2212.06817 (2022)"},{"key":"19_CR4","unstructured":"Brohan, A., et al.: Rt-h: action hierarchies using language. arXiv preprint arXiv:2403.01823 (2024)"},{"key":"19_CR5","unstructured":"DeepSeek-AI, Guo, D., et al.: DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning. arXiv preprint arXiv:2501.12948 (2025)"},{"key":"19_CR6","unstructured":"Deng, S.L., et al.: Graspvla: a grasping foundation model pre-trained on billion-scale synthetic action data. arXiv preprint arXiv:2505.03233 (2025)"},{"key":"19_CR7","unstructured":"Driess, D., et al.: Palm-e: an embodied multimodal language model. arXiv preprint arXiv:2303.03378 (2023)"},{"key":"19_CR8","unstructured":"Hu, J.C., Zhang, Y.M., Han, Q., Jiang, D.X., Zhang, X.Y., Shum, H.Y.: Open-reasoner-zero: an open source approach to scaling up reinforcement learning on the base model. arXiv preprint arXiv:2503.24290 (2025)"},{"key":"19_CR9","doi-asserted-by":"crossref","unstructured":"Huang, Z., Ling, G., Zhong, S., Wu, H., Lin, L.: MiniLongBench: the low-cost long context understanding benchmark for large language models. In: Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics, pp. 11442\u201311460 (2025)","DOI":"10.18653\/v1\/2025.acl-long.560"},{"key":"19_CR10","doi-asserted-by":"crossref","unstructured":"Ji, Y.H., et al.: Robobrain: a unified brain model for robotic manipulation from abstract to concrete. arXiv preprint arXiv:2502.21257 (2025)","DOI":"10.1109\/CVPR52734.2025.00168"},{"key":"19_CR11","unstructured":"Kim, M.J., et al.: Openvla: an open-source vision-language-action model. arXiv preprint arXiv:2406.09246 (2024)"},{"key":"19_CR12","unstructured":"Kimi Team, Du, A., et al.: Kimi K1.5: scaling reinforcement learning with LLMs. arXiv preprint arXiv:2501.12599 (2025)"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Li, X.Q., et al.: Manipllm: embodied multimodal large language model for object-centric robotic manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18061\u201318070 (2024)","DOI":"10.1109\/CVPR52733.2024.01710"},{"key":"19_CR14","unstructured":"Liu, H.T., Li, C.Y., Wu, Q.Y., Lee, Y.J.: Visual instruction tuning. In: Proceedings of the Annual Conference on Neural Information Processing Systems (2023)"},{"key":"19_CR15","unstructured":"Liu, J., et al.: Robomamba: multimodal state space model for efficient robot reasoning and manipulation. arXiv preprint arXiv:2406.04339 (2024)"},{"key":"19_CR16","unstructured":"Liu, Z.Y., et al.: Visual-RFT: visual reinforcement fine-tuning. arXiv preprint arXiv:2503.01785 (2025)"},{"key":"19_CR17","unstructured":"OpenAI: learning to reason with LLMs. https:\/\/openai.com\/index\/learning-to-reason-with-llms (2024)"},{"key":"19_CR18","unstructured":"Shao, Z.H., et al.: DeepSeekMath: pushing the limits of mathematical reasoning in open language models. arXiv preprint arXiv:2402.03300 (2024)"},{"key":"19_CR19","unstructured":"Shen, H.Z., et al.: VLM-R1: a stable and generalizable R1-style large vision-language model. arXiv preprint arXiv:2504.07615 (2025)"},{"key":"19_CR20","unstructured":"Shridhar, M., Manuelli, L., Fox, D.: Cliport: what and where pathways for robotic manipulation. In: Conference on Robot Learning (CoRL), pp. 894\u2013906 (2022)"},{"key":"19_CR21","unstructured":"Tang, T., Zhang, L., Wen, Y., Zhang, K., Bian, J.-W., Zhou, X., et al.: RoboPearls: editable video simulation for robot manipulation. In: Proceedings of International Conference on Computer Vision (2025)"},{"key":"19_CR22","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"19_CR23","unstructured":"Wei, J., et al.: Chain-of-thought prompting elicits reasoning in large language models. In: Advances in Neural Information Processing Systems, pp. 24824\u201324837 (2022)"},{"key":"19_CR24","unstructured":"Yu, E., et al.: Perception-R1: pioneering perception policy with reinforcement learning. arXiv preprint arXiv:2504.07954 (2025)"},{"key":"19_CR25","unstructured":"Yu, Q.Y., et al: DAPO: an open-source LLM reinforcement learning system at scale. arXiv preprint arXiv:2503.14476 (2025)"},{"key":"19_CR26","unstructured":"Zhan, Y.F., et al.: Vision-R1: evolving human-free alignment in large vision-language models via vision-guided reinforcement learning. arXiv preprint arXiv:2503.18013 (2025)"},{"key":"19_CR27","unstructured":"Zhang, J.Y., et al.: R1-VL: learning to reason with multimodal large language models via step-wise group relative policy optimization. arXiv preprint arXiv:2503.12937 (2025)"},{"key":"19_CR28","unstructured":"Zhang, K., Xu, R., Ren, P., Lin, J., Wu, H., Lin, L., Liang, X.: RoBridge: a hierarchical architecture bridging cognition and execution for general robotic manipulation. In: Proceedings of International Conference on Computer Vision (2025)"},{"key":"19_CR29","unstructured":"Zhang, S.Q., et al.: LoHoRavens: a long-horizon language-conditioned benchmark for robotic tabletop manipulation. arXiv preprint arXiv:2310.12020 (2023)"},{"key":"19_CR30","unstructured":"Zhang, W.Q., et al.: Embodied-reasoner: synergizing visual search, reasoning, and action for embodied interactive tasks. arXiv preprint arXiv:2503.21696 (2025)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5567-3_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:14:33Z","timestamp":1769116473000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5567-3_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819555666","9789819555673"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5567-3_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}