{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,11]],"date-time":"2025-09-11T16:38:26Z","timestamp":1757608706281,"version":"3.44.0"},"reference-count":59,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127507","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"12002-12002","source":"Crossref","is-referenced-by-count":0,"title":["NavigateDiff: Visual Predictors are Zero-Shot Navigation Assistants"],"prefix":"10.1109","author":[{"given":"Yiran","family":"Qin","sequence":"first","affiliation":[{"name":"Sun Yat-sen University"}]},{"given":"Ao","family":"Sun","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Yuze","family":"Hong","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Benyou","family":"Wang","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong,Shenzhen"}]},{"given":"Ruimao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr42600.2020.01289"},{"article-title":"Thda: Treasure hunt data augmentation for semantic navigation","volume-title":"Thda: Treasure hunt data augmentation for semantic navigation","author":"Oleksandr","key":"ref2"},{"article-title":"Memory-augmented reinforcement learning for image-goal navigation","volume-title":"Memory-augmented reinforcement learning for image-goal navigation","author":"Lina","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01652"},{"article-title":"Auxiliary tasks and exploration enable objectgoal navigation","volume-title":"Auxiliary tasks and exploration enable objectgoal navigation","author":"Joel","key":"ref5"},{"key":"ref6","article-title":"Object goal navigation using goaloriented semantic exploration","volume":"33","author":"Devendra Singh","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Poni: Potential functions for objectgoal navigation with interaction-free learning","volume-title":"Poni: Potential functions for objectgoal navigation with interaction-free learning","author":"Krishnan","key":"ref7"},{"article-title":"Habitat-web: Learning embodied objectsearch strategies from human demonstrations at scale","volume-title":"Habitat-web: Learning embodied objectsearch strategies from human demonstrations at scale","author":"Ram","key":"ref8"},{"article-title":"Visual representations for semantic target driven navigation","volume-title":"Visual representations for semantic target driven navigation","author":"Arsalan","key":"ref9"},{"key":"ref10","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Prafulla","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref11","article-title":"Classifier-free diffusion guidance Classifier-free diffusion guidance","author":"Jonathan","year":"2022","journal-title":"arXiv preprint"},{"key":"ref12","article-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models, Glide: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Alex","year":"2021","journal-title":"arXiv preprint"},{"article-title":"Instructpix2pix: Learning to follow image editing instructions","volume-title":"Instructpix2pix: Learning to follow image editing instructions","author":"Tim","key":"ref13"},{"key":"ref14","article-title":"Guiding instruction-based image editing via multimodal large language models, Guiding instruction-based image editing via multimodal large language models","author":"Tsu-Jui","year":"2023","journal-title":"arXiv preprint"},{"article-title":"Instructdiffusion: A generalist modeling interface for vision tasks","volume-title":"Instructdiffusion: A generalist modeling interface for vision tasks","author":"Zigang","key":"ref15"},{"key":"ref16","article-title":"Minedreamer: Learning to follow instructions via chain-of-imagination for simulated-world control Minedreamer: Learning to follow instructions via chain-of-imagination for simulated-world control","author":"Enshen","year":"2024","journal-title":"arXiv preprint"},{"article-title":"Esc: Exploration with soft commonsense constraints for zero-shot object navigation","volume-title":"International Conference on Machine Learning. PMLR","author":"Kaiwen","key":"ref17"},{"article-title":"L3mvn: Leveraging large language models for visual target navigation","volume-title":"L3mvn: Leveraging large language models for visual target navigation","author":"Bangguo","key":"ref18"},{"article-title":"Cows on pasture: Baselines and benchmarks for language-driven zero-shot object navigation","volume-title":"Cows on pasture: Baselines and benchmarks for language-driven zero-shot object navigation","author":"Samir Yitzhak","key":"ref19"},{"article-title":"Navigation with large language models: Semantic guesswork as a heuristic for planning","volume-title":"Conference on Robot Learning. PMLR","author":"Dhruv","key":"ref20"},{"article-title":"Bridging zero-shot object navigation and foundation models through pixel-guided navigation skill","volume-title":"Bridging zero-shot object navigation and foundation models through pixel-guided navigation skill","author":"Wenzhe","key":"ref21"},{"key":"ref22","article-title":"Co-NavGPT: Multirobot cooperative visual semantic navigation using large language models Co-NavGPT: Multirobot cooperative visual semantic navigation using large language models","author":"Bangguo","year":"2023","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"Voronav: Voronoi-based zero-shot object navigation with large language model Voronav: Voronoi-based zero-shot object navigation with large language model","author":"Pengying","year":"2024","journal-title":"arXiv preprint"},{"key":"ref24","article-title":"Mp5: A multi-modal open-ended embodied system in minecraft via active perception Mp5: A multi-modal open-ended embodied system in minecraft via active perception","author":"Yiran","year":"2023","journal-title":"arXiv preprint"},{"key":"ref25","article-title":"Learning universal policies via text-guided video generation","volume":"36","author":"Yilun","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref26","article-title":"Compositional foundation models for hierarchical planning","volume":"36","author":"Anurag","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Skilldiffuser: Interpretable hierarchical planning via skill abstractions in diffusion-based task execution","volume-title":"Skilldiffuser: Interpretable hierarchical planning via skill abstractions in diffusion-based task execution","author":"Zhixuan","key":"ref27"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1706.08500"},{"article-title":"The unreasonable effectiveness of deep features as a perceptual metric","volume-title":"The unreasonable effectiveness of deep features as a perceptual metric","author":"Richard","key":"ref29"},{"key":"ref30","article-title":"Language models are few-shot learners, Language models are few-shot learners","author":"Brown","year":"2020","journal-title":"arXiv preprint"},{"key":"ref31","article-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis Sdxl: Improving latent diffusion models for high-resolution image synthesis","author":"Dustin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref32","article-title":"Rt-1: Robotics transformer for real-world control at scale, Rt-1: Robotics transformer for real-world control at scale","author":"Anthony","year":"2022","journal-title":"arXiv preprint"},{"key":"ref33","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control, Rt-2: Vision-language-action models transfer web knowledge to robotic control","author":"Anthony","year":"2023","journal-title":"arXiv preprint"},{"article-title":"Manipllm: Embodied multimodal large language model for object-centric robotic manipulation","volume-title":"Manipllm: Embodied multimodal large language model for object-centric robotic manipulation","author":"Xiaoqi","key":"ref34"},{"key":"ref35","article-title":"ViNT: A foundation model for visual navigation, ViNT: A foundation model for visual navigation","author":"Dhruv","year":"2023","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Visual instruction tuning","volume":"36","author":"Haotian","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref37","article-title":"Lora: Low-rank adaptation of large language models, Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv preprint"},{"article-title":"SupFusion: Supervised LiDAR-camera fusion for 3D object detection","volume-title":"SupFusion: Supervised LiDAR-camera fusion for 3D object detection","author":"Yiran","key":"ref38"},{"key":"ref39","article-title":"Worldsimbench: Towards video generation models as world simulators Worldsimbench: Towards video generation models as world simulators","author":"Yiran","year":"2024","journal-title":"arXiv preprint"},{"key":"ref40","article-title":"GameFactory: Creating New Games with Generative Interactive Videos GameFactory: Creating New Games with Generative Interactive Videos","author":"Jiwen","year":"2025","journal-title":"arXiv preprint"},{"key":"ref41","article-title":"Code-as-Monitor: Constraint-aware Visual Programming for Reactive and Proactive Robotic Failure Detection, Code-as-Monitor: Constraint-aware Visual Programming for Reactive and Proactive Robotic Failure Detection","author":"Enshen","year":"2024","journal-title":"arXiv preprint"},{"key":"ref42","article-title":"AD-H: Autonomous Driving with Hierarchical Agents AD-H: Autonomous Driving with Hierarchical Agents","author":"Zaibin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref43","article-title":"Toward Accurate Camera-based 3D Object Detection via Cascade Depth Estimation and Calibration, Toward Accurate Camera-based 3D Object Detection via Cascade Depth Estimation and Calibration","author":"Chaoqun","year":"2024","journal-title":"arXiv preprint"},{"key":"ref44","article-title":"Story3d-agent: Exploring 3d storytelling visualization with large language models, Story3d-agent: Exploring 3d storytelling visualization with large language models","author":"Yuzhou","year":"2024","journal-title":"arXiv preprint"},{"key":"ref45","article-title":"Semiparametric topological memory for navigation, Semiparametric topological memory for navigation","author":"Nikolay","year":"2018","journal-title":"arXiv preprint"},{"key":"ref46","first-page":"32340","article-title":"Zson: Zero-shot object-goal navigation using multimodal goal embeddings","volume":"35","author":"Arjun","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref47","article-title":"Offline visual representation learning for embodied navigation","volume-title":"Workshop on Reincarnating Reinforcement Learning at ICLR","author":"Karmesh","year":"2023"},{"key":"ref48","article-title":"Ovrl-v2: A simple state-of-art baseline for imagenav and objectnav, Ovrl-v2: A simple state-of-art baseline for imagenav and objectnav","author":"Karmesh","year":"2023","journal-title":"arXiv preprint"},{"key":"ref49","article-title":"FGPrompt: fine-grained goal prompting for imagegoal navigation","volume":"36","author":"Xinyu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"article-title":"Target-driven visual navigation in indoor scenes using deep reinforcement learning","volume-title":"Target-driven visual navigation in indoor scenes using deep reinforcement learning","author":"Yuke","key":"ref50"},{"key":"ref51","article-title":"Generating images with multimodal language models","volume":"36","author":"Jing","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref52","article-title":"Instance-specific image goal navigation: Training embodied agents to find object instances, Instance-specific image goal navigation: Training embodied agents to find object instances","author":"Jacob","year":"2022","journal-title":"arXiv preprint"},{"key":"ref53","article-title":"Proximal policy optimization algorithms, Proximal policy optimization algorithms","author":"John","year":"2017","journal-title":"arXiv preprint"},{"key":"ref54","article-title":"On evaluation of embodied navigation agents, On evaluation of embodied navigation agents","author":"Peter","year":"2018","journal-title":"arXiv preprint"},{"key":"ref55","article-title":"NavCoT: Boosting LLM-Based Vision-andLanguage Navigation via Learning Disentangled Reasoning, NavCoT: Boosting LLM-Based Vision-andLanguage Navigation via Learning Disentangled Reasoning","author":"Bingqian","year":"2024","journal-title":"arXiv preprint"},{"article-title":"Navgpt: Explicit reasoning in vision-and-language navigation with large language models","volume-title":"Navgpt: Explicit reasoning in vision-and-language navigation with large language models","author":"Gengze","key":"ref56"},{"key":"ref57","first-page":"26661","article-title":"No rl, no simulation: Learning to navigate without navigating","volume":"34","author":"Meera","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref58","article-title":"T2ISafety: Benchmark for Assessing Fairness, Toxicity, and Privacy in Image Generation, T2ISafety: Benchmark for Assessing Fairness, Toxicity, and Privacy in Image Generation","author":"Lijun","year":"2025","journal-title":"arXiv preprint"},{"key":"ref59","article-title":"AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in Text-to-Image Generation, AGFSync: Leveraging AI-Generated Feedback for Preference Optimization in Text-to-Image Generation","author":"Jingkun","year":"2024","journal-title":"arXiv preprint"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","start":{"date-parts":[[2025,5,19]]},"location":"Atlanta, GA, USA","end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127507.pdf?arnumber=11127507","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:12:15Z","timestamp":1756879935000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127507\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":59,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127507","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}