{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T16:42:44Z","timestamp":1778258564710,"version":"3.51.4"},"reference-count":70,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,5,19]],"date-time":"2025-05-19T00:00:00Z","timestamp":1747612800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,5,19]]},"DOI":"10.1109\/icra55743.2025.11127968","type":"proceedings-article","created":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T17:28:56Z","timestamp":1756834136000},"page":"13289-13296","source":"Crossref","is-referenced-by-count":7,"title":["AVD2: Accident Video Diffusion for Accident Video Description"],"prefix":"10.1109","author":[{"given":"Cheng","family":"Li","sequence":"first","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Keyuan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tong","family":"Liu","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yu","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingqiao","family":"Zhuang","sequence":"additional","affiliation":[{"name":"College of Foreign Language and Literature, Fudan University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huan-ang","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bu","family":"Jin","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hao","family":"Zhao","sequence":"additional","affiliation":[{"name":"Institute for AI Industry Research (AIR), Tsinghua University."}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3435937"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/QoMEX.2012.6263880"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA48891.2023.10160326"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72980-5_15"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC57777.2023.10421901"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00975"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28253"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.320"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00886"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_35"},{"key":"ref12","article-title":"B-SCST: Bayesian self-critical sequence training for image captioning","author":"Bujimalla","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3284038"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/ITSC48978.2021.9564966"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/UR61395.2024.10597464"},{"key":"ref16","article-title":"Hint-ad: Holistically aligned interpretability in end-to-end autonomous driving","author":"Ding","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref17","article-title":"Understanding Embodied Reference with Touch-Line Transformer","author":"Li","year":"2023","journal-title":"ICLR"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01373"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref20","article-title":"Retrieval-augmented generation for large language models: A survey","author":"Gao","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00677"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01742"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00293"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA57147.2024.10611261"},{"key":"ref25","article-title":"Generating videos with scene dynamics","volume":"29","author":"Vondrick","year":"2016","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_23"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref28","article-title":"PixArt- $\\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis","author":"Chen","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.123"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02073"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02079"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00659"},{"key":"ref33","article-title":"MagicTime: Time-lapse Video Generation Models as Metamorphic Simulators","author":"Yuan","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref34","article-title":"Ctrl-u: Robust conditional image generation via uncertaintyaware reward modeling","author":"Zhang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00099"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73411-3_3"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72384-1_58"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-78172-8_9"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/TSMC.2024.3392930"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00165"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3109419"},{"key":"ref42","first-page":"1174","article-title":"Stochastic video generation with a learned prior","volume-title":"International Conference on Machine Learning","author":"Denton","year":"2018"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00240"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02080"},{"key":"ref45","article-title":"Opensora: Democratizing efficient video production for all","author":"Zheng","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref46","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw54120.2021.00217"},{"key":"ref48","volume-title":"Ads of the World, No Author, n.d.","year":"2024"},{"key":"ref49","volume-title":"Cannes Lions","year":"2017"},{"key":"ref50","article-title":"Video generation models as world simulators","author":"Brooks","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-11021-5_5"},{"key":"ref53","article-title":"Generating images with perceptual similarity metrics based on deep networks","volume":"29","author":"Dosovitskiy","year":"2016","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.3390\/rs15020330"},{"key":"ref55","article-title":"Language Model Beats Diffusion-Tokenizer is Key to Visual Generation","author":"Yu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3727200.3727224"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2941820"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00728"},{"key":"ref60","article-title":"A better variant of self-critical sequence training","author":"Luo","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref61","article-title":"Adaptive input representations for neural language modeling","author":"Baevski","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref62","article-title":"Attention is all you need","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6301"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref65","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","volume-title":"Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization","author":"Banerjee","year":"2005"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.3115\/1218955.1219032"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-63322-6_8"},{"key":"ref69","article-title":"GANs trained by a two time-scale update rule converge to a local Nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2011.2173206"}],"event":{"name":"2025 IEEE International Conference on Robotics and Automation (ICRA)","location":"Atlanta, GA, USA","start":{"date-parts":[[2025,5,19]]},"end":{"date-parts":[[2025,5,23]]}},"container-title":["2025 IEEE International Conference on Robotics and Automation (ICRA)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11127273\/11127223\/11127968.pdf?arnumber=11127968","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T06:18:29Z","timestamp":1756880309000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11127968\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,19]]},"references-count":70,"URL":"https:\/\/doi.org\/10.1109\/icra55743.2025.11127968","relation":{},"subject":[],"published":{"date-parts":[[2025,5,19]]}}}