{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:19:56Z","timestamp":1778048396854,"version":"3.51.4"},"reference-count":89,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00022","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"138-149","source":"Crossref","is-referenced-by-count":0,"title":["GenHSI: Controllable Generation of Human-Scene Interaction Videos"],"prefix":"10.1109","author":[{"given":"Zekun","family":"Li","sequence":"first","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rui","family":"Zhou","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Rahul","family":"Sajnani","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiaoyan","family":"Cong","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Daniel","family":"Ritchie","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Srinath","family":"Sridhar","sequence":"additional","affiliation":[{"name":"Brown University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref2","volume-title":"Kling ai 1.6 elements"},{"key":"ref3","volume-title":"Kling ai 1.6 frames"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02032"},{"key":"ref5","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00727"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00182"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref10","author":"Chen","year":"2023","journal-title":"Pixart-\u03b1: Fast training of diffusion transformer for photorealistic text-to-image synthesis"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01626"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00572"},{"key":"ref13","article-title":"Dreamcinema: Cinematic transfer with free camera and 3d character","author":"Chen","year":"2024"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00840"},{"key":"ref15","article-title":"Laserhuman: language-guided scene-aware human motion generation in free environment","author":"Cong","year":"2024"},{"key":"ref16","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Forty-first international conference on machine learning","author":"Esser"},{"key":"ref17","author":"Fang","year":"2024","journal-title":"Motioncharacter: Identity-preserving and motion controllable human video generation"},{"key":"ref18","author":"Feng","year":"2023","journal-title":"Dreamoving: A human video generation framework based on diffusion models"},{"key":"ref19","article-title":"Humandit: Pose-guided diffusion transformer for long-form human motion video generation","author":"Gan","year":"2025"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3721238.3730607"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657407"},{"key":"ref23","author":"Guo","year":"2023","journal-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00430"},{"key":"ref25","article-title":"Ltx-video: Realtime video latent diffusion","author":"HaCohen","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00237"},{"key":"ref27","article-title":"Cameractrl: Enabling camera control for text-to-video generation","volume-title":"The Thirteenth International Conference on Learning Representations","author":"He"},{"key":"ref28","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0628"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00951"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01607"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00171"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3592433"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72983-6_23"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.00962"},{"key":"ref37","article-title":"Target-aware video diffusion models","author":"Kim","year":"2025"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01639"},{"key":"ref40","article-title":"Flux","author":"Labs"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01934"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00339"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3528979\/mm1"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01934"},{"key":"ref45","author":"Lin","year":"2024","journal-title":"Open-sora plan: Open-source large video generation model"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01285"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01387"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/2816795.2818013"},{"key":"ref50","article-title":"Step-video-t2v technical report: The practice, challenges, and future of video foundation model","author":"Ma","year":"2025"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00683"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01973"},{"key":"ref53","article-title":"Genheld: Generating and editing handheld objects","author":"Min","year":"2024"},{"key":"ref54","article-title":"Do generative video models understand physical principles","author":"Motamed","year":"2025"},{"key":"ref55","article-title":"Chatgpt-4o","year":"2025"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73347-5_10"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72646-0_4"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01123"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/cvprw67362.2025.00271"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01316"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51701.2025.01482"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/wacv61041.2025.00056"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657497"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00844"},{"key":"ref67","article-title":"Denoising diffusion implicit models","author":"Song","year":"2020"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00622"},{"key":"ref69","article-title":"Add-it: Training-free object insertion in images with pretrained diffusion models","author":"Tewel","year":"2024"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00496"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02295-1"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1088"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/3641519.3657518"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00049"},{"key":"ref75","article-title":"Orient anything: Learning robust object orientation estimation from rendering 3d models","author":"Wang","year":"2024"},{"key":"ref76","article-title":"Detectron2","author":"Wu","year":"2019"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.02000"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_23"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_23"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/tvcg.2026.3662720"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00780"},{"key":"ref82","first-page":"246","article-title":"Generating human interaction motions in scenes with text control","volume-title":"European Conference on Computer Vision","author":"Yi"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00623"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00740"},{"key":"ref86","author":"Zhang","year":"2025","journal-title":"Magic mirror: Id-preserved video generation in video diffusion transformers"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_18"},{"key":"ref88","article-title":"Oscillation inversion: Understand the structure of large flow model through the lens of inversion method","author":"Zheng","year":"2024"},{"key":"ref89","author":"Zhou","year":"2024","journal-title":"Motion control for enhanced complex action video generation"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/iccvw69036.2025.00201"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492739.pdf?arnumber=11492739","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:00:03Z","timestamp":1778047203000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492739\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":89,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00022","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}