{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T03:51:44Z","timestamp":1779940304689,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62272298"],"award-info":[{"award-number":["62272298"]}]},{"name":"the Shanghai Municipal Science and Technology Key Project China","award":["20511100300"],"award-info":[{"award-number":["20511100300"]}]},{"name":"the Shanghai Municipal Science and Technology Major Project China","award":["2021SHZDZX0102"],"award-info":[{"award-number":["2021SHZDZX0102"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612439","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"2847-2855","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":36,"title":["Scene-aware Human Pose Generation using Transformer"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1374-5672","authenticated-orcid":false,"given":"Jieteng","family":"Yao","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3647-8674","authenticated-orcid":false,"given":"Junjie","family":"Chen","sequence":"additional","affiliation":[{"name":"MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1970-8634","authenticated-orcid":false,"given":"Li","family":"Niu","sequence":"additional","affiliation":[{"name":"MoE Key Lab of Artificial Intelligence, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8678-2784","authenticated-orcid":false,"given":"Bin","family":"Sheng","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Mykhaylo Andriluka Leonid Pishchulin Peter Gehler and Bernt Schiele. 2014. 2D Human Pose Estimation: New Benchmark and State of the Art Analysis. In CVPR.","DOI":"10.1109\/CVPR.2014.471"},{"key":"e_1_3_2_1_2_1","volume-title":"Unipose: Unified human pose estimation in single images and videos. In CVPR.","author":"Artacho Bruno","year":"2020","unstructured":"Bruno Artacho and Andreas Savakis. 2020. Unipose: Unified human pose estimation in single images and videos. In CVPR."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01336-9"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Nicolas Carion Francisco Massa Gabriel Synnaeve Nicolas Usunier Alexander Kirillov and Sergey Zagoruyko. 2020. End-to-end object detection with transformers. In ECCV.","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Joao Carreira Pulkit Agrawal Katerina Fragkiadaki and Jitendra Malik. 2016. Human pose estimation with iterative error feedback. In CVPR.","DOI":"10.1109\/CVPR.2016.512"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TAMD.2011.2106782"},{"key":"e_1_3_2_1_7_1","unstructured":"Bowen Cheng Alex Schwing and Alexander Kirillov. 2021. Per-Pixel Classification is Not All You Need for Semantic Segmentation. In NIPS."},{"key":"e_1_3_2_1_8_1","volume-title":"Higherhrnet: Scale-aware representation learning for bottom-up human pose estimation. In CVPR.","author":"Cheng Bowen","year":"2020","unstructured":"Bowen Cheng, Bin Xiao, Jingdong Wang, Honghui Shi, Thomas S Huang, and Lei Zhang. 2020. Higherhrnet: Scale-aware representation learning for bottom-up human pose estimation. In CVPR."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Xiao Chu Wei Yang Wanli Ouyang Cheng Ma Alan L Yuille and Xiaogang Wang. 2017. Multi-context attention for human pose estimation. In CVPR.","DOI":"10.1109\/CVPR.2017.601"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Matthias Dantone Juergen Gall Christian Leistner and Luc Van Gool. 2013. Human pose estimation using body parts dependent joint regressors. In CVPR.","DOI":"10.1109\/CVPR.2013.391"},{"key":"e_1_3_2_1_11_1","volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR.","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. Imagenet: A large-scale hierarchical image database. In CVPR."},{"key":"e_1_3_2_1_12_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. ICLR (2021)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"David Eigen and Rob Fergus. 2015. Predicting depth surface normals and semantic labels with a common multi-scale convolutional architecture. In ICCV.","DOI":"10.1109\/ICCV.2015.304"},{"key":"e_1_3_2_1_14_1","volume-title":"The Ecological Approach to Visual Perception. Houghton Mifflin Comp","author":"Gibson JJ","year":"1979","unstructured":"JJ Gibson. 1979. The Ecological Approach to Visual Perception. Houghton Mifflin Comp (1979)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Helmut Grabner Juergen Gall and Luc Van Gool. 2011. What makes a chair a chair?. In CVPR.","DOI":"10.1109\/CVPR.2011.5995327"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Abhinav Gupta Scott Satkin Alexei A Efros and Martial Hebert. 2011. From 3d scene geometry to human workspace. In CVPR.","DOI":"10.1109\/CVPR.2011.5995448"},{"key":"e_1_3_2_1_17_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530094"},{"key":"e_1_3_2_1_19_1","unstructured":"Diederik P Kingma and Max Welling. 2014. Auto-encoding variational Bayes. In ICLR."},{"key":"e_1_3_2_1_20_1","unstructured":"Donghoon Lee Sifei Liu Jinwei Gu Ming-Yu Liu Ming-Hsuan Yang and Jan Kautz. 2018. Context-aware Synthesis and Placement of Object Instances. In NIPS."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Ke Li Shijie Wang Xiang Zhang Yifan Xu Weijian Xu and Zhuowen Tu. 2021. Pose recognition with cascade transformers. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00198"},{"key":"e_1_3_2_1_22_1","unstructured":"Xueting Li Sifei Liu Kihwan Kim Xiaolong Wang Ming-Hsuan Yang and Jan Kautz. 2019. Putting humans in a scene: Learning affordance in 3d indoor environments. In CVPR."},{"key":"e_1_3_2_1_23_1","volume-title":"St-gan: Spatial transformer generative adversarial networks for image compositing. In CVPR.","author":"Lin Chen-Hsuan","year":"2018","unstructured":"Chen-Hsuan Lin, Ersin Yumer, Oliver Wang, Eli Shechtman, and Simon Lucey. 2018b. St-gan: Spatial transformer generative adversarial networks for image compositing. In CVPR."},{"key":"e_1_3_2_1_24_1","volume-title":"Learning a disentangled embedding for monocular 3d shape retrieval and pose estimation. arXiv preprint arXiv:1812.09899","author":"Lin Kyaw Zaw","year":"2018","unstructured":"Kyaw Zaw Lin, Weipeng Xu, Qianru Sun, Christian Theobalt, and Tat-Seng Chua. 2018a. Learning a disentangled embedding for monocular 3d shape retrieval and pose estimation. arXiv preprint arXiv:1812.09899 (2018)."},{"key":"e_1_3_2_1_25_1","volume-title":"OPA: Object Placement Assessment Dataset. arXiv preprint arXiv:2107.01889","author":"Liu Liu","year":"2021","unstructured":"Liu Liu, Bo Zhang, Jiangtong Li, Li Niu, Qingyang Liu, and Liqing Zhang. 2021. OPA: Object Placement Assessment Dataset. arXiv preprint arXiv:2107.01889 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Manuel Lopes Francisco S Melo and Luis Montesano. 2007. Affordance-based imitation learning in robots. In IROS.","DOI":"10.1109\/IROS.2007.4399517"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSMCB.2005.846654"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2019.09.002"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Julieta Martinez Rayat Hossain Javier Romero and James J Little. 2017. A simple yet effective baseline for 3d human pose estimation. In ICCV.","DOI":"10.1109\/ICCV.2017.288"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Bogdan Moldovan and Luc De Raedt. 2014. Occluded object search by relational affordances. In ICRA.","DOI":"10.1109\/ICRA.2014.6906605"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Alejandro Newell Kaiyu Yang and Jia Deng. 2016. Stacked hourglass networks for human pose estimation. In ECCV.","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"e_1_3_2_1_32_1","volume-title":"Making Images Real Again: A Comprehensive Survey on Deep Image Composition. arXiv preprint arXiv:2106.14490","author":"Niu Li","year":"2021","unstructured":"Li Niu, Wenyan Cong, Liu Liu, Yan Hong, Bo Zhang, Jing Liang, and Liqing Zhang. 2021. Making Images Real Again: A Comprehensive Survey on Deep Image Composition. arXiv preprint arXiv:2106.14490 (2021)."},{"key":"e_1_3_2_1_33_1","volume-title":"Zhenchen Liu, and Jiangtong Li.","author":"Niu Li","year":"2022","unstructured":"Li Niu, Qingyang Liu Liu, Zhenchen Liu, and Jiangtong Li. 2022. Fast Object Placement Assessment. arXiv preprint arXiv:2205.14280 (2022)."},{"key":"e_1_3_2_1_34_1","unstructured":"Adam Paszke Sam Gross Soumith Chintala Gregory Chanan Edward Yang Zachary DeVito Zeming Lin Alban Desmaison Luca Antiga and Adam Lerer. 2017. Automatic differentiation in pytorch. (2017)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Davis Rempe Tolga Birdal Aaron Hertzmann Jimei Yang Srinath Sridhar and Leonidas J Guibas. 2021. Humor: 3d human motion model for robust pose estimation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.01129"},{"key":"e_1_3_2_1_36_1","unstructured":"Anirban Roy and Sinisa Todorovic. 2016. A multi-scale cnn for affordance segmentation in rgb images. In ECCV."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Benjamin Sapp Alexander Toshev and Ben Taskar. 2010. Cascaded models for articulated pose estimation. In ECCV.","DOI":"10.1007\/978-3-642-15552-9_30"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Yohei Shiraki Kazuyuki Nagata Natsuki Yamanobe Akira Nakamura Kensuke Harada Daisuke Sato and Dragomir N Nenchev. 2014. Modeling of everyday objects for semantic grasp. In RO-MAN.","DOI":"10.1109\/ROMAN.2014.6926343"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Kai Su Dongdong Yu Zhenqi Xu Xin Geng and Changhu Wang. 2019. Multi-person pose estimation with enhanced channel-wise and spatial information. In CVPR.","DOI":"10.1109\/CVPR.2019.00582"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Min Sun Pushmeet Kohli and Jamie Shotton. 2012. Conditional regression forests for human pose estimation. In CVPR.","DOI":"10.1109\/CVPR.2012.6248079"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Xiao Sun Jiaxiang Shang Shuang Liang and Yichen Wei. 2017. Compositional human pose regression. In ICCV.","DOI":"10.1109\/ICCV.2017.284"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Fuwen Tan Crispin Bernier Benjamin Cohen Vicente Ordonez and Connelly Barnes. 2018. Where and who? automatic semantic-aware person composition. In WACV.","DOI":"10.1109\/WACV.2018.00170"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Kaihua Tang Hanwang Zhang Baoyuan Wu Wenhan Luo and Wei Liu. 2019. Learning to compose dynamic tree structures for visual contexts. In CVPR.","DOI":"10.1109\/CVPR.2019.00678"},{"key":"e_1_3_2_1_44_1","volume-title":"Deeppose: Human pose estimation via deep neural networks. In CVPR.","author":"Toshev Alexander","year":"2014","unstructured":"Alexander Toshev and Christian Szegedy. 2014. Deeppose: Human pose estimation via deep neural networks. In CVPR."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Shashank Tripathi Siddhartha Chandra Amit Agrawal Ambrish Tyagi James M Rehg and Visesh Chari. 2019. Learning to generate synthetic data via compositing. In CVPR.","DOI":"10.1109\/CVPR.2019.00055"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"crossref","unstructured":"Emre Ugur Erhan Oztop and Erol cS ahin. 2011. Going beyond the perception of affordances: Learning how to actualize them through behavioral parameters. In ICRA.","DOI":"10.1109\/ICRA.2011.5980299"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Emre Ugur Sandor Szedmak and Justus Piater. 2014. Bootstrapping paired-object affordance learning with learned single-affordance features. In ICDL-EPIROB.","DOI":"10.1109\/DEVLRN.2014.6983026"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Karthik Mahesh Varadarajan and Markus Vincze. 2013. Parallel deep learning with suggestive activation for object category recognition. In ICVS.","DOI":"10.1007\/978-3-642-39402-7_36"},{"key":"e_1_3_2_1_49_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NIPS."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"Jacob Walker Kenneth Marino Abhinav Gupta and Martial Hebert. 2017. The pose knows: Video forecasting by generating pose futures. In ICCV.","DOI":"10.1109\/ICCV.2017.361"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Fang Wang and Yi Li. 2013. Beyond physical connections: Tree models in human pose estimation. In CVPR.","DOI":"10.1109\/CVPR.2013.83"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Xiaolong Wang Rohit Girdhar and Abhinav Gupta. 2017. Binge watching: Scaling affordance learning from sitcoms. In CVPR.","DOI":"10.1109\/CVPR.2017.359"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"crossref","unstructured":"Yang Wang and Greg Mori. 2008. Multiple tree models for occlusion and spatial constraints in human pose estimation. In ECCV.","DOI":"10.1007\/978-3-540-88690-7_53"},{"key":"e_1_3_2_1_54_1","unstructured":"Shih-En Wei Varun Ramakrishna Takeo Kanade and Yaser Sheikh. 2016. Convolutional pose machines. In CVPR."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Lingzhi Zhang Tarmily Wen Jie Min Jiancong Wang David Han and Jianbo Shi. 2020a. Learning object placement by inpainting for compositional data augmentation. In ECCV.","DOI":"10.1007\/978-3-030-58601-0_34"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41095-020-0158-8"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Siyuan Zhou Liu Liu Li Niu and Liqing Zhang. 2022. Learning Object Placement via Dual-Path Graph Completion. In ECCV.","DOI":"10.1007\/978-3-031-19790-1_23"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Yixin Zhu Yibiao Zhao and Song Chun Zhu. 2015. Understanding tools: Task-oriented object modeling learning and recognition. In CVPR.","DOI":"10.1109\/CVPR.2015.7298903"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612439","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612439","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:08:06Z","timestamp":1755821286000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612439"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3612439","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612439","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}