{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,13]],"date-time":"2026-04-13T15:07:33Z","timestamp":1776092853596,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the City-University Joint Funding Project of Guangzhou Science and Technology Plan","award":["No. 2023A03J0141"],"award-info":[{"award-number":["No. 2023A03J0141"]}]},{"name":"Foshan HKUST Projects","award":["FSUST21-FYTRI01A, FSUST21-FYTRI02A"],"award-info":[{"award-number":["FSUST21-FYTRI01A, FSUST21-FYTRI02A"]}]},{"name":"OPPO Research Fund"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612420","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:40Z","timestamp":1698391660000},"page":"6472-6480","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Interactive Interior Design Recommendation via Coarse-to-fine Multimodal Reinforcement Learning"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-0800-8035","authenticated-orcid":false,"given":"He","family":"Zhang","sequence":"first","affiliation":[{"name":"Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4763-6060","authenticated-orcid":false,"given":"Ying","family":"Sun","sequence":"additional","affiliation":[{"name":"Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5449-9490","authenticated-orcid":false,"given":"Weiyu","family":"Guo","sequence":"additional","affiliation":[{"name":"Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0331-3494","authenticated-orcid":false,"given":"Yafei","family":"Liu","sequence":"additional","affiliation":[{"name":"OPPO Research Institute, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6332-2785","authenticated-orcid":false,"given":"Haonan","family":"Lu","sequence":"additional","affiliation":[{"name":"OPPO Research Institute, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0686-5206","authenticated-orcid":false,"given":"Xiaodong","family":"Lin","sequence":"additional","affiliation":[{"name":"Rutgers University, New Brunswick, NJ, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6016-6465","authenticated-orcid":false,"given":"Hui","family":"Xiong","sequence":"additional","affiliation":[{"name":"Thrust of Artificial Intelligence, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Towards knowledge-based recommender dialog system. arXiv preprint arXiv:1908.05391","author":"Chen Qibin","year":"2019","unstructured":"Qibin Chen, Junyang Lin, Yichang Zhang, Ming Ding, Yukuo Cen, Hongxia Yang, and Jie Tang. 2019. Towards knowledge-based recommender dialog system. arXiv preprint arXiv:1908.05391 (2019)."},{"key":"e_1_3_2_1_2_1","volume-title":"Gabriela Csurka, and Diane Larlus.","author":"Delmas Ginger","year":"2022","unstructured":"Ginger Delmas, Rafael Sampaio de Rezende, Gabriela Csurka, and Diane Larlus. 2022. ARTEMIS: Attention-based Retrieval with Text-Explicit Matching and Implicit Similarity. arXiv preprint arXiv:2203.08101 (2022)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462913"},{"key":"e_1_3_2_1_5_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_6_1","volume-title":"End-to-End Reinforcement Learning of Dialogue Agents for Information Access. CoRR","author":"Dhingra Bhuwan","year":"2016","unstructured":"Bhuwan Dhingra, Lihong Li, Xiujun Li, Jianfeng Gao, Yun-Nung Chen, Faisal Ahmed, and Li Deng. 2016. End-to-End Reinforcement Learning of Dialogue Agents for Information Access. CoRR , Vol. abs\/1609.00777 (2016). [arXiv]1609.00777 http:\/\/arxiv.org\/abs\/1609.00777"},{"key":"e_1_3_2_1_7_1","volume-title":"Multi-Agent Reinforcement Learning of 3D Furniture Layout Simulation in Indoor Graphics Scenes. arXiv preprint arXiv:2102.09137","author":"Di Xinhan","year":"2021","unstructured":"Xinhan Di and Pengqian Yu. 2021. Multi-Agent Reinforcement Learning of 3D Furniture Layout Simulation in Indoor Graphics Scenes. arXiv preprint arXiv:2102.09137 (2021)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"e_1_3_2_1_9_1","volume-title":"Tel Aviv","author":"Guo Weiyu","year":"2022","unstructured":"Weiyu Guo, Zhaoshuo Li, Yongkui Yang, Zheng Wang, Russell H Taylor, Mathias Unberath, Alan Yuille, and Yingwei Li. 2022. Context-Enhanced Stereo Transformer. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXII. Springer, 263--279."},{"key":"e_1_3_2_1_10_1","volume-title":"Dialog-based interactive image retrieval. Advances in neural information processing systems","author":"Guo Xiaoxiao","year":"2018","unstructured":"Xiaoxiao Guo, Hui Wu, Yu Cheng, Steven Rennie, Gerald Tesauro, and Rogerio Feris. 2018. Dialog-based interactive image retrieval. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_11_1","volume-title":"Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models. arXiv preprint arXiv:2303.11989","author":"H\u00f6llein Lukas","year":"2023","unstructured":"Lukas H\u00f6llein, Ang Cao, Andrew Owens, Justin Johnson, and Matthias Nie\u00dfner. 2023. Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models. arXiv preprint arXiv:2303.11989 (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295748"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2512208"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3167132.3167276"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3336191.3371769"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403258"},{"key":"e_1_3_2_1_17_1","volume-title":"Hannes Schulz, Vincent Michalski, Laurent Charlin, and Chris Pal.","author":"Li Raymond","year":"2018","unstructured":"Raymond Li, Samira Ebrahimi Kahou, Hannes Schulz, Vincent Michalski, Laurent Charlin, and Chris Pal. 2018. Towards deep conversational recommendations. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240646"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911451.2911493"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Machine Learning. PMLR, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821--8831."},{"key":"e_1_3_2_1_23_1","volume-title":"Samira Ebrahimi Kahou, and Yoshua Bengio","author":"Sharma Shikhar","year":"2018","unstructured":"Shikhar Sharma, Dendi Suhubdy, Vincent Michalski, Samira Ebrahimi Kahou, and Yoshua Bengio. 2018. Chatpainter: Improving text to image generation using dialogue. arXiv preprint arXiv:1802.08216 (2018)."},{"key":"e_1_3_2_1_24_1","volume-title":"From show to tell: a survey on deep learning-based image captioning","author":"Stefanini Matteo","year":"2022","unstructured":"Matteo Stefanini, Marcella Cornia, Lorenzo Baraldi, Silvia Cascianelli, Giuseppe Fiameni, and Rita Cucchiara. 2022. From show to tell: a survey on deep learning-based image captioning. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 1 (2022), 539--559."},{"key":"e_1_3_2_1_25_1","volume-title":"The 41st international acm sigir conference on research & development in information retrieval. 235--244.","author":"Sun Yueming","unstructured":"Yueming Sun and Yi Zhang. 2018. Conversational recommender system. In The 41st international acm sigir conference on research & development in information retrieval. 235--244."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00687"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"e_1_3_2_1_28_1","volume-title":"DiffuScene: Scene Graph Denoising Diffusion Probabilistic Model for Generative Indoor Scene Synthesis. arXiv preprint arXiv:2303.14207","author":"Tang Jiapeng","year":"2023","unstructured":"Jiapeng Tang, Yinyu Nie, Lev Markhasin, Angela Dai, Justus Thies, and Matthias Nie\u00dfner. 2023. DiffuScene: Scene Graph Denoising Diffusion Probabilistic Model for Generative Indoor Scene Synthesis. arXiv preprint arXiv:2303.14207 (2023)."},{"key":"e_1_3_2_1_29_1","volume-title":"Multimodal interaction: A review. Pattern recognition letters","author":"Turk Matthew","year":"2014","unstructured":"Matthew Turk. 2014. Multimodal interaction: A review. Pattern recognition letters, Vol. 36 (2014), 189--195."},{"key":"e_1_3_2_1_30_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems, Vol. 30 (2017)."},{"key":"e_1_3_2_1_31_1","volume-title":"Tel Aviv","author":"Wang Jiepeng","year":"2022","unstructured":"Jiepeng Wang, Peng Wang, Xiaoxiao Long, Christian Theobalt, Taku Komura, Lingjie Liu, and Wenping Wang. 2022a. Neuris: Neural reconstruction of indoor scenes using normal priors. In Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23-27, 2022, Proceedings, Part XXXII. Springer, 139--155."},{"key":"e_1_3_2_1_32_1","volume-title":"International Conference on Machine Learning. PMLR, 23318--23340","author":"Wang Peng","year":"2022","unstructured":"Peng Wang, An Yang, Rui Men, Junyang Lin, Shuai Bai, Zhikang Li, Jianxin Ma, Chang Zhou, Jingren Zhou, and Hongxia Yang. 2022b. Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In International Conference on Machine Learning. PMLR, 23318--23340."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/3DV53792.2021.00021"},{"key":"e_1_3_2_1_34_1","volume-title":"Rahul Sajnani, Adrien Poulenard, Srinath Sridhar, and Leonidas Guibas.","author":"Wei Qiuhong Anna","year":"2023","unstructured":"Qiuhong Anna Wei, Sijie Ding, Jeong Joon Park, Rahul Sajnani, Adrien Poulenard, Srinath Sridhar, and Leonidas Guibas. 2023. Lego-net: Learning regular rearrangements of objects in rooms. arXiv preprint arXiv:2301.09629 (2023)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"e_1_3_2_1_36_1","volume-title":"Simple statistical gradient-following algorithms for connectionist reinforcement learning. Reinforcement learning","author":"Williams Ronald J","year":"1992","unstructured":"Ronald J Williams. 1992. Simple statistical gradient-following algorithms for connectionist reinforcement learning. Reinforcement learning (1992), 5--32."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437963.3441791"},{"key":"e_1_3_2_1_38_1","volume-title":"Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. arXiv preprint arXiv:2211.01335","author":"Yang An","year":"2022","unstructured":"An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, and Chang Zhou. 2022. Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese. arXiv preprint arXiv:2211.01335 (2022)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462881"},{"key":"e_1_3_2_1_40_1","volume-title":"Reward Constrained Interactive Recommendation with Natural Language Feedback. arXiv preprint arXiv:2005.01618","author":"Zhang Ruiyi","year":"2020","unstructured":"Ruiyi Zhang, Tong Yu, Yilin Shen, Hongxia Jin, Changyou Chen, and Lawrence Carin. 2020. Reward Constrained Interactive Recommendation with Natural Language Feedback. arXiv preprint arXiv:2005.01618 (2020)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403143"},{"key":"e_1_3_2_1_42_1","volume-title":"Xiaoke Wang, and Ji-Rong Wen.","author":"Zhou Kun","year":"2020","unstructured":"Kun Zhou, Yuanhang Zhou, Wayne Xin Zhao, Xiaoke Wang, and Ji-Rong Wen. 2020b. Towards topic-guided conversational recommender system. arXiv preprint arXiv:2010.04125 (2020)."}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612420","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612420","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:04:23Z","timestamp":1755821063000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612420"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":42,"alternative-id":["10.1145\/3581783.3612420","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612420","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}