{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:10:19Z","timestamp":1765343419266,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","funder":[{"name":"Key Technology R&D Program of Ningbo","award":["2023Z143, 2025Z028"],"award-info":[{"award-number":["2023Z143, 2025Z028"]}]},{"name":"Natural Science Foundation of the Jiangsu Higher Education Institutions of China","award":["23KJB120013"],"award-info":[{"award-number":["23KJB120013"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758290","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:55Z","timestamp":1761377215000},"page":"13311-13317","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["EditGarment: An Instruction-Based Garment Editing Dataset Constructed with Automated MLLM Synthesis and Semantic-Aware Evaluation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-0655-2064","authenticated-orcid":false,"given":"Deqiang","family":"Yin","sequence":"first","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiangsu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8744-6939","authenticated-orcid":false,"given":"Junyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiangsu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4924-1038","authenticated-orcid":false,"given":"Huanda","family":"Lu","sequence":"additional","affiliation":[{"name":"NingboTech University, Ningbo, Zhejiang, 
China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9618-8965","authenticated-orcid":false,"given":"Fangyu","family":"Wu","sequence":"additional","affiliation":[{"name":"Xi'an Jiaotong-Liverpool University, Suzhou, Jiangsu, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7228-0887","authenticated-orcid":false,"given":"Dongming","family":"Lu","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, Zhejiang, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Jinze Bai Shuai Bai Shusheng Yang Shijie Wang Sinan Tan Peng Wang Junyang Lin Chang Zhou and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding Localization Text Reading and Beyond. arXiv preprint arXiv:2312.11805."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02138"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Alberto Baldrati Davide Morelli Marcella Cornia Marco Bertini and Rita Cucchiara. 2024. Multimodal-conditioned latent diffusion models for fashion image editing. arXiv preprint arXiv:2403.14828.","DOI":"10.1109\/ICCV51070.2023.02138"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00738"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00846"},{"key":"e_1_3_2_1_6_1","first-page":"18392","article-title":"Instructpix2pix: Learning to follow image editing instructions","author":"Brooks Tim","year":"2023","unstructured":"Tim Brooks, Aleksander Holynski, and Alexei A Efros. 2023. Instructpix2pix: Learning to follow image editing instructions. 
Proceedings of the CVPR, 18392-18402.","journal-title":"Proceedings of the CVPR"},{"key":"e_1_3_2_1_7_1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al., 2020. Language models are few-shot learners. In Proceedings of NeurIPS, Vol. 33, 1877-1901.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-emnlp.646"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of ICLR.","author":"Cho Jaemin","year":"2023","unstructured":"Jaemin Cho, Yushi Hu, Roopal Garg, Peter Anderson, Ranjay Krishna, Jason Baldridge, Mohit Bansal, Jordi Pont-Tuset, and Su Wang. 2023. Davidsonian scene graph: Improving reliability in fine-grained evaluation for text-to-image generation. In Proceedings of ICLR."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01391"},{"key":"e_1_3_2_1_11_1","first-page":"8780","article-title":"Diffusion Models Beat GANs, on Image Synthesis","volume":"34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs, on Image Synthesis. In Proceedings of NeurIPS, Vol. 34, 8780-8794.","journal-title":"Proceedings of NeurIPS"},{"key":"e_1_3_2_1_12_1","unstructured":"Daya Guo Dejian Yang Haowei Zhang Junxiao Song Ruoyu Zhang Runxin Xu Qihao Zhu Shirong Ma Peiyi Wang Xiao Bi et al. 2025a. Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning. arXiv preprint arXiv:2501.12948."},{"key":"e_1_3_2_1_13_1","volume-title":"Eng Gee Lim, and Dongming Lu","author":"Guo Junyi","year":"2025","unstructured":"Junyi Guo, Jingxuan Zhang, Fangyu Wu, Huanda Lu, Qiufeng Wang, Wenmian Yang, Eng Gee Lim, and Dongming Lu. 2025b. 
HiGarment: Cross-modal Harmony Based Diffusion Model for Flat Sketch to Realistic Garment Image. arXiv preprint arXiv:2505.23186."},{"key":"e_1_3_2_1_14_1","first-page":"20406","article-title":"TIFA: Accurate, and Interpretable Text-to-Image Faithfulness Evaluation, with Question Answering","author":"Hu Yushi","year":"2023","unstructured":"Yushi Hu, Benlin Liu, Jungo Kasai, Yizhong Wang, Mari Ostendorf, Ranjay Krishna, and Noah A. Smith. 2023. TIFA: Accurate, and Interpretable Text-to-Image Faithfulness Evaluation, with Question Answering. In Proceedings of ICCV, 20406-20417.","journal-title":"Proceedings of ICCV"},{"key":"e_1_3_2_1_15_1","article-title":"Diffusion model-based image editing: A survey","author":"Huang Yi","year":"2025","unstructured":"Yi Huang, Jiancheng Huang, Yifan Liu, Mingfu Yan, Jiaxi Lv, Jianzhuang Liu, Wei Xiong, He Zhang, Liangliang Cao, and Shifeng Chen. 2025. Diffusion model-based image editing: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01185"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of ICLR.","author":"Hui Mude","year":"2024","unstructured":"Mude Hui, Siwei Yang, Bingchen Zhao, Yichun Shi, Heng Wang, Peng Wang, Yuyin Zhou, and Cihang Xie. 2024. HQ-Edit: A High-Quality Dataset for Instruction-based Image Editing. In Proceedings of ICLR."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00747"},{"key":"e_1_3_2_1_20_1","unstructured":"Pamela Mishkin Lama Ahmad Miles Brundage Gretchen Krueger and Girish Sastry. 2022. DALL\u00b7E 2 Preview - Risks and Limitations. 
https:\/\/github.com\/openai\/dalle-2-preview\/blob\/main\/system-card.md"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_22_1","first-page":"8580","article-title":"LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On","author":"Morelli Davide","year":"2023","unstructured":"Davide Morelli, Alberto Baldrati, Giuseppe Cartella, Marcella Cornia, Marco Bertini, and Rita Cucchiara. 2023. LaDI-VTON: Latent Diffusion Textual-Inversion Enhanced Virtual Try-On. In Proceedings of ACM MM, 8580-8589.","journal-title":"Proceedings of ACM MM"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00243"},{"key":"e_1_3_2_1_24_1","unstructured":"Jiteng Mu Nuno Vasconcelos and Xiaolong Wang. 2025. EditAR: Unified Conditional Generation with Autoregressive Models. arXiv preprint arXiv:2501.04699."},{"key":"e_1_3_2_1_25_1","unstructured":"OpenAI. 2024. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774."},{"key":"e_1_3_2_1_26_1","unstructured":"William Peebles and Saining Xie. 2023. Scalable Diffusion Models with Transformers. arXiv preprint arXiv:2212.09748 4195-4205."},{"key":"e_1_3_2_1_27_1","volume-title":"Proceedings of ICML, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021-07-18\/2021-07-24. Learning Transferable Visual Models From Natural Language Supervision. In Proceedings of ICML, 8748-8763."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00847"},{"key":"e_1_3_2_1_30_1","unstructured":"Xincheng Shuai Henghui Ding Xingjun Ma Rongcheng Tu Yu-Gang Jiang and Dacheng Tao. 2024. 
A survey of multimodal-guided image editing with text-to-image diffusion models. arXiv preprint arXiv:2406.14555."},{"key":"e_1_3_2_1_31_1","unstructured":"Gemini Team Rohan Anil Sebastian Borgeaud Jean-Baptiste Alayrac Jiahui Yu Radu Soricut Johan Schalkwyk Andrew M Dai Anja Hauth Katie Millican et al. 2023. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805."},{"key":"e_1_3_2_1_32_1","unstructured":"Yoad Tewel Rinon Gal Dvir Samuel Yuval Atzmon Lior Wolf and Gal Chechik. 2024. Add-It: Training-Free Object Insertion in Images With Pretrained Diffusion Models. arXiv preprint arXiv:2411.07232."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i9.28885"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_35_1","volume-title":"MAGICBRUSH: A Manually Annotated Dataset, for Instruction-Guided Image Editing. 31428-31449.","author":"Zhang Kai","year":"2023","unstructured":"Kai Zhang, Lingbo Mo, Wenhu Chen, Huan Sun, and Yu Su. 2023a. MAGICBRUSH: A Manually Annotated Dataset, for Instruction-Guided Image Editing. 31428-31449."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of ECCV, 148-164","author":"Zhang Shiyue","year":"2024","unstructured":"Shiyue Zhang, Zheng Chong, Xujie Zhang, Hanhui Li, Yuhao Cheng, Yiqiang Yan, and Xiaodan Liang. 2024a. GarmentAligner: Text-to-Garment Generation, via Retrieval-augmented Multi-level Corrections. In Proceedings of ECCV, 148-164."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00862"},{"key":"e_1_3_2_1_39_1","first-page":"4525","article-title":"ARMANI: Part-level Garment-Text Alignment, for Unified Cross-Modal Fashion Design","author":"Zhang Xujie","year":"2022","unstructured":"Xujie Zhang, Yu Sha, Michael C. 
Kampffmeyer, Zhenyu Xie, Zequn Jie, Chengwen Huang, Jianqing Peng, and Xiaodan Liang. 2022. ARMANI: Part-level Garment-Text Alignment, for Unified Cross-Modal Fashion Design. In Proceedings of ACM MM, 4525-4535.","journal-title":"Proceedings of ACM MM"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02116"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758290","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:07:25Z","timestamp":1765343245000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758290"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3758290","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758290","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}