{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:10:15Z","timestamp":1765008615270,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,9]]},"DOI":"10.1145\/3743093.3770997","type":"proceedings-article","created":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:08:11Z","timestamp":1765008491000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CARD: Control-Driven Autoregressive Reconstruction with Decoupled Learning for Multi-Class Anomaly Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-1380-2862","authenticated-orcid":false,"given":"Yifan","family":"Wang","sequence":"first","affiliation":[{"name":"Institution of Automation, Chinese Academy of Science, Beijing, China and School of Artificial Intelligence, University of Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2517-0731","authenticated-orcid":false,"given":"Mingqing","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Tsinghua University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6780-7495","authenticated-orcid":false,"given":"Boyi","family":"Sun","sequence":"additional","affiliation":[{"name":"Institution of Automation, Chinese Academy of Science, Beijing, China and School of Artificial Intelligence, University of Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2521-7006","authenticated-orcid":false,"given":"Qianfan","family":"Zhao","sequence":"additional","affiliation":[{"name":"Research and Development, Hangzhou Xingwuzhong Robot Co., Ltd, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6240-5300","authenticated-orcid":false,"given":"Lu","family":"Zhang","sequence":"additional","affiliation":[{"name":"Institution of Automation, Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2148-1846","authenticated-orcid":false,"given":"Zhiyong","family":"Liu","sequence":"additional","affiliation":[{"name":"Institution of Automation, Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0553-4581","authenticated-orcid":false,"given":"Xu","family":"Yang","sequence":"additional","affiliation":[{"name":"Institution of Automation, Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8068-0883","authenticated-orcid":false,"given":"Suiwu","family":"Zheng","sequence":"additional","affiliation":[{"name":"Institution of Automation, Chinese Academy of Science, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,12,6]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"CVPR","author":"Bergmann Paul","year":"2019","unstructured":"Paul Bergmann, Michael Fauser, David Sattlegger, and Carsten Steger. 2019. MVTec AD\u2013A comprehensive real-world dataset for unsupervised anomaly detection. In CVPR."},{"key":"e_1_3_3_1_3_2","unstructured":"Yunkang Cao Haiming Yao Wei Luo and Weiming Shen. 2025. VarAD: Lightweight High-Resolution Image Anomaly Detection via Visual Autoregressive Modeling. IEEE TII (2025)."},{"key":"e_1_3_3_1_4_2","unstructured":"Xuhai Chen Yue Han and Jiangning Zhang. 2023. April-gan: A zero-\/few-shot anomaly classification and segmentation method for cvpr 2023 vand workshop challenge tracks 1&2: 1st place on zero-shot ad and 4th place on few-shot ad. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2305.17382 (2023)."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00951"},{"key":"e_1_3_3_1_6_2","volume-title":"Proc. NAACL-HLT","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. Bert: Pre-training of deep bidirectional transformers for language understanding. In Proc. NAACL-HLT."},{"key":"e_1_3_3_1_7_2","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et\u00a0al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2010.11929 (2020)."},{"key":"e_1_3_3_1_8_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_3_1_9_2","volume-title":"ECCV","author":"Fu\u010dka Matic","year":"2024","unstructured":"Matic Fu\u010dka, Vitjan Zavrtanik, and Danijel Sko\u010daj. 2024. TransFusion\u2013a transparency-based diffusion model for anomaly detection. In ECCV."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28690"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01878"},{"key":"e_1_3_3_1_13_2","unstructured":"Zongming Li Tianheng Cheng Shoufa Chen Peize Sun Haocheng Shen Longjin Ran Xiaoxin Chen Wenyu Liu and Xinggang Wang. 2024. Controlar: Controllable image generation with autoregressive models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.02705 (2024)."},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP48485.2024.10446794"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01954"},{"key":"e_1_3_3_1_16_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01481"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI48211.2021.9433778"},{"key":"e_1_3_3_1_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ISIE45552.2021.9576231"},{"key":"e_1_3_3_1_19_2","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Th\u00e9o Moutakanni Huy Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby et\u00a0al. 2023. Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2304.07193 (2023)."},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"crossref","unstructured":"Ken Perlin. 1985. An image synthesizer. ACM SIGGRAPH (1985).","DOI":"10.1145\/325165.325247"},{"key":"e_1_3_3_1_21_2","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et\u00a0al. 2018. Improving language understanding by generative pre-training."},{"key":"e_1_3_3_1_22_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01392"},{"key":"e_1_3_3_1_23_2","unstructured":"Y Shi J Yang and Z Qi. 2021. DFR: Deep feature reconstruction for unsupervised anomaly segmentation. Neurocomputing (2021)."},{"key":"e_1_3_3_1_24_2","unstructured":"Peize Sun Yi Jiang Shoufa Chen Shilong Zhang Bingyue Peng Ping Luo and Zehuan Yuan. 2024. Autoregressive model beats diffusion: Llama for scalable image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06525 (2024)."},{"key":"e_1_3_3_1_25_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_1_26_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_1_27_2","volume-title":"NeurIPS","author":"Den\u00a0Oord Aaron Van","year":"2017","unstructured":"Aaron Van Den\u00a0Oord, Oriol Vinyals, et\u00a0al. 2017. Neural discrete representation learning. In NeurIPS."},{"key":"e_1_3_3_1_28_2","volume-title":"NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In NeurIPS."},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"crossref","unstructured":"Minghui Yang Peng Wu and Hui Feng. 2023. MemSeg: A semi-supervised method for image surface defect detection using differences and commonalities. EAAI (2023).","DOI":"10.1016\/j.engappai.2023.105835"},{"key":"e_1_3_3_1_30_2","unstructured":"Haiming Yao Yunkang Cao Wei Luo Weihang Zhang Wenyong Yu and Weiming Shen. 2024. Prior normality prompt transformer for multiclass industrial image anomaly detection. IEEE TII (2024)."},{"key":"e_1_3_3_1_31_2","volume-title":"ECCV","author":"Yao Hang","year":"2024","unstructured":"Hang Yao, Ming Liu, Zhicun Yin, Zifei Yan, Xiaopeng Hong, and Wangmeng Zuo. 2024. GLAD: towards better reconstruction with global and local adaptive diffusion models for unsupervised anomaly detection. In ECCV."},{"key":"e_1_3_3_1_32_2","unstructured":"Ziyu Yao Jialin Li Yifeng Zhou Yong Liu Xi Jiang Chengjie Wang Feng Zheng Yuexian Zou and Lei Li. 2024. Car: Controllable autoregressive modeling for visual generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.04671 (2024)."},{"key":"e_1_3_3_1_33_2","volume-title":"NeurIPS","author":"You Zhiyuan","year":"2022","unstructured":"Zhiyuan You, Lei Cui, Yujun Shen, Kai Yang, Xin Lu, Yu Zheng, and Xinyi Le. 2022. A unified model for multi-class anomaly detection. In NeurIPS."},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"crossref","unstructured":"Sergey Zagoruyko and Nikos Komodakis. 2016. Wide residual networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1605.07146 (2016).","DOI":"10.5244\/C.30.87"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00822"},{"key":"e_1_3_3_1_36_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_1_37_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00624"},{"key":"e_1_3_3_1_38_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00381"},{"key":"e_1_3_3_1_39_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20056-4_23"}],"event":{"name":"MMAsia '25: ACM Multimedia Asia","location":"Kuala Lumpur Malaysia","acronym":"MMAsia '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 7th ACM International Conference on Multimedia in Asia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3743093.3770997","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,6]],"date-time":"2025-12-06T08:08:24Z","timestamp":1765008504000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3743093.3770997"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,6]]},"references-count":38,"alternative-id":["10.1145\/3743093.3770997","10.1145\/3743093"],"URL":"https:\/\/doi.org\/10.1145\/3743093.3770997","relation":{},"subject":[],"published":{"date-parts":[[2025,12,6]]},"assertion":[{"value":"2025-12-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}