{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T18:49:05Z","timestamp":1771699745478,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"the Fundamental Research Funds for the Central Universities","award":["2019JBZ110"],"award-info":[{"award-number":["2019JBZ110"]}]},{"name":"the Joint Foundation of the Ministry of Education","award":["8091B042235"],"award-info":[{"award-number":["8091B042235"]}]},{"name":"Beijing Natural Science Foundation","award":["4244096"],"award-info":[{"award-number":["4244096"]}]},{"name":"National Natural Science Foundation of China under Grant","award":["62176020"],"award-info":[{"award-number":["62176020"]}]},{"name":"the Beijing Natural Science Foundation under Grant","award":["L211016"],"award-info":[{"award-number":["L211016"]}]},{"name":"the National Key Research and Development Program","award":["2020AAA0106800"],"award-info":[{"award-number":["2020AAA0106800"]}]},{"name":"the Chinese Academy of Sciences","award":["OEIP-O-202004"],"award-info":[{"award-number":["OEIP-O-202004"]}]},{"name":"the Talent Found of Beijing Jiaotong University","award":["2024XKRC075"],"award-info":[{"award-number":["2024XKRC075"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680707","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"8972-8981","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Align2Concept: Language Guided Interpretable Image Recognition by Visual Prototype and Textual Concept Alignment"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-1766-7774","authenticated-orcid":false,"given":"Jiaqi","family":"Wang","sequence":"first","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1430-0237","authenticated-orcid":false,"given":"Pichao","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Seattle, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8358-2995","authenticated-orcid":false,"given":"Yi","family":"Feng","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7914-6867","authenticated-orcid":false,"given":"Huafeng","family":"Liu","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4009-426X","authenticated-orcid":false,"given":"Chang","family":"Gao","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7578-3407","authenticated-orcid":false,"given":"Liping","family":"Jing","sequence":"additional","affiliation":[{"name":"Beijing Jiaotong University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10559-011-9334-2"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0130140"},{"key":"e_1_3_2_1_3_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. Advances in neural information processing systems Vol. 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_4_1","volume-title":"This looks like that: deep learning for interpretable image recognition. Advances in neural information processing systems","author":"Chen Chaofan","year":"2019","unstructured":"Chaofan Chen, Oscar Li, Daniel Tao, Alina Barnett, Cynthia Rudin, and Jonathan K Su. 2019. This looks like that: deep learning for interpretable image recognition. Advances in neural information processing systems, Vol. 32 (2019), 8930--8941."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-020-00265-z"},{"key":"e_1_3_2_1_6_1","volume-title":"Advances in Neural Information Processing Systems","volume":"27","author":"Cui Zhen","year":"2014","unstructured":"Zhen Cui, Hong Chang, Shiguang Shan, and Xilin Chen. 2014. Generalized unsupervised manifold alignment. Advances in Neural Information Processing Systems, Vol. 27 (2014)."},{"key":"e_1_3_2_1_7_1","volume-title":"Real time image saliency for black box classifiers. arXiv preprint arXiv:1705.07857","author":"Dabkowski Piotr","year":"2017","unstructured":"Piotr Dabkowski and Yarin Gal. 2017. Real time image saliency for black box classifiers. arXiv preprint arXiv:1705.07857 (2017)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01002"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00304"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.371"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jml.2020.104172"},{"key":"e_1_3_2_1_13_1","volume-title":"International Workshop on Artificial Intelligence and Statistics. PMLR, 120--127","author":"Ham Jihun","year":"2005","unstructured":"Jihun Ham, Daniel Lee, and Lawrence Saul. 2005. Semisupervised alignment of manifolds. In International Workshop on Artificial Intelligence and Statistics. PMLR, 120--127."},{"key":"e_1_3_2_1_14_1","volume-title":"Canonical correlation analysis: An overview with application to learning methods. Neural computation","author":"Hardoon David R","year":"2004","unstructured":"David R Hardoon, Sandor Szedmak, and John Shawe-Taylor. 2004. Canonical correlation analysis: An overview with application to learning methods. Neural computation, Vol. 16, 12 (2004), 2639--2664."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings, Part IV 14","author":"Hendricks Lisa Anne","year":"2016","unstructured":"Lisa Anne Hendricks, Zeynep Akata, Marcus Rohrbach, Jeff Donahue, Bernt Schiele, and Trevor Darrell. 2016. Generating visual explanations. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part IV 14. Springer, 3--19."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00869"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00096"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01459"},{"key":"e_1_3_2_1_21_1","volume-title":"Aggregating local descriptors into a compact image representation. In 2010 IEEE computer society conference on computer vision and pattern recognition","author":"J\u00e9gou Herv\u00e9","unstructured":"Herv\u00e9 J\u00e9gou, Matthijs Douze, Cordelia Schmid, and Patrick P\u00e9rez. 2010. Aggregating local descriptors into a compact image representation. In 2010 IEEE computer society conference on computer vision and pattern recognition. IEEE, 3304--3311."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01546"},{"key":"e_1_3_2_1_23_1","volume-title":"Beyond grids: Learning graph representations for visual recognition. Advances in neural information processing systems","author":"Li Yin","year":"2018","unstructured":"Yin Li and Abhinav Gupta. 2018. Beyond grids: Learning graph representations for visual recognition. Advances in neural information processing systems, Vol. 31 (2018)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_37"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/11744085_2"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0911-8"},{"key":"e_1_3_2_1_27_1","volume-title":"Do concept bottleneck models learn as intended? arXiv preprint arXiv:2105.04289","author":"Margeloiu Andrei","year":"2021","unstructured":"Andrei Margeloiu, Matthew Ashman, Umang Bhatt, Yanzhi Chen, Mateja Jamnik, and Adrian Weller. 2021. Do concept bottleneck models learn as intended? arXiv preprint arXiv:2105.04289 (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2016.11.008"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-93736-2_34"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01469"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"e_1_3_2_1_32_1","volume-title":"Unsupervised image matching based on manifold alignment","author":"Pei Yuru","year":"2011","unstructured":"Yuru Pei, Fengchun Huang, Fuhao Shi, and Hongbin Zha. 2011. Unsupervised image matching based on manifold alignment. IEEE transactions on pattern analysis and machine intelligence, Vol. 34, 8 (2011), 1658--1664."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.tics.2013.06.004"},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_35_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J Liu. 2020. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of machine learning research, Vol. 21, 140 (2020), 1--67.","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_36_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1037\/0096-3445.104.3.192"},{"key":"e_1_3_2_1_38_1","volume-title":"Natural categories. Cognitive psychology","author":"Rosch Eleanor H","year":"1973","unstructured":"Eleanor H Rosch. 1973. Natural categories. Cognitive psychology, Vol. 4, 3 (1973), 328--350."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3167702"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.74"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383198"},{"key":"e_1_3_2_1_42_1","volume-title":"International Conference on Machine Learning. PMLR, 3145--3153","author":"Shrikumar Avanti","year":"2017","unstructured":"Avanti Shrikumar, Peyton Greenside, and Anshul Kundaje. 2017. Learning important features through propagating activation differences. In International Conference on Machine Learning. PMLR, 3145--3153."},{"key":"e_1_3_2_1_43_1","volume-title":"Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034","author":"Simonyan Karen","year":"2013","unstructured":"Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2013. Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2013)."},{"key":"e_1_3_2_1_44_1","volume-title":"Striving for simplicity: The all convolutional net. arXiv preprint arXiv:1412.6806","author":"Springenberg Jost Tobias","year":"2014","unstructured":"Jost Tobias Springenberg, Alexey Dosovitskiy, Thomas Brox, and Martin Riedmiller. 2014. Striving for simplicity: The all convolutional net. arXiv preprint arXiv:1412.6806 (2014)."},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Machine Learning. PMLR, 3319--3328","author":"Sundararajan Mukund","year":"2017","unstructured":"Mukund Sundararajan, Ankur Taly, and Qiqi Yan. 2017. Axiomatic attribution for deep networks. In International Conference on Machine Learning. PMLR, 3319--3328."},{"key":"e_1_3_2_1_46_1","unstructured":"Catherine Wah Steve Branson Peter Welinder Pietro Perona and Serge Belongie. 2011. The caltech-ucsd birds-200--2011 dataset. (2011)."},{"key":"e_1_3_2_1_47_1","volume-title":"2009 AAAI Fall Symposium Series.","author":"Wang Chang","year":"2009","unstructured":"Chang Wang and Sridhar Mahadevan. 2009. A general framework for manifold alignment. In 2009 AAAI Fall Symposium Series."},{"key":"e_1_3_2_1_48_1","first-page":"3","article-title":"Manifold alignment without correspondence","volume":"2","author":"Wang Chang","year":"2009","unstructured":"Chang Wang and Sridhar Mahadevan. 2009. Manifold alignment without correspondence.. In IJCAI, Vol. 2. 3.","journal-title":"IJCAI"},{"key":"e_1_3_2_1_49_1","unstructured":"Chang Wang and Sridhar Mahadevan. 2011. Heterogeneous domain adaptation using manifold alignment. In Twenty-second international joint conference on artificial intelligence."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3314769"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00093"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10107-012-0584-1"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01839"},{"key":"e_1_3_2_1_54_1","volume-title":"Post-hoc concept bottleneck models. arXiv preprint arXiv:2205.15480","author":"Yuksekgonul Mert","year":"2022","unstructured":"Mert Yuksekgonul, Maggie Wang, and James Zou. 2022. Post-hoc concept bottleneck models. arXiv preprint arXiv:2205.15480 (2022)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10590-1_53"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1059-x"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00920"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.319"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680707","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680707","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:23Z","timestamp":1750291583000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680707"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3680707","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680707","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}