{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,4]],"date-time":"2025-07-04T06:16:48Z","timestamp":1751609808710,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":100,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,4,19]],"date-time":"2023-04-19T00:00:00Z","timestamp":1681862400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["R01MH125740, R01MH096951, U01MH116925"],"award-info":[{"award-number":["R01MH125740, R01MH096951, U01MH116925"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1722822, 1750439"],"award-info":[{"award-number":["1722822, 1750439"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100019827","name":"Meta","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100019827","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004704","name":"BMW of North America","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100004704","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,4,19]]},"DOI":"10.1145\/3544549.3585604","type":"proceedings-article","created":{"date-parts":[[2023,4,20]],"date-time":"2023-04-20T07:31:00Z","timestamp":1681975860000},"page":"1-21","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MultiViz: Towards User-Centric Visualizations and Interpretations of Multimodal Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7768-3610","authenticated-orcid":false,"given":"Paul Pu","family":"Liang","sequence":"first","affiliation":[{"name":"Machine Learning Department, Carnegie Mellon University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3882-4246","authenticated-orcid":false,"given":"Yiwei","family":"Lyu","sequence":"additional","affiliation":[{"name":"University of Michigan, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8445-9051","authenticated-orcid":false,"given":"Gunjan","family":"Chhablani","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1939-0278","authenticated-orcid":false,"given":"Nihal","family":"Jain","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9392-2335","authenticated-orcid":false,"given":"Zihao","family":"Deng","sequence":"additional","affiliation":[{"name":"University of Pennsylvania, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5693-1128","authenticated-orcid":false,"given":"Xingbo","family":"Wang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, The Hong Kong University of Science and Technology, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6376-7696","authenticated-orcid":false,"given":"Louis-Philippe","family":"Morency","sequence":"additional","affiliation":[{"name":"Language Technologies Institute, Carnegie Mellon University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3752-2756","authenticated-orcid":false,"given":"Ruslan","family":"Salakhutdinov","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, United States"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,4,19]]},"reference":[{"key":"e_1_3_3_3_1_1","volume-title":"Sanity Checks for Saliency Maps. CoRR abs\/1810.03292","author":"Adebayo Julius","year":"2018","unstructured":"Julius Adebayo, Justin Gilmer, Michael Muelly, Ian\u00a0J. Goodfellow, Moritz Hardt, and Been Kim. 2018. Sanity Checks for Saliency Maps. CoRR abs\/1810.03292 (2018). arXiv:1810.03292http:\/\/arxiv.org\/abs\/1810.03292"},{"key":"e_1_3_3_3_2_1","volume-title":"International Conference on Machine Learning. PMLR, 279\u2013290","author":"Amizadeh Saeed","year":"2020","unstructured":"Saeed Amizadeh, Hamid Palangi, Alex Polozov, Yichen Huang, and Kazuhito Koishida. 2020. Neuro-Symbolic Visual Reasoning: Disentangling Visual from Reasoning. In International Conference on Machine Learning. PMLR, 279\u2013290."},{"key":"e_1_3_3_3_3_1","volume-title":"Blindfold baselines for embodied QA. arXiv preprint arXiv:1811.05013","author":"Anand Ankesh","year":"2018","unstructured":"Ankesh Anand, Eugene Belilovsky, Kyle Kastner, Hugo Larochelle, and Aaron Courville. 2018. Blindfold baselines for embodied QA. arXiv preprint arXiv:1811.05013 (2018)."},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.12"},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.279"},{"key":"e_1_3_3_3_6_1","volume-title":"Gated Multimodal Units for Information Fusion. In 5th International conference on learning representations 2017 workshop.","author":"Arevalo John","year":"2017","unstructured":"John Arevalo, Thamar Solorio, Manuel Montes-y G\u00f3mez, and Fabio\u00a0A Gonz\u00e1lez. 2017. Gated Multimodal Units for Information Fusion. In 5th International conference on learning representations 2017 workshop."},{"key":"e_1_3_3_3_7_1","volume-title":"Rethinking User Study Design for Evaluating Model Explanations. arXiv preprint arXiv:2112.09669","author":"Arora Siddhant","year":"2021","unstructured":"Siddhant Arora, Danish Pruthi, Norman Sadeh, William\u00a0W Cohen, Zachary\u00a0C Lipton, and Graham Neubig. 2021. Explain, Edit, and Understand: Rethinking User Study Design for Evaluating Model Explanations. arXiv preprint arXiv:2112.09669 (2021)."},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/1756006.1859912"},{"key":"e_1_3_3_3_9_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Baltru\u0161aitis Tadas","year":"2018","unstructured":"Tadas Baltru\u0161aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence 41, 2 (2018), 423\u2013443."},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3375624"},{"key":"e_1_3_3_3_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/122475.122486"},{"key":"e_1_3_3_3_12_1","first-page":"841","article-title":". RUBi: Reducing Unimodal Biases for Visual Question Answering","volume":"32","author":"Cadene Remi","year":"2019","unstructured":"Remi Cadene, Corentin Dancette, Matthieu Cord, Devi Parikh, 2019. RUBi: Reducing Unimodal Biases for Visual Question Answering. Advances in Neural Information Processing Systems 32 (2019), 841\u2013852.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_3_3_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_34"},{"key":"e_1_3_3_3_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.345"},{"key":"e_1_3_3_3_15_1","doi-asserted-by":"crossref","unstructured":"Arjun Chandrasekaran Viraj Prabhu Deshraj Yadav Prithvijit Chattopadhyay and Devi Parikh. 2018. Do explanations make VQA models more predictable to a human?. In EMNLP.","DOI":"10.18653\/v1\/D18-1128"},{"key":"e_1_3_3_3_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136801"},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511299"},{"key":"e_1_3_3_3_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2207676.2207738"},{"key":"e_1_3_3_3_19_1","volume-title":"Framework for Evaluating Faithfulness of Local Explanations. arXiv preprint arXiv:2202.00734","author":"Dasgupta Sanjoy","year":"2022","unstructured":"Sanjoy Dasgupta, Nave Frost, and Michal Moshkovitz. 2022. Framework for Evaluating Faithfulness of Local Explanations. arXiv preprint arXiv:2202.00734 (2022)."},{"volume-title":"Human machine interaction","author":"Dumas Bruno","key":"e_1_3_3_3_20_1","unstructured":"Bruno Dumas, Denis Lalanne, and Sharon Oviatt. 2009. Multimodal interfaces: A survey of principles, models and frameworks. In Human machine interaction. Springer, 3\u201326."},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/6046.865479"},{"key":"e_1_3_3_3_22_1","first-page":"1","article-title":"Visualizing higher-layer features of a deep network","volume":"1341","author":"Erhan Dumitru","year":"2009","unstructured":"Dumitru Erhan, Yoshua Bengio, Aaron Courville, and Pascal Vincent. 2009. Visualizing higher-layer features of a deep network. University of Montreal 1341, 3 (2009), 1.","journal-title":"University of Montreal"},{"key":"e_1_3_3_3_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.775"},{"key":"e_1_3_3_3_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TITB.2009.2038481"},{"key":"e_1_3_3_3_25_1","volume-title":"Predictive learning via rule ensembles. The annals of applied statistics 2, 3","author":"Friedman H","year":"2008","unstructured":"Jerome\u00a0H Friedman and Bogdan\u00a0E Popescu. 2008. Predictive learning via rule ensembles. The annals of applied statistics 2, 3 (2008), 916\u2013954."},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSAA.2018.00018"},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58589-1_23"},{"key":"e_1_3_3_3_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2556288.2557173"},{"key":"e_1_3_3_3_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_3_3_30_1","volume-title":"Towards transparent ai systems: Interpreting visual question answering models. arXiv preprint arXiv:1608.08974","author":"Goyal Yash","year":"2016","unstructured":"Yash Goyal, Akrit Mohapatra, Devi Parikh, and Dhruv Batra. 2016. Towards transparent ai systems: Interpreting visual question answering models. arXiv preprint arXiv:1608.08974 (2016)."},{"key":"e_1_3_3_3_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.492"},{"key":"e_1_3_3_3_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_47"},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.62"},{"key":"e_1_3_3_3_34_1","volume-title":"A Picture is Worth a Thousand Words: Multimodal Sensemaking of the Global Financial Crisis","author":"Hollerer A.","year":"2018","unstructured":"Markus\u00a0A. Hollerer, Dennis Jancsary, and Maria Grafstrom. 2018. A Picture is Worth a Thousand Words: Multimodal Sensemaking of the Global Financial Crisis. Organization Studies (2018)."},{"key":"e_1_3_3_3_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_44"},{"key":"e_1_3_3_3_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.386"},{"key":"e_1_3_3_3_37_1","volume-title":"a freely accessible critical care database. Scientific data 3, 1","author":"Johnson EW","year":"2016","unstructured":"Alistair\u00a0EW Johnson, Tom\u00a0J Pollard, Lu Shen, H\u00a0Lehman Li-Wei, Mengling Feng, Mohammad Ghassemi, Benjamin Moody, Peter Szolovits, Leo\u00a0Anthony Celi, and Roger\u00a0G Mark. 2016. MIMIC-III, a freely accessible critical care database. Scientific data 3, 1 (2016), 1\u20139."},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"e_1_3_3_3_39_1","volume-title":"MDETR\u2013Modulated Detection for End-to-End Multi-Modal Understanding. arXiv preprint arXiv:2104.12763","author":"Kamath Aishwarya","year":"2021","unstructured":"Aishwarya Kamath, Mannat Singh, Yann LeCun, Ishan Misra, Gabriel Synnaeve, and Nicolas Carion. 2021. MDETR\u2013Modulated Detection for End-to-End Multi-Modal Understanding. arXiv preprint arXiv:2104.12763 (2021)."},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00879"},{"key":"e_1_3_3_3_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357251.3357581"},{"key":"e_1_3_3_3_42_1","volume-title":"International Conference on Machine Learning. PMLR, 5583\u20135594","author":"Kim Wonjae","year":"2021","unstructured":"Wonjae Kim, Bokyung Son, and Ildoo Kim. 2021. Vilt: Vision-and-language transformer without convolution or region supervision. In International Conference on Machine Learning. PMLR, 5583\u20135594."},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"crossref","unstructured":"Elsa\u00a0A Kirchner Stephen\u00a0H Fairclough and Frank Kirchner. 2019. Embedded multimodal interfaces in robotics: applications future trends and societal implications. In The Handbook of Multimodal-Multisensor Interfaces: Language Processing Software Commercialization and Emerging Directions-Volume 3. 523\u2013576.","DOI":"10.1145\/3233795.3233810"},{"key":"e_1_3_3_3_44_1","unstructured":"Klaus Krippendorff. 2011. Computing Krippendorff\u2019s alpha-reliability. (2011)."},{"key":"e_1_3_3_3_45_1","volume-title":"The Disagreement Problem in Explainable Machine Learning: A Practitioner\u2019s Perspective. arXiv preprint arXiv:2202.01602","author":"Krishna Satyapriya","year":"2022","unstructured":"Satyapriya Krishna, Tessa Han, Alex Gu, Javin Pombra, Shahin Jabbari, Steven Wu, and Himabindu Lakkaraju. 2022. The Disagreement Problem in Explainable Machine Learning: A Practitioner\u2019s Perspective. arXiv preprint arXiv:2202.01602 (2022)."},{"key":"e_1_3_3_3_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.241"},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA.2019.8793485"},{"key":"e_1_3_3_3_48_1","volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","author":"Li Liunian\u00a0Harold","year":"2019","unstructured":"Liunian\u00a0Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557 (2019)."},{"key":"e_1_3_3_3_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.469"},{"key":"e_1_3_3_3_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3311823.3311831"},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"crossref","unstructured":"Paul\u00a0Pu Liang Zhun Liu Yao-Hung\u00a0Hubert Tsai Qibin Zhao Ruslan Salakhutdinov and Louis-Philippe Morency. 2019. Learning Representations from Imperfect Time Series Data via Tensor Rank Regularization. In ACL.","DOI":"10.18653\/v1\/P19-1152"},{"key":"e_1_3_3_3_52_1","doi-asserted-by":"crossref","unstructured":"Paul\u00a0Pu Liang Ziyin Liu AmirAli\u00a0Bagher Zadeh and Louis-Philippe Morency. 2018. Multimodal Language Analysis with Recurrent Multistage Fusion. In EMNLP.","DOI":"10.18653\/v1\/D18-1014"},{"key":"e_1_3_3_3_53_1","unstructured":"Paul\u00a0Pu Liang Yiwei Lyu Gunjan Chhablani Nihal Jain Zihao Deng 2023. MultiViz: Towards Visualizing and Understanding Multimodal Models. In ICLR."},{"key":"e_1_3_3_3_54_1","volume-title":"MultiBench: Multiscale Benchmarks for Multimodal Representation Learning. NeurIPS Datasets and Benchmarks Track","author":"Liang Paul\u00a0Pu","year":"2021","unstructured":"Paul\u00a0Pu Liang, Yiwei Lyu, Xiang Fan, Zetian Wu, Yun Cheng, Jason Wu, Leslie Chen, Peter Wu, Michelle\u00a0A Lee, Yuke Zhu, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2021. MultiBench: Multiscale Benchmarks for Multimodal Representation Learning. NeurIPS Datasets and Benchmarks Track (2021)."},{"key":"e_1_3_3_3_55_1","volume-title":"Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430","author":"Liang Paul\u00a0Pu","year":"2022","unstructured":"Paul\u00a0Pu Liang, Amir Zadeh, and Louis-Philippe Morency. 2022. Foundations and recent trends in multimodal machine learning: Principles, challenges, and open questions. arXiv preprint arXiv:2209.03430 (2022)."},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1209"},{"key":"e_1_3_3_3_57_1","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems. 13\u201323","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In Proceedings of the 33rd International Conference on Neural Information Processing Systems. 13\u201323."},{"key":"e_1_3_3_3_58_1","volume-title":"DIME: Fine-grained Interpretations of Multimodal Models via Disentangled Local Explanations. arXiv preprint arXiv:2203.02013","author":"Lyu Yiwei","year":"2022","unstructured":"Yiwei Lyu, Paul\u00a0Pu Liang, Zihao Deng, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2022. DIME: Fine-grained Interpretations of Multimodal Models via Disentangled Local Explanations. arXiv preprint arXiv:2203.02013 (2022)."},{"key":"e_1_3_3_3_59_1","volume-title":"Evaluating the faithfulness of importance measures in nlp by recursively masking allegedly important tokens and retraining. arXiv preprint arXiv:2110.08412","author":"Madsen Andreas","year":"2021","unstructured":"Andreas Madsen, Nicholas Meade, Vaibhav Adlakha, and Siva Reddy. 2021. Evaluating the faithfulness of importance measures in nlp by recursively masking allegedly important tokens and retraining. arXiv preprint arXiv:2110.08412 (2021)."},{"key":"e_1_3_3_3_60_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rJgMlhRctm","author":"Mao Jiayuan","year":"2019","unstructured":"Jiayuan Mao, Chuang Gan, Pushmeet Kohli, Joshua\u00a0B. Tenenbaum, and Jiajun Wu. 2019. The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rJgMlhRctm"},{"key":"e_1_3_3_3_61_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-57321-8_2"},{"key":"e_1_3_3_3_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3382507.3418881"},{"key":"e_1_3_3_3_63_1","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1900654116"},{"key":"e_1_3_3_3_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2006.63"},{"key":"e_1_3_3_3_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/223904.223917"},{"key":"e_1_3_3_3_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2004.139"},{"key":"e_1_3_3_3_67_1","volume-title":"Multimodal interfaces. The human-computer interaction handbook","author":"Oviatt Sharon","year":"2007","unstructured":"Sharon Oviatt. 2007. Multimodal interfaces. The human-computer interaction handbook (2007), 439\u2013458."},{"key":"e_1_3_3_3_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/258549.258821"},{"key":"e_1_3_3_3_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/1054972.1055006"},{"key":"e_1_3_3_3_70_1","volume-title":"Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR). 32\u201344","author":"Parcalabescu Letitia","year":"2021","unstructured":"Letitia Parcalabescu, Albert Gatt, Anette Frank, and Iacer Calixto. 2021. Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks. In Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR). 32\u201344."},{"key":"e_1_3_3_3_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00915"},{"key":"e_1_3_3_3_72_1","doi-asserted-by":"publisher","DOI":"10.1145\/3536221.3556600"},{"key":"e_1_3_3_3_73_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-010-9068-y"},{"key":"e_1_3_3_3_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"key":"e_1_3_3_3_75_1","volume-title":"A review of affective computing: From unimodal analysis to multimodal fusion. Information Fusion","author":"Poria Soujanya","year":"2017","unstructured":"Soujanya Poria, Erik Cambria, Rajiv Bajpai, and Amir Hussain. 2017. A review of affective computing: From unimodal analysis to multimodal fusion. Information Fusion (2017)."},{"key":"e_1_3_3_3_76_1","volume-title":"International Conference on Machine Learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_77_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems 28 (2015)."},{"key":"e_1_3_3_3_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939778"},{"key":"e_1_3_3_3_79_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10822-020-00314-0"},{"key":"e_1_3_3_3_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415848"},{"key":"e_1_3_3_3_81_1","volume-title":"Do Input Gradients Highlight Discriminative Features?Advances in Neural Information Processing Systems 34","author":"Shah Harshay","year":"2021","unstructured":"Harshay Shah, Prateek Jain, and Praneeth Netrapalli. 2021. Do Input Gradients Highlight Discriminative Features?Advances in Neural Information Processing Systems 34 (2021)."},{"key":"e_1_3_3_3_82_1","volume-title":"Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034","author":"Simonyan Karen","year":"2013","unstructured":"Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2013. Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2013)."},{"key":"e_1_3_3_3_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/1390156.1390282"},{"key":"e_1_3_3_3_84_1","volume-title":"Rethinking the Role of Gradient-based Attribution Methods for Model Interpretability. In International Conference on Learning Representations.","author":"Srinivas Suraj","year":"2020","unstructured":"Suraj Srinivas and Francois Fleuret. 2020. Rethinking the Role of Gradient-based Attribution Methods for Model Interpretability. In International Conference on Learning Representations."},{"key":"e_1_3_3_3_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376782"},{"key":"e_1_3_3_3_86_1","volume-title":"Orko: Facilitating multimodal interaction for visual exploration and analysis of networks","author":"Srinivasan Arjun","year":"2017","unstructured":"Arjun Srinivasan and John Stasko. 2017. Orko: Facilitating multimodal interaction for visual exploration and analysis of networks. IEEE transactions on visualization and computer graphics 24, 1 (2017), 511\u2013521."},{"key":"e_1_3_3_3_87_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-2054"},{"key":"e_1_3_3_3_88_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. CoRR abs\/1908.07490","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. CoRR abs\/1908.07490 (2019). arXiv:1908.07490http:\/\/arxiv.org\/abs\/1908.07490"},{"key":"e_1_3_3_3_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"e_1_3_3_3_90_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1656"},{"key":"e_1_3_3_3_91_1","volume-title":"International Conference on Learning Representations.","author":"Tsang Michael","year":"2019","unstructured":"Michael Tsang, Dehua Cheng, Hanpeng Liu, Xue Feng, Eric Zhou, and Yan Liu. 2019. Feature Interaction Interpretability: A Case for Explaining Ad-Recommendation Systems via Neural Interaction Detection. In International Conference on Learning Representations."},{"key":"e_1_3_3_3_92_1","volume-title":"International Conference on Learning Representations.","author":"Tsang Michael","year":"2018","unstructured":"Michael Tsang, Dehua Cheng, and Yan Liu. 2018. Detecting Statistical Interactions from Neural Network Weights. In International Conference on Learning Representations."},{"key":"e_1_3_3_3_93_1","volume-title":"Probabilistic Neural Symbolic Models for Interpretable Visual Question Answering. In International Conference on Machine Learning. 6428\u20136437","author":"Vedantam Ramakrishna","year":"2019","unstructured":"Ramakrishna Vedantam, Karan Desai, Stefan Lee, Marcus Rohrbach, Dhruv Batra, and Devi Parikh. 2019. Probabilistic Neural Symbolic Models for Interpretable Visual Question Answering. In International Conference on Machine Learning. 6428\u20136437."},{"key":"e_1_3_3_3_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3114794"},{"key":"e_1_3_3_3_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/1240624.1240642"},{"key":"e_1_3_3_3_96_1","volume-title":"International Conference on Machine Learning. PMLR, 11205\u201311216","author":"Wong Eric","year":"2021","unstructured":"Eric Wong, Shibani Santurkar, and Aleksander Madry. 2021. Leveraging sparse linear layers for debuggable deep networks. In International Conference on Machine Learning. PMLR, 11205\u201311216."},{"key":"e_1_3_3_3_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472749.3474763"},{"key":"e_1_3_3_3_98_1","volume-title":"Machine Learning for Healthcare Conference. PMLR, 197\u2013215","author":"Xu Keyang","year":"2019","unstructured":"Keyang Xu, Mike Lam, Jingzhi Pang, Xin Gao, Charlotte Band, Piyush Mathur, Frank Papay, Ashish\u00a0K Khanna, Jacek\u00a0B Cywinski, Kamal Maheshwari, 2019. Multimodal machine learning for automated ICD coding. In Machine Learning for Healthcare Conference. PMLR, 197\u2013215."},{"key":"e_1_3_3_3_99_1","volume-title":"In ICML Workshop on Deep Learning. Citeseer.","author":"Yosinski Jason","year":"2015","unstructured":"Jason Yosinski, Jeff Clune, Thomas Fuchs, and Hod Lipson. 2015. Understanding neural networks through deep visualization. In In ICML Workshop on Deep Learning. Citeseer."},{"key":"e_1_3_3_3_100_1","doi-asserted-by":"crossref","unstructured":"AmirAli\u00a0Bagher Zadeh Paul\u00a0Pu Liang Soujanya Poria Erik Cambria and Louis-Philippe Morency. 2018. Multimodal language analysis in the wild: Cmu-mosei dataset and interpretable dynamic fusion graph. In ACL.","DOI":"10.18653\/v1\/P18-1208"}],"event":{"name":"CHI '23: CHI Conference on Human Factors in Computing Systems","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"Hamburg Germany","acronym":"CHI '23"},"container-title":["Extended Abstracts of the 2023 CHI Conference on Human Factors in Computing Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3544549.3585604","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3544549.3585604","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3544549.3585604","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:33Z","timestamp":1750178793000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3544549.3585604"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,19]]},"references-count":100,"alternative-id":["10.1145\/3544549.3585604","10.1145\/3544549"],"URL":"https:\/\/doi.org\/10.1145\/3544549.3585604","relation":{},"subject":[],"published":{"date-parts":[[2023,4,19]]},"assertion":[{"value":"2023-04-19","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}