{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,15]],"date-time":"2025-10-15T10:33:13Z","timestamp":1760524393960,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,7,26]],"date-time":"2022-07-26T00:00:00Z","timestamp":1658793600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Facebook PhD Fellowship"},{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1722822 and 1750439"],"award-info":[{"award-number":["1722822 and 1750439"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Carnegie Mellon University\u2019s Center for Machine Learning and Health Fellowship"},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["IIS1763562"],"award-info":[{"award-number":["IIS1763562"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"ONR Grant","award":["N000141812861"],"award-info":[{"award-number":["N000141812861"]}]},{"DOI":"10.13039\/100000002","name":"National Institutes of Health","doi-asserted-by":"publisher","award":["#R01MH125740 and #R01MH096951"],"award-info":[{"award-number":["#R01MH125740 and #R01MH096951"]}],"id":[{"id":"10.13039\/100000002","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,7,26]]},"DOI":"10.1145\/3514094.3534148","type":"proceedings-article","created":{"date-parts":[[2022,7,27]],"date-time":"2022-07-27T22:25:13Z","timestamp":1658960713000},"page":"455-467","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["DIME: Fine-grained Interpretations 
of Multimodal Models via Disentangled Local Explanations"],"prefix":"10.1145","author":[{"given":"Yiwei","family":"Lyu","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Paul Pu","family":"Liang","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Zihao","family":"Deng","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Ruslan","family":"Salakhutdinov","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"given":"Louis-Philippe","family":"Morency","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2022,7,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1203"},{"key":"e_1_3_2_1_2_1","volume-title":"Blindfold baselines for embodied QA. arXiv preprint arXiv:1811.05013","author":"Anand Ankesh","year":"2018","unstructured":"Ankesh Anand , Eugene Belilovsky , Kyle Kastner , Hugo Larochelle , and Aaron Courville . 2018. Blindfold baselines for embodied QA. arXiv preprint arXiv:1811.05013 ( 2018 ). Ankesh Anand, Eugene Belilovsky, Kyle Kastner, Hugo Larochelle, and Aaron Courville. 2018. Blindfold baselines for embodied QA. arXiv preprint arXiv:1811.05013 (2018)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.12"},{"key":"e_1_3_2_1_4_1","volume-title":"Multimodal machine learning: A survey and taxonomy","author":"Tadas Baltruvs","year":"2018","unstructured":"Tadas Baltruvs aitis, Chaitanya Ahuja , and Louis-Philippe Morency . 2018. Multimodal machine learning: A survey and taxonomy . IEEE transactions on pattern analysis and machine intelligence, Vol. 41 , 2 ( 2018 ), 423--443. Tadas Baltruvs aitis, Chaitanya Ahuja, and Louis-Philippe Morency. 2018. 
Multimodal machine learning: A survey and taxonomy. IEEE transactions on pattern analysis and machine intelligence, Vol. 41, 2 (2018), 423--443."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.354"},{"key":"e_1_3_2_1_6_1","first-page":"8","article-title":"Representation Learning","volume":"35","author":"Bengio Yoshua","year":"2013","unstructured":"Yoshua Bengio , Aaron Courville , and Pascal Vincent . 2013 . Representation Learning : A Review and New Perspectives. TPAMI , Vol. 35 , 8 (Aug. 2013). Yoshua Bengio, Aaron Courville, and Pascal Vincent. 2013. Representation Learning: A Review and New Perspectives. TPAMI, Vol. 35, 8 (Aug. 2013).","journal-title":"A Review and New Perspectives. TPAMI"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3351095.3375624"},{"key":"e_1_3_2_1_8_1","volume-title":"Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems","author":"Cadene Remi","year":"2019","unstructured":"Remi Cadene , Corentin Dancette , Matthieu Cord , Devi Parikh , 2019 . Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems , Vol. 32 (2019). Remi Cadene, Corentin Dancette, Matthieu Cord, Devi Parikh, et al. 2019. Rubi: Reducing unimodal biases for visual question answering. Advances in neural information processing systems, Vol. 32 (2019)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58539-6_34"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"crossref","unstructured":"Arjun Chandrasekaran Viraj Prabhu Deshraj Yadav Prithvijit Chattopadhyay and Devi Parikh. 2018. Do explanations make VQA models more predictable to a human?. In EMNLP .  Arjun Chandrasekaran Viraj Prabhu Deshraj Yadav Prithvijit Chattopadhyay and Devi Parikh. 2018. Do explanations make VQA models more predictable to a human?. 
In EMNLP .","DOI":"10.18653\/v1\/D18-1128"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3136755.3136801"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511299"},{"key":"e_1_3_2_1_13_1","volume-title":"Infogan: Interpretable representation learning by information maximizing generative adversarial nets. In NIPS.","author":"Chen Xi","year":"2016","unstructured":"Xi Chen , Yan Duan , Rein Houthooft , John Schulman , Ilya Sutskever , and Pieter Abbeel . 2016 . Infogan: Interpretable representation learning by information maximizing generative adversarial nets. In NIPS. Xi Chen, Yan Duan, Rein Houthooft, John Schulman, Ilya Sutskever, and Pieter Abbeel. 2016. Infogan: Interpretable representation learning by information maximizing generative adversarial nets. In NIPS."},{"key":"e_1_3_2_1_14_1","volume-title":"Discovering hidden factors of variation in deep networks. arXiv preprint arXiv:1412.6583","author":"Cheung Brian","year":"2014","unstructured":"Brian Cheung , Jesse A Livezey , Arjun K Bansal , and Bruno A Olshausen . 2014. Discovering hidden factors of variation in deep networks. arXiv preprint arXiv:1412.6583 ( 2014 ). Brian Cheung, Jesse A Livezey, Arjun K Bansal, and Bruno A Olshausen. 2014. Discovering hidden factors of variation in deep networks. arXiv preprint arXiv:1412.6583 (2014)."},{"volume-title":"Human machine interaction","author":"Dumas Bruno","key":"e_1_3_2_1_15_1","unstructured":"Bruno Dumas , Denis Lalanne , and Sharon Oviatt . 2009. Multimodal interfaces: A survey of principles, models and frameworks . In Human machine interaction . Springer , 3--26. Bruno Dumas, Denis Lalanne, and Sharon Oviatt. 2009. Multimodal interfaces: A survey of principles, models and frameworks. In Human machine interaction. 
Springer, 3--26."},{"key":"e_1_3_2_1_16_1","first-page":"1","article-title":"Visualizing higher-layer features of a deep network","volume":"1341","author":"Erhan Dumitru","year":"2009","unstructured":"Dumitru Erhan , Yoshua Bengio , Aaron Courville , and Pascal Vincent . 2009 . Visualizing higher-layer features of a deep network . University of Montreal , Vol. 1341 , 3 (2009), 1 . Dumitru Erhan, Yoshua Bengio, Aaron Courville, and Pascal Vincent. 2009. Visualizing higher-layer features of a deep network. University of Montreal, Vol. 1341, 3 (2009), 1.","journal-title":"University of Montreal"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.775"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSAA.2018.00018"},{"key":"e_1_3_2_1_19_1","volume-title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR).","author":"Khot Tejas","year":"2017","unstructured":"Tejas Khot , Douglas Summers-Stay , Dhruv Batra , and Devi Parikh . 2017 a. Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR). Tejas Khot, Douglas Summers-Stay, Dhruv Batra, and Devi Parikh. 2017a. Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"e_1_3_2_1_21_1","volume-title":"Towards transparent ai systems: Interpreting visual question answering models. arXiv preprint arXiv:1608.08974","author":"Goyal Yash","year":"2016","unstructured":"Yash Goyal , Akrit Mohapatra , Devi Parikh , and Dhruv Batra . 2016. Towards transparent ai systems: Interpreting visual question answering models. 
arXiv preprint arXiv:1608.08974 ( 2016 ). Yash Goyal, Akrit Mohapatra, Devi Parikh, and Dhruv Batra. 2016. Towards transparent ai systems: Interpreting visual question answering models. arXiv preprint arXiv:1608.08974 (2016)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01219-9_47"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Jack Hessel and Lillian Lee. 2020. Does my multimodal model learn cross-modal interactions? It's harder to tell than you might think!. In EMNLP.  Jack Hessel and Lillian Lee. 2020. Does my multimodal model learn cross-modal interactions? It's harder to tell than you might think!. In EMNLP.","DOI":"10.18653\/v1\/2020.emnlp-main.62"},{"key":"e_1_3_2_1_24_1","unstructured":"Irina Higgins Loic Matthey Arka Pal Christopher Burgess Xavier Glorot Matthew Botvinick Shakir Mohamed and Alexander Lerchner. 2016. \u03b2-vae: Learning basic visual concepts with a constrained variational framework. (2016).  Irina Higgins Loic Matthey Arka Pal Christopher Burgess Xavier Glorot Matthew Botvinick Shakir Mohamed and Alexander Lerchner. 2016. \u03b2-vae: Learning basic visual concepts with a constrained variational framework. (2016)."},{"key":"e_1_3_2_1_25_1","volume-title":"A Picture is Worth a Thousand Words: Multimodal Sensemaking of the Global Financial Crisis","author":"Hollerer Markus A.","year":"2018","unstructured":"Markus A. Hollerer , Dennis Jancsary , and Maria Grafstrom . 2018. A Picture is Worth a Thousand Words: Multimodal Sensemaking of the Global Financial Crisis . Organization Studies ( 2018 ). Markus A. Hollerer, Dennis Jancsary, and Maria Grafstrom. 2018. A Picture is Worth a Thousand Words: Multimodal Sensemaking of the Global Financial Crisis. Organization Studies (2018)."},{"volume-title":"European conference on computer vision. Springer, 727--739","author":"Jabri Allan","key":"e_1_3_2_1_26_1","unstructured":"Allan Jabri , Armand Joulin , and Laurens van der Maaten. 2016. 
Revisiting visual question answering baselines . In European conference on computer vision. Springer, 727--739 . Allan Jabri, Armand Joulin, and Laurens van der Maaten. 2016. Revisiting visual question answering baselines. In European conference on computer vision. Springer, 727--739."},{"key":"e_1_3_2_1_27_1","volume-title":"CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In CVPR.","author":"Johnson Justin","year":"2017","unstructured":"Justin Johnson , Bharath Hariharan , Laurens van der Maaten , Li Fei-Fei , C Lawrence Zitnick , and Ross Girshick . 2017 . CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In CVPR. Justin Johnson, Bharath Hariharan, Laurens van der Maaten, Li Fei-Fei, C Lawrence Zitnick, and Ross Girshick. 2017. CLEVR: A Diagnostic Dataset for Compositional Language and Elementary Visual Reasoning. In CVPR."},{"key":"e_1_3_2_1_28_1","volume-title":"MDETR--Modulated Detection for End-to-End Multi-Modal Understanding. arXiv preprint arXiv:2104.12763","author":"Kamath Aishwarya","year":"2021","unstructured":"Aishwarya Kamath , Mannat Singh , Yann LeCun , Ishan Misra , Gabriel Synnaeve , and Nicolas Carion . 2021. MDETR--Modulated Detection for End-to-End Multi-Modal Understanding. arXiv preprint arXiv:2104.12763 ( 2021 ). Aishwarya Kamath, Mannat Singh, Yann LeCun, Ishan Misra, Gabriel Synnaeve, and Nicolas Carion. 2021. MDETR--Modulated Detection for End-to-End Multi-Modal Understanding. arXiv preprint arXiv:2104.12763 (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00879"},{"key":"e_1_3_2_1_30_1","volume-title":"Bayesian representation learning with oracle constraints. arXiv preprint arXiv:1506.05011","author":"Karaletsos Theofanis","year":"2015","unstructured":"Theofanis Karaletsos , Serge Belongie , and Gunnar R\u00e4tsch . 2015. Bayesian representation learning with oracle constraints. arXiv preprint arXiv:1506.05011 ( 2015 ). 
Theofanis Karaletsos, Serge Belongie, and Gunnar R\u00e4tsch. 2015. Bayesian representation learning with oracle constraints. arXiv preprint arXiv:1506.05011 (2015)."},{"key":"e_1_3_2_1_31_1","unstructured":"Hyunjik Kim and Andriy Mnih. 2018. Disentangling by Factorising. In ICML.  Hyunjik Kim and Andriy Mnih. 2018. Disentangling by Factorising. In ICML."},{"key":"e_1_3_2_1_32_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma and Max Welling . 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 ( 2013 ). Diederik P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Elsa A Kirchner Stephen H Fairclough and Frank Kirchner. 2019. Embedded multimodal interfaces in robotics: applications future trends and societal implications. In The Handbook of Multimodal-Multisensor Interfaces: Language Processing Software Commercialization and Emerging Directions-Volume 3. 523--576.  Elsa A Kirchner Stephen H Fairclough and Frank Kirchner. 2019. Embedded multimodal interfaces in robotics: applications future trends and societal implications. In The Handbook of Multimodal-Multisensor Interfaces: Language Processing Software Commercialization and Emerging Directions-Volume 3. 523--576.","DOI":"10.1145\/3233795.3233810"},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. PMLR, 5338--5348","author":"Koh Pang Wei","year":"2020","unstructured":"Pang Wei Koh , Thao Nguyen , Yew Siang Tang , Stephen Mussmann , Emma Pierson , Been Kim , and Percy Liang . 2020 . Concept bottleneck models . In International Conference on Machine Learning. PMLR, 5338--5348 . Pang Wei Koh, Thao Nguyen, Yew Siang Tang, Stephen Mussmann, Emma Pierson, Been Kim, and Percy Liang. 2020. Concept bottleneck models. 
In International Conference on Machine Learning. PMLR, 5338--5348."},{"key":"e_1_3_2_1_35_1","unstructured":"Klaus Krippendorff. 2011. Computing Krippendorff's alpha-reliability. (2011).  Klaus Krippendorff. 2011. Computing Krippendorff's alpha-reliability. (2011)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TRO.2019.2959445"},{"key":"e_1_3_2_1_37_1","volume-title":"VisualBERT: A Simple and Performant Baseline for Vision and Language. CoRR","author":"Li Liunian Harold","year":"2019","unstructured":"Liunian Harold Li , Mark Yatskar , Da Yin , Cho-Jui Hsieh , and Kai-Wei Chang . 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. CoRR , Vol. abs\/ 1908 .03557 ( 2019 ). [arXiv]1908.03557 http:\/\/arxiv.org\/abs\/1908.03557 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, and Kai-Wei Chang. 2019. VisualBERT: A Simple and Performant Baseline for Vision and Language. CoRR, Vol. abs\/1908.03557 (2019). [arXiv]1908.03557 http:\/\/arxiv.org\/abs\/1908.03557"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.469"},{"key":"e_1_3_2_1_39_1","volume-title":"AmirAli Bagher Zadeh, and Louis-Philippe Morency","author":"Liang Paul Pu","year":"2018","unstructured":"Paul Pu Liang , Ziyin Liu , AmirAli Bagher Zadeh, and Louis-Philippe Morency . 2018 a. Multimodal Language Analysis with Recurrent Multistage Fusion. In EMNLP. Paul Pu Liang, Ziyin Liu, AmirAli Bagher Zadeh, and Louis-Philippe Morency. 2018a. Multimodal Language Analysis with Recurrent Multistage Fusion. In EMNLP."},{"key":"e_1_3_2_1_40_1","volume-title":"MultiBench: Multiscale Benchmarks for Multimodal Representation Learning. NeurIPS Datasets and Benchmarks Track","author":"Liang Paul Pu","year":"2021","unstructured":"Paul Pu Liang , Yiwei Lyu , Xiang Fan , Zetian Wu , Yun Cheng , Jason Wu , Leslie Chen , Peter Wu , Michelle A Lee , Yuke Zhu , Ruslan Salakhutdinov , and Louis-Philippe Morency . 2021. 
MultiBench: Multiscale Benchmarks for Multimodal Representation Learning. NeurIPS Datasets and Benchmarks Track ( 2021 ). Paul Pu Liang, Yiwei Lyu, Xiang Fan, Zetian Wu, Yun Cheng, Jason Wu, Leslie Chen, Peter Wu, Michelle A Lee, Yuke Zhu, Ruslan Salakhutdinov, and Louis-Philippe Morency. 2021. MultiBench: Multiscale Benchmarks for Multimodal Representation Learning. NeurIPS Datasets and Benchmarks Track (2021)."},{"key":"e_1_3_2_1_41_1","unstructured":"Paul Pu Liang Ruslan Salakhutdinov and Louis-Philippe Morency. 2018b. Computational modeling of human multimodal language: The mosei dataset and interpretable dynamic fusion.  Paul Pu Liang Ruslan Salakhutdinov and Louis-Philippe Morency. 2018b. Computational modeling of human multimodal language: The mosei dataset and interpretable dynamic fusion."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Paul Pu Liang Amir Zadeh and Louis-Philippe Morency. 2018c. Multimodal Local-Global Ranking Fusion for Emotion Recognition. In ICMI.  Paul Pu Liang Amir Zadeh and Louis-Philippe Morency. 2018c. Multimodal Local-Global Ranking Fusion for Emotion Recognition. In ICMI.","DOI":"10.1145\/3242969.3243019"},{"key":"e_1_3_2_1_43_1","unstructured":"Francesco Locatello Stefan Bauer Mario Lucic Gunnar Raetsch Sylvain Gelly Bernhard Sch\u00f6lkopf and Olivier Bachem. 2019. Challenging common assumptions in the unsupervised learning of disentangled representations. (2019) 4114--4124.  Francesco Locatello Stefan Bauer Mario Lucic Gunnar Raetsch Sylvain Gelly Bernhard Sch\u00f6lkopf and Olivier Bachem. 2019. Challenging common assumptions in the unsupervised learning of disentangled representations. (2019) 4114--4124."},{"key":"e_1_3_2_1_44_1","volume-title":"Garnett (Eds.)","volume":"30","author":"Lundberg Scott M","year":"2017","unstructured":"Scott M Lundberg and Su-In Lee . 2017 . A Unified Approach to Interpreting Model Predictions. In Advances in Neural Information Processing Systems, I. Guyon, U. 
Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R . Garnett (Eds.) , Vol. 30 . Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/ 2017\/file\/8a20a8621978632d76c43dfd28b67767-Paper.pdf Scott M Lundberg and Su-In Lee. 2017. A Unified Approach to Interpreting Model Predictions. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/8a20a8621978632d76c43dfd28b67767-Paper.pdf"},{"key":"e_1_3_2_1_45_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rJgMlhRctm","author":"Mao Jiayuan","year":"2019","unstructured":"Jiayuan Mao , Chuang Gan , Pushmeet Kohli , Joshua B. Tenenbaum , and Jiajun Wu . 2019 . The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision . In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rJgMlhRctm Jiayuan Mao, Chuang Gan, Pushmeet Kohli, Joshua B. Tenenbaum, and Jiajun Wu. 2019. The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=rJgMlhRctm"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2006.63"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2004.139"},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR). 32--44","author":"Parcalabescu Letitia","year":"2021","unstructured":"Letitia Parcalabescu , Albert Gatt , Anette Frank , and Iacer Calixto . 2021 . Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks . In Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR). 
32--44 . Letitia Parcalabescu, Albert Gatt, Anette Frank, and Iacer Calixto. 2021. Seeing past words: Testing the cross-modal capabilities of pretrained V&L models on counting tasks. In Proceedings of the 1st Workshop on Multimodal Semantic Representations (MMSR). 32--44."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00915"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10772-010-9068-y"},{"key":"e_1_3_2_1_51_1","volume-title":"A review of affective computing: From unimodal analysis to multimodal fusion. Information Fusion","author":"Poria Soujanya","year":"2017","unstructured":"Soujanya Poria , Erik Cambria , Rajiv Bajpai , and Amir Hussain . 2017. A review of affective computing: From unimodal analysis to multimodal fusion. Information Fusion ( 2017 ). Soujanya Poria, Erik Cambria, Rajiv Bajpai, and Amir Hussain. 2017. A review of affective computing: From unimodal analysis to multimodal fusion. Information Fusion (2017)."},{"key":"e_1_3_2_1_52_1","unstructured":"Scott Reed Kihyuk Sohn Yuting Zhang and Honglak Lee. 2014. Learning to disentangle factors of variation with manifold interaction. In ICML.  Scott Reed Kihyuk Sohn Yuting Zhang and Honglak Lee. 2014. Learning to disentangle factors of variation with manifold interaction. In ICML."},{"key":"e_1_3_2_1_53_1","volume-title":"Explaining the Predictions of Any Classifier. CoRR abs\/1602.04938","author":"Ribeiro Marco T\u00falio","year":"2016","unstructured":"Marco T\u00falio Ribeiro , Sameer Singh , and Carlos Guestrin . 2016. \" Why Should I Trust You?\" : Explaining the Predictions of Any Classifier. CoRR abs\/1602.04938 ( 2016 ). arXiv:1602.04938 http:\/\/arxiv.org\/abs\/1602.0493 Marco T\u00falio Ribeiro, Sameer Singh, and Carlos Guestrin. 2016. \"Why Should I Trust You?\": Explaining the Predictions of Any Classifier. CoRR abs\/1602.04938 (2016). 
arXiv:1602.04938 http:\/\/arxiv.org\/abs\/1602.0493"},{"key":"e_1_3_2_1_54_1","volume-title":"On the Latent Space of Wasserstein Auto-Encoders. arXiv preprint arXiv:1802.03761","author":"Rubenstein Paul K","year":"2018","unstructured":"Paul K Rubenstein , Bernhard Schoelkopf , and Ilya Tolstikhin . 2018. On the Latent Space of Wasserstein Auto-Encoders. arXiv preprint arXiv:1802.03761 ( 2018 ). Paul K Rubenstein, Bernhard Schoelkopf, and Ilya Tolstikhin. 2018. On the Latent Space of Wasserstein Auto-Encoders. arXiv preprint arXiv:1802.03761 (2018)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1038\/s42256-019-0048-x"},{"key":"e_1_3_2_1_56_1","volume-title":"Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034","author":"Simonyan Karen","year":"2013","unstructured":"Karen Simonyan , Andrea Vedaldi , and Andrew Zisserman . 2013. Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 ( 2013 ). Karen Simonyan, Andrea Vedaldi, and Andrew Zisserman. 2013. Deep inside convolutional networks: Visualising image classification models and saliency maps. arXiv preprint arXiv:1312.6034 (2013)."},{"key":"e_1_3_2_1_57_1","volume-title":"Smoothgrad: removing noise by adding noise. arXiv preprint arXiv:1706.03825","author":"Smilkov Daniel","year":"2017","unstructured":"Daniel Smilkov , Nikhil Thorat , Been Kim , Fernanda Vi\u00e9gas , and Martin Wattenberg . 2017. Smoothgrad: removing noise by adding noise. arXiv preprint arXiv:1706.03825 ( 2017 ). Daniel Smilkov, Nikhil Thorat, Been Kim, Fernanda Vi\u00e9gas, and Martin Wattenberg. 2017. Smoothgrad: removing noise by adding noise. arXiv preprint arXiv:1706.03825 (2017)."},{"key":"e_1_3_2_1_58_1","volume-title":"LXMERT: Learning Cross-Modality Encoder Representations from Transformers. CoRR","author":"Tan Hao","year":"2019","unstructured":"Hao Tan and Mohit Bansal . 
2019 . LXMERT: Learning Cross-Modality Encoder Representations from Transformers. CoRR , Vol. abs\/ 1908 .07490 (2019). [arXiv]1908.07490 http:\/\/arxiv.org\/abs\/1908.07490 Hao Tan and Mohit Bansal. 2019. LXMERT: Learning Cross-Modality Encoder Representations from Transformers. CoRR, Vol. abs\/1908.07490 (2019). [arXiv]1908.07490 http:\/\/arxiv.org\/abs\/1908.07490"},{"key":"e_1_3_2_1_59_1","volume-title":"Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov.","author":"Hubert Tsai Yao-Hung","year":"2019","unstructured":"Yao-Hung Hubert Tsai , Paul Pu Liang , Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019 . Learning factorized multimodal representations. ICLR ( 2019). Yao-Hung Hubert Tsai, Paul Pu Liang, Amir Zadeh, Louis-Philippe Morency, and Ruslan Salakhutdinov. 2019. Learning factorized multimodal representations. ICLR (2019)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.143"},{"key":"e_1_3_2_1_61_1","volume-title":"International Conference on Machine Learning. PMLR, 6428--6437","author":"Vedantam Ramakrishna","year":"2019","unstructured":"Ramakrishna Vedantam , Karan Desai , Stefan Lee , Marcus Rohrbach , Dhruv Batra , and Devi Parikh . 2019 . Probabilistic neural symbolic models for interpretable visual question answering . In International Conference on Machine Learning. PMLR, 6428--6437 . Ramakrishna Vedantam, Karan Desai, Stefan Lee, Marcus Rohrbach, Dhruv Batra, and Devi Parikh. 2019. Probabilistic neural symbolic models for interpretable visual question answering. In International Conference on Machine Learning. PMLR, 6428--6437."},{"key":"e_1_3_2_1_62_1","volume-title":"NBDT: Neural-Backed Decision Tree. In International Conference on Learning Representations.","author":"Wan Alvin","year":"2020","unstructured":"Alvin Wan , Lisa Dunlap , Daniel Ho , Jihan Yin , Scott Lee , Suzanne Petryk , Sarah Adel Bargal , and Joseph E Gonzalez . 2020 . NBDT: Neural-Backed Decision Tree. 
In International Conference on Learning Representations. Alvin Wan, Lisa Dunlap, Daniel Ho, Jihan Yin, Scott Lee, Suzanne Petryk, Sarah Adel Bargal, and Joseph E Gonzalez. 2020. NBDT: Neural-Backed Decision Tree. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3114794"},{"key":"e_1_3_2_1_64_1","volume-title":"International Conference on Machine Learning. PMLR, 11205--11216","author":"Wong Eric","year":"2021","unstructured":"Eric Wong , Shibani Santurkar , and Aleksander Madry . 2021 . Leveraging sparse linear layers for debuggable deep networks . In International Conference on Machine Learning. PMLR, 11205--11216 . Eric Wong, Shibani Santurkar, and Aleksander Madry. 2021. Leveraging sparse linear layers for debuggable deep networks. In International Conference on Machine Learning. PMLR, 11205--11216."},{"key":"e_1_3_2_1_65_1","volume-title":"Machine Learning for Healthcare Conference. PMLR, 197--215","author":"Xu Keyang","year":"2019","unstructured":"Keyang Xu , Mike Lam , Jingzhi Pang , Xin Gao , Charlotte Band , Piyush Mathur , Frank Papay , Ashish K Khanna , Jacek B Cywinski , Kamal Maheshwari , 2019 . Multimodal machine learning for automated ICD coding . In Machine Learning for Healthcare Conference. PMLR, 197--215 . Keyang Xu, Mike Lam, Jingzhi Pang, Xin Gao, Charlotte Band, Piyush Mathur, Frank Papay, Ashish K Khanna, Jacek B Cywinski, Kamal Maheshwari, et almbox. 2019. Multimodal machine learning for automated ICD coding. In Machine Learning for Healthcare Conference. PMLR, 197--215."},{"key":"e_1_3_2_1_66_1","volume-title":"In ICML Workshop on Deep Learning. Citeseer.","author":"Yosinski Jason","year":"2015","unstructured":"Jason Yosinski , Jeff Clune , Thomas Fuchs , and Hod Lipson . 2015 . Understanding neural networks through deep visualization . In In ICML Workshop on Deep Learning. Citeseer. Jason Yosinski, Jeff Clune, Thomas Fuchs, and Hod Lipson. 2015. 
Understanding neural networks through deep visualization. In In ICML Workshop on Deep Learning. Citeseer."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1208"}],"event":{"name":"AIES '22: AAAI\/ACM Conference on AI, Ethics, and Society","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","AAAI"],"location":"Oxford United Kingdom","acronym":"AIES '22"},"container-title":["Proceedings of the 2022 AAAI\/ACM Conference on AI, Ethics, and Society"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3514094.3534148","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3514094.3534148","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3514094.3534148","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:36Z","timestamp":1750186956000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3514094.3534148"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,7,26]]},"references-count":67,"alternative-id":["10.1145\/3514094.3534148","10.1145\/3514094"],"URL":"https:\/\/doi.org\/10.1145\/3514094.3534148","relation":{},"subject":[],"published":{"date-parts":[[2022,7,26]]},"assertion":[{"value":"2022-07-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}