{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T17:07:19Z","timestamp":1773508039100,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":80,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,23]]},"DOI":"10.1145\/3742413.3789145","type":"proceedings-article","created":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T11:32:24Z","timestamp":1772537544000},"page":"1867-1882","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Semantic See-through Goggles: Wearing Linguistic Virtual Reality in (Artificial) Intelligence"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4747-7984","authenticated-orcid":false,"given":"Goki","family":"Muramoto","sequence":"first","affiliation":[{"name":"RCAST, The University of Tokyo, Shibuya, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-1446-9657","authenticated-orcid":false,"given":"Yuri","family":"Yasui","sequence":"additional","affiliation":[{"name":"The University of British Columbia, Vancouver, British Columbia, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-7197-9248","authenticated-orcid":false,"given":"Hirosuke","family":"Asahi","sequence":"additional","affiliation":[{"name":"RCAST, The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3125-1852","authenticated-orcid":false,"given":"Masaharu","family":"Hirose","sequence":"additional","affiliation":[{"name":"RCAST, The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8652-0730","authenticated-orcid":false,"given":"Masahiko","family":"Inami","sequence":"additional","affiliation":[{"name":"RCAST, The University of Tokyo, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_3_2_3_2","unstructured":"Phyllis Ang Bhuwan Dhingra and Lisa\u00a0Wu Wills. 2022. Characterizing the efficiency vs. accuracy trade-off for long-context NLP models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.07288 (2022)."},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"crossref","unstructured":"Ronald\u00a0T Azuma. 1997. A survey of augmented reality. Presence: teleoperators & virtual environments 6 4 (1997) 355\u2013385.","DOI":"10.1162\/pres.1997.6.4.355"},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"crossref","unstructured":"Michael Bajura Henry Fuchs and Ryutarou Ohbuchi. 1992. Merging virtual objects with the real world: Seeing ultrasound imagery within the patient. ACM SIGGRAPH Computer Graphics 26 2 (1992) 203\u2013210.","DOI":"10.1145\/142920.134061"},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"crossref","unstructured":"Solon Barocas and Andrew\u00a0D Selbst. 2016. Big data\u2019s disparate impact. Calif. L. Rev. 104 (2016) 671.","DOI":"10.2139\/ssrn.2477899"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"crossref","unstructured":"Lawrence\u00a0W Barsalou. 1999. Perceptual symbol systems. Behavioral and brain sciences 22 4 (1999) 577\u2013660.","DOI":"10.1017\/S0140525X99002149"},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Raffaella Bernardi Ruket Cakici Desmond Elliott Aykut Erdem Erkut Erdem Nazli Ikizler-Cinbis Frank Keller Adrian Muscat and Barbara Plank. 2016. Automatic description generation from images: A survey of models datasets and evaluation measures. Journal of Artificial Intelligence Research 55 (2016) 409\u2013442.","DOI":"10.1613\/jair.4900"},{"key":"e_1_3_3_2_9_2","unstructured":"Shruti Bhargava and David Forsyth. 2019. Exposing and correcting the gender bias in image captioning datasets and models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1912.00578 (2019)."},{"key":"e_1_3_3_2_10_2","unstructured":"Abeba Birhane and Fred Cummins. 2019. Algorithmic injustices: Towards a relational ethics. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1912.07376 (2019)."},{"key":"e_1_3_3_2_11_2","unstructured":"Tolga Bolukbasi Kai-Wei Chang James\u00a0Y Zou Venkatesh Saligrama and Adam\u00a0T Kalai. 2016. Man is to computer programmer as woman is to homemaker? debiasing word embeddings. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_3_2_12_2","unstructured":"Pierre Bourdieu. 1991. Language and symbolic power. Polity (1991)."},{"key":"e_1_3_3_2_13_2","first-page":"77","volume-title":"Conference on fairness, accountability and transparency","author":"Buolamwini Joy","year":"2018","unstructured":"Joy Buolamwini and Timnit Gebru. 2018. Gender shades: Intersectional accuracy disparities in commercial gender classification. In Conference on fairness, accountability and transparency. PMLR, 77\u201391."},{"key":"e_1_3_3_2_14_2","doi-asserted-by":"crossref","unstructured":"Aylin Caliskan Joanna\u00a0J Bryson and Arvind Narayanan. 2017. Semantics derived automatically from language corpora contain human-like biases. Science 356 6334 (2017) 183\u2013186.","DOI":"10.1126\/science.aal4230"},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-2029"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.4324\/9780203771587"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"crossref","unstructured":"Emily\u00a0A Cooper. 2023. The perceptual science of augmented reality. Annual Review of Vision Science 9 1 (2023) 455\u2013478.","DOI":"10.1146\/annurev-vision-111022-123758"},{"key":"e_1_3_3_2_18_2","volume-title":"The atlas of AI: Power, politics, and the planetary costs of artificial intelligence","author":"Crawford Kate","year":"2021","unstructured":"Kate Crawford. 2021. The atlas of AI: Power, politics, and the planetary costs of artificial intelligence. Yale University Press."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1145\/634067.634255"},{"key":"e_1_3_3_2_20_2","unstructured":"Finale Doshi-Velez and Been Kim. 2017. Towards a rigorous science of interpretable machine learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1702.08608 (2017)."},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"crossref","unstructured":"Julia Dressel and Hany Farid. 2018. The accuracy fairness and limits of predicting recidivism. Science advances 4 1 (2018) eaao5580.","DOI":"10.1126\/sciadv.aao5580"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Luciano Floridi and Massimo Chiriatti. 2020. GPT-3: Its nature scope limits and consequences. Minds and machines 30 4 (2020) 681\u2013694.","DOI":"10.1007\/s11023-020-09548-1"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"Lianli Gao Zhao Guo Hanwang Zhang Xing Xu and Heng\u00a0Tao Shen. 2017. Video Captioning With Attention-Based LSTM and Semantic Consistency. IEEE Transactions on Multimedia 19 9 (2017) 2045\u20132055. 10.1109\/TMM.2017.2729019","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","unstructured":"Lianli Gao Zhao Guo Hanwang Zhang Xing Xu and Heng\u00a0Tao Shen. 2017. Video Captioning With Attention-Based LSTM and Semantic Consistency. IEEE Transactions on Multimedia 19 (2017) 2045\u20132055. 10.1109\/TMM.2017.2729019","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_3_2_25_2","unstructured":"Kasturi Ghosh and D Ghosh. 2025. Who am AI? Deconstructing Identity in the Age of Advanced Artificial Intelligence."},{"key":"e_1_3_3_2_26_2","unstructured":"Hila Gonen and Yoav Goldberg. 2019. Lipstick on a pig: Debiasing methods cover up systematic gender biases in word embeddings but do not remove them. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1903.03862 (2019)."},{"key":"e_1_3_3_2_27_2","unstructured":"Ian\u00a0J Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3025453.3025683"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.1109\/VR.2018.8446319"},{"key":"e_1_3_3_2_30_2","unstructured":"Melissa Hall Laurens van\u00a0der Maaten Laura Gustafson Maxwell Jones and Aaron Adcock. 2022. A systematic study of bias amplification. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2201.11706 (2022)."},{"key":"e_1_3_3_2_31_2","doi-asserted-by":"crossref","unstructured":"Stevan Harnad. 1990. The symbol grounding problem. Physica D: Nonlinear Phenomena 42 1-3 (1990) 335\u2013346.","DOI":"10.1016\/0167-2789(90)90087-6"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"crossref","unstructured":"Zellig\u00a0S Harris. 1954. Distributional structure. Word 10 2-3 (1954) 146\u2013162.","DOI":"10.1080\/00437956.1954.11659520"},{"key":"e_1_3_3_2_33_2","doi-asserted-by":"publisher","unstructured":"Jonathan Ho William Chan Chitwan Saharia Jay Whang Ruiqi Gao A. Gritsenko Diederik\u00a0P. Kingma Ben Poole Mohammad Norouzi David\u00a0J. Fleet and Tim Salimans. 2022. Imagen Video: High Definition Video Generation with Diffusion Models. ArXiv abs\/2210.02303 (2022). 10.48550\/arXiv.2210.02303","DOI":"10.48550\/arXiv.2210.02303"},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","unstructured":"Xiao Hu Haobo Wang Anirudh Vegesana Somesh Dube Kaiwen Yu Gore Kao Shuo-Han Chen Yung-Hsiang Lu G. Thiruvathukal and Ming Yin. 2020. Crowdsourcing Detection of Sampling Biases in Image Datasets. Proceedings of The Web Conference 2020 (2020). 10.1145\/3366423.3380063","DOI":"10.1145\/3366423.3380063"},{"key":"e_1_3_3_2_35_2","doi-asserted-by":"crossref","unstructured":"Shaohan Huang Li Dong Wenhui Wang Yaru Hao Saksham Singhal Shuming Ma Tengchao Lv Lei Cui Owais\u00a0Khan Mohammed Barun Patra et\u00a0al. 2023. Language is not all you need: Aligning perception with language models. Advances in Neural Information Processing Systems 36 (2023) 72096\u201372109.","DOI":"10.52202\/075280-3155"},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"publisher","unstructured":"Jungo Kasai Keisuke Sakaguchi Lavinia Dunagan Jacob\u00a0Daniel Morrison Ronan\u00a0Le Bras Yejin Choi and Noah\u00a0A. Smith. 2021. Transparent Human Evaluation for Image Captioning. ArXiv abs\/2111.08940 (2021). 10.18653\/v1\/2022.naacl-main.254","DOI":"10.18653\/v1\/2022.naacl-main.254"},{"key":"e_1_3_3_2_37_2","unstructured":"Diederik\u00a0P Kingma and Max Welling. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1312.6114 (2013)."},{"key":"e_1_3_3_2_38_2","volume-title":"Alfons Schilling: Ich\/Auge\/Welt-The Art of Vision","author":"Klocker Hubert","year":"1997","unstructured":"Hubert Klocker, Carl Aigner, and Peter Weibel. 1997. Alfons Schilling: Ich\/Auge\/Welt-The Art of Vision. Springer."},{"key":"e_1_3_3_2_39_2","first-page":"957","volume-title":"International conference on machine learning","author":"Kusner Matt","year":"2015","unstructured":"Matt Kusner, Yu Sun, Nicholas Kolkin, and Kilian Weinberger. 2015. From word embeddings to document distances. In International conference on machine learning. PMLR, 957\u2013966."},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"S Lee John\u00a0H. Xin and Stephen Westland. 2005. Evaluation of Image Similarity by Histogram Intersection. Color Research and Application 30 (2005) 265\u2013274. https:\/\/api.semanticscholar.org\/CorpusID:54697123","DOI":"10.1002\/col.20122"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.4324\/9781315458496"},{"key":"e_1_3_3_2_42_2","first-page":"12888","volume-title":"International conference on machine learning","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In International conference on machine learning. PMLR, 12888\u201312900."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Jun Li Chenyang Zhang Wei Zhu and Yawei Ren. 2024. A Comprehensive Survey of Image Generation Models Based on Deep Learning. Annals of Data Science (2024) 1\u201330.","DOI":"10.1007\/s40745-024-00544-1"},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","unstructured":"Zhiheng Li and Chenliang Xu. 2021. Discover the Unknown Biased Attribute of an Image Classifier. 2021 IEEE\/CVF International Conference on Computer Vision (ICCV) (2021) 14950\u201314959. 10.1109\/ICCV48922.2021.01470","DOI":"10.1109\/ICCV48922.2021.01470"},{"key":"e_1_3_3_2_45_2","unstructured":"Simian Luo Yiqin Tan Longbo Huang Jian Li and Hang Zhao. 2023. Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference. arxiv:https:\/\/arXiv.org\/abs\/2310.04378\u00a0[cs.CV]"},{"key":"e_1_3_3_2_46_2","doi-asserted-by":"publisher","unstructured":"Burak Makav and V. K\u0131l\u0131\u00e7. 2019. Smartphone-based Image Captioning for Visually and Hearing Impaired. 2019 11th International Conference on Electrical and Electronics Engineering (ELECO) (2019) 950\u2013953. 10.23919\/ELECO47770.2019.8990395","DOI":"10.23919\/ELECO47770.2019.8990395"},{"key":"e_1_3_3_2_47_2","doi-asserted-by":"publisher","DOI":"10.1145\/1026653.1026654"},{"key":"e_1_3_3_2_48_2","unstructured":"Steve Mann Tom Furness Yu Yuan Jay Iorio and Zixin Wang. 2018. All reality: Virtual augmented mixed (x) mediated (x y) and multimediated reality. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1804.08386 (2018)."},{"key":"e_1_3_3_2_49_2","doi-asserted-by":"crossref","unstructured":"Ludovica Marinucci Claudia Mazzuca and Aldo Gangemi. 2023. Exposing implicit biases and stereotypes in human and artificial intelligence: state of the art and challenges with a focus on gender. AI & SOCIETY 38 2 (2023) 747\u2013761.","DOI":"10.1007\/s00146-022-01474-3"},{"key":"e_1_3_3_2_50_2","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/9780262514620.001.0001"},{"key":"e_1_3_3_2_51_2","unstructured":"Paul Milgram and Fumio Kishino. 1994. A taxonomy of mixed reality visual displays. IEICE TRANSACTIONS on Information and Systems 77 12 (1994) 1321\u20131329."},{"key":"e_1_3_3_2_52_2","unstructured":"Edwin\u00a0G Ng Bo Pang Piyush Sharma and Radu Soricut. 2020. Understanding guided image captioning performance across domains. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2012.02339 (2020)."},{"key":"e_1_3_3_2_53_2","doi-asserted-by":"crossref","unstructured":"A\u00a0Michael Noll. 1994. The beginnings of computer art in the United States: A memoir. Leonardo (1994) 39\u201344.","DOI":"10.2307\/1575947"},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"crossref","unstructured":"Eirini Ntoutsi Pavlos Fafalios Ujwal Gadiraju Vasileios Iosifidis Wolfgang Nejdl Maria-Esther Vidal Salvatore Ruggieri Franco Turini Symeon Papadopoulos Emmanouil Krasanakis et\u00a0al. 2020. Bias in data-driven artificial intelligence systems\u2014An introductory survey. Wiley Interdisciplinary Reviews: Data Mining and Knowledge Discovery 10 3 (2020) e1356.","DOI":"10.1002\/widm.1356"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01372"},{"key":"e_1_3_3_2_56_2","doi-asserted-by":"crossref","unstructured":"Ville Paananen Jonas Oppenlaender and Aku Visuri. 2023. Using text-to-image generation for architectural design ideation. International Journal of Architectural Computing (2023) 14780771231222783.","DOI":"10.1177\/14780771231222783"},{"key":"e_1_3_3_2_57_2","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_2_58_2","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_2_59_2","first-page":"8821","volume-title":"International conference on machine learning","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International conference on machine learning. Pmlr, 8821\u20138831."},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"crossref","unstructured":"N Reimers. 2019. Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1908.10084 (2019).","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_3_2_61_2","doi-asserted-by":"crossref","unstructured":"Katja Rogers Sukran Karaosmanoglu Dennis Wolf Frank Steinicke and Lennart\u00a0E Nacke. 2021. A best-fit framework and systematic review of asymmetric gameplay in multiplayer virtual reality games. Frontiers in Virtual Reality 2 (2021) 694660.","DOI":"10.3389\/frvir.2021.694660"},{"key":"e_1_3_3_2_62_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_2_63_2","doi-asserted-by":"crossref","unstructured":"Olga Russakovsky Jia Deng Hao Su Jonathan Krause Sanjeev Satheesh Sean Ma Zhiheng Huang Andrej Karpathy Aditya Khosla Michael Bernstein et\u00a0al. 2015. Imagenet large scale visual recognition challenge. International journal of computer vision 115 (2015) 211\u2013252.","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_3_2_64_2","doi-asserted-by":"crossref","unstructured":"Pierre Sachse Ursula Beermann Markus Martini Thomas Maran Markus Domeier and Marco\u00a0R Furtner. 2017. \u201cThe world is upside down\u201d\u2013The Innsbruck goggle experiments of Theodor Erismann (1883\u20131961) and Ivo Kohler (1915\u20131985). Cortex 92 (2017) 222\u2013232.","DOI":"10.1016\/j.cortex.2017.04.014"},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"crossref","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily\u00a0L Denton Kamyar Ghasemipour Raphael Gontijo\u00a0Lopes Burcu Karagol\u00a0Ayan Tim Salimans et\u00a0al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems 35 (2022) 36479\u201336494.","DOI":"10.52202\/068431-2643"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"crossref","unstructured":"Himanshu Sharma and Devanand Padha. 2023. A comprehensive survey on image captioning: from handcrafted to deep learning-based techniques a taxonomy and open research issues. Artificial Intelligence Review 56 11 (2023) 13619\u201313661.","DOI":"10.1007\/s10462-023-10488-2"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"publisher","DOI":"10.1145\/3442188.3445932"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"crossref","unstructured":"George\u00a0M Stratton. 1897. Vision without inversion of the retinal image.Psychological review 4 4 (1897) 341.","DOI":"10.1037\/h0075482"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/1476589.1476686"},{"key":"e_1_3_3_2_70_2","first-page":"506","volume-title":"Proceedings of the IFIP Congress","volume":"2","author":"Sutherland Ivan\u00a0E","year":"1965","unstructured":"Ivan\u00a0E Sutherland et\u00a0al. 1965. The ultimate display. In Proceedings of the IFIP Congress , Vol.\u00a02. New York, 506\u2013508."},{"key":"e_1_3_3_2_71_2","doi-asserted-by":"crossref","unstructured":"Yunlong Tang Junjia Guo Pinxin Liu Zhiyuan Wang Hang Hua Jia-Xing Zhong Yunzhong Xiao Chao Huang Luchuan Song Susan Liang et\u00a0al. 2025. Generative ai for cel-animation: A survey. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.06250 (2025).","DOI":"10.1109\/ICCVW69036.2025.00400"},{"key":"e_1_3_3_2_72_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995347"},{"key":"e_1_3_3_2_73_2","doi-asserted-by":"publisher","unstructured":"Alasdair Tran A. Mathews and Lexing Xie. 2020. Transform and Tell: Entity-Aware News Image Captioning. 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020) 13032\u201313042. 10.1109\/CVPR42600.2020.01305","DOI":"10.1109\/CVPR42600.2020.01305"},{"key":"e_1_3_3_2_74_2","unstructured":"Jesse Vig. 2019. Visualizing Attention in Transformer-Based Language Representation Models. ArXiv abs\/1904.02679 (2019)."},{"key":"e_1_3_3_2_75_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_3_2_76_2","unstructured":"Sanchayan Vivekananthan. 2024. Comparative analysis of generative models: Enhancing image synthesis with vaes gans and stable diffusion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.08751 (2024)."},{"key":"e_1_3_3_2_77_2","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415847"},{"key":"e_1_3_3_2_78_2","doi-asserted-by":"crossref","unstructured":"Xingjiao Wu Luwei Xiao Yixuan Sun Junhang Zhang Tianlong Ma and Liang He. 2022. A survey of human-in-the-loop for machine learning. Future Generation Computer Systems 135 (2022) 364\u2013381.","DOI":"10.1016\/j.future.2022.05.014"},{"key":"e_1_3_3_2_79_2","first-page":"2048","volume-title":"International conference on machine learning","author":"Xu Kelvin","year":"2015","unstructured":"Kelvin Xu, Jimmy Ba, Ryan Kiros, Kyunghyun Cho, Aaron Courville, Ruslan Salakhudinov, Rich Zemel, and Yoshua Bengio. 2015. Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning. PMLR, 2048\u20132057."},{"key":"e_1_3_3_2_80_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_2_81_2","doi-asserted-by":"crossref","unstructured":"Rolf\u00a0A Zwaan and Gabriel\u00a0A Radvansky. 1998. Situation models in language comprehension and memory.Psychological bulletin 123 2 (1998) 162.","DOI":"10.1037\/0033-2909.123.2.162"}],"event":{"name":"IUI '26: 31st International Conference on Intelligent User Interfaces","location":"Paphos Cyprus","acronym":"IUI '26","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Proceedings of the 31st International Conference on Intelligent User Interfaces"],"original-title":[],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T12:56:54Z","timestamp":1773493014000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3742413.3789145"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":80,"alternative-id":["10.1145\/3742413.3789145","10.1145\/3742413"],"URL":"https:\/\/doi.org\/10.1145\/3742413.3789145","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}