{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T12:11:12Z","timestamp":1773490272559,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":35,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,23]]},"DOI":"10.1145\/3742414.3794743","type":"proceedings-article","created":{"date-parts":[[2026,3,9]],"date-time":"2026-03-09T11:03:52Z","timestamp":1773054232000},"page":"34-38","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Evaluating Semantic and Perceptual Alignment in Multilingual Story Visualization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-6599-9956","authenticated-orcid":false,"given":"Krishna","family":"Tewari","sequence":"first","affiliation":[{"name":"Indian Institute of Technology (BHU) Varanasi, Varanasi, Uttar Pradesh, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-8204-102X","authenticated-orcid":false,"given":"Sharma Nandini","family":"Surendra","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology (BHU) Varanasi, Varanasi, Uttar Pradesh, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8976-669X","authenticated-orcid":false,"given":"Divya","family":"Sharma","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology (BHU) Varanasi, Varanasi, Uttar Pradesh, India"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8743-9830","authenticated-orcid":false,"given":"Sukomal","family":"Pal","sequence":"additional","affiliation":[{"name":"Indian Institute of Technology (BHU) Varanasi, Varanasi, Uttar Pradesh, India"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,3,22]]},"reference":[{"key":"e_1_3_3_1_2_2","doi-asserted-by":"crossref","unstructured":"Jean-Baptiste Alayrac Jeff Donahue Pauline Luc Antoine Miech Iain Barr Yana Hasson Karel Lenc Arthur Mensch Katie Millican Malcolm Reynolds Roman Ring Eliza Rutherford Serkan Cabi Tengda Han Zhitao Gong Sina Samangooei Marianne Monteiro Jacob Menick Sebastian Borgeaud Andrew Brock Aida Nematzadeh Sahand Sharifzadeh Mikolaj Binkowski Ricardo Barreira Oriol Vinyals Andrew Zisserman and Karen Simonyan. 2022. Flamingo: a Visual Language Model for Few-Shot Learning. Advances in Neural Information Processing Systems 35 (2022).","DOI":"10.52202\/068431-1723"},{"key":"e_1_3_3_1_3_2","unstructured":"Prafulla Dhariwal and Alex Nichol. 2021. Diffusion Models Beat GANs on Image Synthesis. arxiv:https:\/\/arXiv.org\/abs\/2105.05233\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2105.05233"},{"key":"e_1_3_3_1_4_2","unstructured":"David Dinkevich Matan Levy Omri Avrahami Dvir Samuel and Dani Lischinski. 2025. Story2Board: A Training-Free Approach for Expressive Storyboard Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2508.09983 (2025)."},{"key":"e_1_3_3_1_5_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_6_2","unstructured":"Stephanie Fu Netanel Tamir Shobhita Sundaram Lucy Chai Richard Zhang Tali Dekel and Phillip Isola. 2023. DreamSim: Learning New Dimensions of Human Visual Similarity using Synthetic Data. arxiv:https:\/\/arXiv.org\/abs\/2306.09344\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2306.09344"},{"key":"e_1_3_3_1_7_2","doi-asserted-by":"crossref","unstructured":"Oran Gafni Adam Polyak Oron Ashual Shelly Sheynin Devi Parikh and Yaniv Taigman. 2022. Make-A-Scene: Scene-Based Text-to-Image Generation with Human Priors. arxiv:https:\/\/arXiv.org\/abs\/2203.13131\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2203.13131","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"e_1_3_3_1_8_2","unstructured":"Shuyang Gu Dong Chen Jianmin Bao Fang Wen Bo Zhang Dongdong Chen Lu Yuan and Baining Guo. 2022. Vector Quantized Diffusion Model for Text-to-Image Synthesis. arxiv:https:\/\/arXiv.org\/abs\/2111.14822\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2111.14822"},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","unstructured":"Tanmay Gupta Abhinav Gupta Martial He and Abhinav\u00a0G. Bansal. 2018. Imagine This! Scripts to Compositions to Videos. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2018) 1\u201310. 10.1109\/CVPR.2018.00001","DOI":"10.1109\/CVPR.2018.00001"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"e_1_3_3_1_11_2","unstructured":"Jack Hessel Ari Holtzman Maxwell Forbes Ronan\u00a0Le Bras and Yejin Choi. 2022. CLIPScore: A Reference-free Evaluation Metric for Image Captioning. arxiv:https:\/\/arXiv.org\/abs\/2104.08718\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2104.08718"},{"key":"e_1_3_3_1_12_2","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2018. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. arxiv:https:\/\/arXiv.org\/abs\/1706.08500\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1706.08500"},{"key":"e_1_3_3_1_13_2","first-page":"2345","volume-title":"Proceedings of the 2022 Conference on Neural Information Processing Systems","author":"Hong Jiwon","year":"2022","unstructured":"Jiwon Hong, Minjoon Kim, and Seungjin Lee. 2022. Visual Coherence Losses for Story Generation from Images. In Proceedings of the 2022 Conference on Neural Information Processing Systems. 2345\u20132356."},{"key":"e_1_3_3_1_14_2","unstructured":"Junjie Hu Yu Cheng Zhe Gan Jingjing Liu Jianfeng Gao and Graham Neubig. 2024. Visual Storytelling Dataset (VIST). https:\/\/service.tib.eu\/ldmservice\/dataset\/visual-storytelling-dataset\u2013vist-. Accessed: 2025-10-25."},{"key":"e_1_3_3_1_15_2","series-title":"CEUR Workshop Proceedings","volume-title":"Proceedings of the 8th Workshop on Narrative Extraction From Texts (Text2Story 2025)","volume":"3964","author":"Kapuriya Janak","year":"2025","unstructured":"Janak Kapuriya and Paul Buitelaar. 2025. FlintstonesSV++: Improving Story Narration using Visual Scene Graph. In Proceedings of the 8th Workshop on Narrative Extraction From Texts (Text2Story 2025)(CEUR Workshop Proceedings, Vol.\u00a03964). https:\/\/ceur-ws.org\/Vol-3964\/paper3.pdf Accessed: 2025-10-25."},{"key":"e_1_3_3_1_16_2","unstructured":"Yitong Li Zhe Gan Yelong Shen Jingjing Liu Yu Cheng Yuexin Wu Lawrence Carin David Carlson and Jianfeng Gao. 2019. StoryGAN: A Sequential Conditional GAN for Story Visualization. arxiv:https:\/\/arXiv.org\/abs\/1812.02784\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1812.02784"},{"key":"e_1_3_3_1_17_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_1_18_2","unstructured":"Tao Liu Kai Wang Senmao Li Joost van\u00a0de Weijer Fahad\u00a0Shahbaz Khan Shiqi Yang Yaxing Wang Jian Yang and Ming-Ming Cheng. 2025. One-Prompt-One-Story: Free-Lunch Consistent Text-to-Image Generation Using a Single Prompt. arxiv:https:\/\/arXiv.org\/abs\/2501.13554\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2501.13554"},{"key":"e_1_3_3_1_19_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.194"},{"key":"e_1_3_3_1_20_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_5"},{"key":"e_1_3_3_1_21_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_5"},{"key":"e_1_3_3_1_22_2","unstructured":"Alex Nichol Prafulla Dhariwal Aditya Ramesh Pranav Shyam Pamela Mishkin Bob McGrew Ilya Sutskever and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2112.10741\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2112.10741"},{"key":"e_1_3_3_1_23_2","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical Text-Conditional Image Generation with CLIP Latents. arxiv:https:\/\/arXiv.org\/abs\/2204.06125\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2204.06125"},{"key":"e_1_3_3_1_24_2","unstructured":"Aditya Ramesh Mikhail Pavlov Gabriel Goh Scott Gray Chelsea Voss Alec Radford Mark Chen and Ilya Sutskever. 2021. Zero-Shot Text-to-Image Generation. arxiv:https:\/\/arXiv.org\/abs\/2102.12092\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2102.12092"},{"key":"e_1_3_3_1_25_2","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2112.10752\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2112.10752"},{"key":"e_1_3_3_1_26_2","unstructured":"Chitwan Saharia William Chan Saurabh Saxena Lala Li Jay Whang Emily Denton Seyed Kamyar\u00a0Seyed Ghasemipour Burcu\u00a0Karagol Ayan S.\u00a0Sara Mahdavi Rapha\u00a0Gontijo Lopes Tim Salimans Jonathan Ho David\u00a0J Fleet and Mohammad Norouzi. 2022. Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding. arxiv:https:\/\/arXiv.org\/abs\/2205.11487\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2205.11487"},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW69036.2025.00399"},{"key":"e_1_3_3_1_28_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_2"},{"key":"e_1_3_3_1_29_2","doi-asserted-by":"publisher","DOI":"10.1145\/3777867.3778252"},{"key":"e_1_3_3_1_30_2","unstructured":"Yoad Tewel Omri Kaduri Rinon Gal Yoni Kasten Lior Wolf Gal Chechik and Yuval Atzmon. 2024. Training-Free Consistent Text-to-Image Generation. arxiv:https:\/\/arXiv.org\/abs\/2402.03286\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2402.03286"},{"key":"e_1_3_3_1_31_2","unstructured":"Yuxin Wu Alexander Kirillov Francisco Massa Wan-Yen Lo and Ross Girshick. 2019. Detectron2. https:\/\/github.com\/facebookresearch\/detectron2."},{"key":"e_1_3_3_1_32_2","unstructured":"Zilyu Ye Jinxiu Liu Ruotian Peng Jinjin Cao Zhiyang Chen Yiyang Zhang Ziwei Xuan Mingyuan Zhou Xiaoqian Shen Mohamed Elhoseiny Qi Liu and Guo-Jun Qi. 2024. Openstory++: A Large-scale Dataset and Benchmark for Instance-aware Open-domain Visual Storytelling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.03695 (2024). https:\/\/arxiv.org\/abs\/2408.03695"},{"key":"e_1_3_3_1_33_2","unstructured":"Jiahui Yu Yuanzhong Xu Jing\u00a0Yu Koh Thang Luong Gunjan Baid Zirui Wang Vijay Vasudevan Alexander Ku Yinfei Yang Burcu\u00a0Karagol Ayan Ben Hutchinson Wei Han Zarana Parekh Xin Li Han Zhang Jason Baldridge and Yonghui Wu. 2022. Scaling Autoregressive Models for Content-Rich Text-to-Image Generation. arxiv:https:\/\/arXiv.org\/abs\/2206.10789\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/2206.10789"},{"key":"e_1_3_3_1_34_2","doi-asserted-by":"publisher","unstructured":"Gangyan Zeng Zhaohui Li and Yuan Zhang. 2019. PororoGAN: An Improved Story Visualization Model on Pororo-SV Dataset. Proceedings of the 3rd International Conference on Computer Science and Artificial Intelligence (2019) 1\u20135. 10.1145\/3374587.3374649","DOI":"10.1145\/3374587.3374649"},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"publisher","DOI":"10.1145\/3374587.3374649"},{"key":"e_1_3_3_1_36_2","unstructured":"Minfeng Zhu Pingbo Pan Wei Chen and Yi Yang. 2019. DM-GAN: Dynamic Memory Generative Adversarial Networks for Text-to-Image Synthesis. arxiv:https:\/\/arXiv.org\/abs\/1904.01310\u00a0[cs.CV] https:\/\/arxiv.org\/abs\/1904.01310"}],"event":{"name":"IUI '26: 31st International Conference on Intelligent User Interfaces","location":"Paphos Cyprus","acronym":"IUI '26 Companion","sponsor":["SIGCHI ACM Special Interest Group on Computer-Human Interaction","SIGAI ACM Special Interest Group on Artificial Intelligence"]},"container-title":["Companion Proceedings of the 31st International Conference on Intelligent User Interfaces"],"original-title":[],"deposited":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T11:05:49Z","timestamp":1773486349000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3742414.3794743"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,22]]},"references-count":35,"alternative-id":["10.1145\/3742414.3794743","10.1145\/3742414"],"URL":"https:\/\/doi.org\/10.1145\/3742414.3794743","relation":{},"subject":[],"published":{"date-parts":[[2026,3,22]]},"assertion":[{"value":"2026-03-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}