{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:08:32Z","timestamp":1765357712675,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":26,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681659","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:41Z","timestamp":1729925981000},"page":"7366-7374","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Zero-Shot Character Identification and Speaker Prediction in Comics via Iterative Multimodal Fusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-2230-2679","authenticated-orcid":false,"given":"Yingxuan","family":"Li","sequence":"first","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1542-2612","authenticated-orcid":false,"given":"Ryota","family":"Hinami","sequence":"additional","affiliation":[{"name":"Mantra Inc., Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2146-6275","authenticated-orcid":false,"given":"Kiyoharu","family":"Aizawa","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1529-0154","authenticated-orcid":false,"given":"Yusuke","family":"Matsui","sequence":"additional","affiliation":[{"name":"The University of Tokyo, Tokyo, 
Japan"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/MMUL.2020.2987895"},{"key":"e_1_3_2_1_2_1","volume-title":"Danbooru community, and Gwern Branwen","author":"Anonymous","year":"2022","unstructured":"Anonymous, Danbooru community, and Gwern Branwen. 2022. Danbooru2021: A Large-Scale Crowdsourced & Tagged Anime Illustration Dataset. https:\/\/gwern.net\/danbooru2021."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i14.17537"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.67"},{"key":"e_1_3_2_1_7_1","unstructured":"Hikaru Ikuta Leslie W\u00f6hler and Kiyoharu Aizawa. 2024. MangaUB: A Manga Understanding Benchmark for Large Multimodal Models. arxiv: 2407.19034"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICME57554.2024.10687709"},{"key":"e_1_3_2_1_10_1","unstructured":"Haotian Liu Chunyuan Li Yuheng Li and Yong Jae Lee. 2023. Improved baselines with visual instruction tuning. arxiv: 2310.03744"},{"key":"e_1_3_2_1_11_1","unstructured":"Haotian Liu Chunyuan Li Qingyang Wu and Yong Jae Lee. 2023. Visual Instruction Tuning. arxiv: 2304.08485"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIT.1982.1056489"},{"key":"e_1_3_2_1_14_1","volume-title":"Object detection for comics using manga109 annotations. arxiv","author":"Ogawa Toru","year":"2018","unstructured":"Toru Ogawa, Atsushi Otsubo, Rei Narita, Yusuke Matsui, Toshihiko Yamasaki, and Kiyoharu Aizawa. 2018. 
Object detection for comics using manga109 annotations. arxiv: 1803.08670"},{"key":"e_1_3_2_1_15_1","unstructured":"OpenAI. 2022. ChatGPT. https:\/\/openai.com\/blog\/chatgpt\/."},{"key":"e_1_3_2_1_17_1","volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems","author":"Ren Shaoqing","year":"2015","unstructured":"Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 2015. Faster r-cnn: Towards real-time object detection with region proposal networks. Advances in neural information processing systems, Vol. 28 (2015)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDAR.2015.7333782"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00377"},{"volume-title":"SIGGRAPH Asia 2018 Posters","author":"Tsubota Koki","key":"e_1_3_2_1_20_1","unstructured":"Koki Tsubota, Toru Ogawa, Toshihiko Yamasaki, and Kiyoharu Aizawa. 2018. Adaptation of Manga Face Representation for Accurate Clustering. In SIGGRAPH Asia 2018 Posters. Association for Computing Machinery, Article 15, 2 pages."},{"key":"e_1_3_2_1_21_1","unstructured":"Jason Wei Yi Tay Rishi Bommasani Colin Raffel Barret Zoph Sebastian Borgeaud Dani Yogatama Maarten Bosma Denny Zhou Donald Metzler et al. 2022. Emergent abilities of large language models. Transactions on Machine Learning Research (2022)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.330"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00611"},{"key":"e_1_3_2_1_24_1","volume-title":"Hashimoto","author":"Zhang Tianyi","year":"2023","unstructured":"Tianyi Zhang, Faisal Ladhak, Esin Durmus, Percy Liang, Kathleen McKeown, and Tatsunori B. Hashimoto. 2023. Benchmarking Large Language Models for News Summarization. arxiv: 2301.13848"},{"key":"e_1_3_2_1_25_1","unstructured":"Zhimin Zhang Zheng Wang and Wei Hu. 2022. 
Unsupervised Manga Character Re-identification via Face-body and Spatial-temporal Associated Clustering. arxiv: 2204.04621"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413726"},{"key":"e_1_3_2_1_27_1","unstructured":"Deyao Zhu Jun Chen Xiaoqian Shen Xiang Li and Mohamed Elhoseiny. 2023. Minigpt-4: Enhancing vision-language understanding with advanced large language models. arxiv: 2304.10592"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681659","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681659","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:49Z","timestamp":1750295869000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681659"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":26,"alternative-id":["10.1145\/3664647.3681659","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681659","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}