{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,4]],"date-time":"2026-03-04T02:38:27Z","timestamp":1772591907841,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,3,18]],"date-time":"2024-03-18T00:00:00Z","timestamp":1710720000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,3,18]]},"DOI":"10.1145\/3640543.3645154","type":"proceedings-article","created":{"date-parts":[[2024,4,5]],"date-time":"2024-04-05T18:23:12Z","timestamp":1712341392000},"page":"750-760","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Utilizing a Dense Video Captioning Technique for Generating Image Descriptions of Comics for People with Visual Impairments"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-1690-2777","authenticated-orcid":false,"given":"Suhyun","family":"Kim","sequence":"first","affiliation":[{"name":"The Department of Artificial Intelligence Convergence, Ewha Womans University, Korea, Republic of"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8700-7217","authenticated-orcid":false,"given":"Semin","family":"Lee","sequence":"additional","affiliation":[{"name":"The Department of Artificial Intelligence Convergence, Ewha Womans University, Korea, Republic of"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-9728-3856","authenticated-orcid":false,"given":"Kyungok","family":"Kim","sequence":"additional","affiliation":[{"name":"The Department of Artificial Intelligence Convergence, Ewha Womans University, Korea, Republic of"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7832-6313","authenticated-orcid":false,"given":"Uran","family":"Oh","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, Ewha Womans University, Korea, Republic of"}]}],"member":"320","published-online":{"date-parts":[[2024,4,5]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. clova.ai. https:\/\/clova.ai\/. Accessed: 09\/14\/2023."},{"key":"e_1_3_2_1_2_1","unstructured":"[n. d.]. comic popularity. https:\/\/vogue.sg\/upcoming-korean-dramas-based-on-webtoons\/."},{"key":"e_1_3_2_1_3_1","volume-title":"d.]. Digital comic book market to grow by USD 2.26 Billion from 2022 to","year":"2027","unstructured":"[n. d.]. Digital comic book market to grow by USD 2.26 Billion from 2022 to 2027. https:\/\/www.prnewswire.com\/news-releases\/digital-comic-book-market-to-grow-by-usd-2-26-billion-from-2022-to-2027\u2013growing-popularity-of-indie-comics-to-be-market-trend\u2014technavio-301903444.html."},{"key":"e_1_3_2_1_4_1","unstructured":"[n. d.]. huggingface. https:\/\/huggingface.co\/spaces\/flax-community\/image-captioning. Accessed: 08\/27\/2023."},{"key":"e_1_3_2_1_5_1","unstructured":"[n. d.]. naver webtoon. https:\/\/www.webtoons.com\/en\/."},{"key":"e_1_3_2_1_6_1","unstructured":"[n. d.]. whisperX. https:\/\/github.com\/m-bain\/whisperX."},{"key":"e_1_3_2_1_7_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65\u201372."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_9_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2729019"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373625.3417027"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308561.3353792"},{"key":"e_1_3_2_1_13_1","volume-title":"Generating best evidence from qualitative research: the role of data analysis. Australian and New Zealand journal of public health 31, 6","author":"Green Julie","year":"2007","unstructured":"Julie Green, Karen Willis, Emma Hughes, Rhonda Small, Nicky Welch, Lisa Gibbs, and Jeanne Daly. 2007. Generating best evidence from qualitative research: the role of data analysis. Australian and New Zealand journal of public health 31, 6 (2007), 545\u2013550."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01815"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502081"},{"key":"e_1_3_2_1_16_1","volume-title":"A better use of audio-visual cues: Dense video captioning with bi-modal transformer. arXiv preprint arXiv:2005.08271","author":"Iashin Vladimir","year":"2020","unstructured":"Vladimir Iashin and Esa Rahtu. 2020. A better use of audio-visual cues: Dense video captioning with bi-modal transformer. arXiv preprint arXiv:2005.08271 (2020)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3555720"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00782"},{"key":"e_1_3_2_1_20_1","volume-title":"InEuropean conference on computer vision 2014 Sep 6 (pp. 740-755)","author":"Lin TY","unstructured":"TY Lin, M Maire, S Belongie, J Hays, P Perona, D Ramanan, P Doll\u00e1r, CL Zitnick, [n. d.]. Microsoft coco: Common objects in context. InEuropean conference on computer vision 2014 Sep 6 (pp. 740-755)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308561.3354629"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3441852.3471207"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3551581"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311\u2013318."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_27_1","volume-title":"Language models are unsupervised multitask learners. OpenAI blog 1, 8","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, 2019. Language models are unsupervised multitask learners. OpenAI blog 1, 8 (2019), 9."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/hcomp.v5i1.13301"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3313831.3376404"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3234695.3236337"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3479207"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2016.61"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818048.2820013"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3411764.3445347"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2998181.2998364"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01032"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3374587.3374649"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3491102.3502092"}],"event":{"name":"IUI '24: 29th International Conference on Intelligent User Interfaces","location":"Greenville SC USA","acronym":"IUI '24","sponsor":["SIGAI ACM Special Interest Group on Artificial Intelligence","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 29th International Conference on Intelligent User Interfaces"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640543.3645154","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3640543.3645154","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,1]],"date-time":"2025-12-01T00:55:39Z","timestamp":1764550539000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640543.3645154"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,3,18]]},"references-count":38,"alternative-id":["10.1145\/3640543.3645154","10.1145\/3640543"],"URL":"https:\/\/doi.org\/10.1145\/3640543.3645154","relation":{},"subject":[],"published":{"date-parts":[[2024,3,18]]},"assertion":[{"value":"2024-04-05","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}