{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T17:37:47Z","timestamp":1758044267598,"version":"3.44.0"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032046130","type":"print"},{"value":"9783032046147","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T00:00:00Z","timestamp":1757721600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T00:00:00Z","timestamp":1757721600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-04614-7_19","type":"book-chapter","created":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T12:22:51Z","timestamp":1757679771000},"page":"337-350","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["ComicsPAP: Understanding Comic Strips by\u00a0Picking the\u00a0Correct Panel"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9971-8738","authenticated-orcid":false,"given":"Emanuele","family":"Vivoli","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6128-1796","authenticated-orcid":false,"given":"Artemis","family":"Llabr\u00e9s","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0100-9392","authenticated-orcid":false,"given":"Mohamed Ali","family":"Souibgui","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1364-218X","authenticated-orcid":false,"given":"Marco","family":"Bertini","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0368-9697","authenticated-orcid":false,"given":"Ernest Valveny","family":"Llobet","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8762-4454","authenticated-orcid":false,"given":"Dimosthenis","family":"Karatzas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,9,13]]},"reference":[{"key":"19_CR1","unstructured":"Bai, S., et al.: Qwen2.5-VL Technical Report (2025). arXiv: 2502.13923"},{"key":"19_CR2","doi-asserted-by":"crossref","unstructured":"Chen, Z., et al.: How far are we to GPT-4V? Closing the gap to commercial multimodal models with open-source suites (2024). arXiv: 2404.16821","DOI":"10.1007\/s11432-024-4231-5"},{"key":"19_CR3","unstructured":"Cohn, N.: Visual language lab (2025). Accessed 10 May 2025. https:\/\/www.visuallanguagelab.com\/"},{"key":"19_CR4","unstructured":"Deitke, M., et al.: Molmo and PixMo: open weights and open data for state-of-the-art vision-language models (2024). arXiv: 2409.17146"},{"key":"19_CR5","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL 2019), HLT, pp. 
4171\u20134186 (2019)"},{"key":"19_CR6","doi-asserted-by":"crossref","unstructured":"Gu\u00e9rin, C., et al.: eBDtheque: a representative database of comics. In: 2013 12th International Conference on Document Analysis and Recognition, pp. 1145\u20131149. IEEE (2013)","DOI":"10.1109\/ICDAR.2013.232"},{"key":"19_CR7","doi-asserted-by":"crossref","unstructured":"Ikuta, H., W\u00f6hler, L., Aizawa, K.: MangaUB: a manga understanding benchmark for large multimodal models (2024). arXiv: 2407.19034","DOI":"10.1109\/MMUL.2025.3550451"},{"key":"19_CR8","doi-asserted-by":"crossref","unstructured":"Iyyer, M., et al.: The amazing mysteries of the gutter: Drawing inferences between panels in comic book narratives. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7186\u20137195 (2017)","DOI":"10.1109\/CVPR.2017.686"},{"key":"19_CR9","unstructured":"Jiang, D., et al.: Mantis: interleaved multi-image instruction tuning. In: Transactions on Machine Learning Research (2024). ISSN: 2835-8856. https:\/\/openreview.net\/forum?id=skLtdUVaJa"},{"key":"19_CR10","unstructured":"Marafioti, A., et al.: SmolVLM: redefining small and efficient multimodal models (2025). arXiv: 2504.05299"},{"key":"19_CR11","doi-asserted-by":"crossref","unstructured":"Nguyen, N.-V., Rigaud, C., Burie, J.-C.: Digital comics image indexing based on deep learning. J. Imaging 4(7), 89 (2018)","DOI":"10.3390\/jimaging4070089"},{"key":"19_CR12","unstructured":"Radford, A.: Improving language understanding with unsupervised learning. In: OpenAI Res (2018)"},{"key":"19_CR13","doi-asserted-by":"crossref","unstructured":"Sachdeva, R., Shin, G., Zisserman, A.: Tails Tell Tales: chapter-wide manga transcriptions with character names. In: Proceedings of the Asian Conference on Computer Vision (ACCV) 2024 (2024)","DOI":"10.1007\/978-981-96-0908-6_4"},{"key":"19_CR14","doi-asserted-by":"crossref","unstructured":"Sachdeva, R., Zisserman, A.: The Manga Whisperer: automatically generating transcriptions for comics. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 12967\u201312976 (2024)","DOI":"10.1109\/CVPR52733.2024.01232"},{"key":"19_CR15","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of CLIP-filtered 400 million image-text pairs. In: Proceedings of the NeurIPS 2021 Workshop on Data-Centric AI (2021)"},{"key":"19_CR16","unstructured":"Steiner, A., et al.: PaliGemma 2: a family of versatile VLMs for transfer (2024). arXiv: 2412.03555"},{"key":"19_CR17","doi-asserted-by":"crossref","unstructured":"Tang, Y., et al.: Video understanding with large language models: a survey. IEEE Trans. Circuits Syst. Video Technol. (2025)","DOI":"10.1109\/TCSVT.2025.3566695"},{"key":"19_CR18","doi-asserted-by":"crossref","unstructured":"Taylor, W.L.: Cloze procedure: a new tool for measuring readability. J. Q. 30(4), 415\u2013433 (1953)","DOI":"10.1177\/107769905303000401"},{"key":"19_CR19","unstructured":"Vivoli, E., Bertini, M., Karatzas, D.: CoMix: a comprehensive benchmark for multi-task comic understanding. In: Advances in Neural Information Processing Systems 37 (NeurIPS 2024), Datasets and Benchmarks Track (2024)"},{"key":"19_CR20","doi-asserted-by":"crossref","unstructured":"Vivoli, E., Biondi, N., Bertini, M., Karatzas, D.: ComiCap: a VLMs pipeline for dense captioning of comic panels. 
In: Proceedings of the European Conference on Computer Vision (ECCV 2024), AI for Visual Arts Workshop (AI4VA) (2024)","DOI":"10.1007\/978-3-031-92808-6_4"},{"key":"19_CR21","doi-asserted-by":"crossref","unstructured":"Vivoli, E., Campaioli, I., Nardoni, M., Biondi, N., Bertini, M., Karatzas, D.: Comics Datasets Framework: mix of comics datasets for detection benchmarking. In: International Conference on Document Analysis and Recognition (ICDAR 2024) (2024)","DOI":"10.1007\/978-3-031-70645-5_11"},{"key":"19_CR22","doi-asserted-by":"crossref","unstructured":"Vivoli, E., Baeza, J.L., Llobet, E.V., Karatzas, D.: Multimodal transformer for comics text-cloze. In: International Conference on Document Analysis and Recognition (ICDAR 2024) (2024)","DOI":"10.1007\/978-3-031-70552-6_8"},{"key":"19_CR23","unstructured":"Vivoli, E., Souibgui, M.A., Barsky, A., LLabr\u00e9s, A., Bertini, M., Karatzas, D.: One missing piece in vision and language: a survey on comics understanding (2025). arXiv: 2409.09502"},{"key":"19_CR24","unstructured":"Wang, P., et al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution (2024). arXiv: 2409.12191"},{"key":"19_CR25","doi-asserted-by":"crossref","unstructured":"Wu, J., Tang, C., Wang, J., Zeng, Y., Li, X., Tong, Y.: DiffSensei: bridging multi-modal LLMs and diffusion models for customized manga generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2025)","DOI":"10.1109\/CVPR52734.2025.02671"},{"key":"19_CR26","unstructured":"Yang, A., et al.: Qwen2 technical report (2024). arXiv: 2407.10671"},{"key":"19_CR27","unstructured":"Zhang, B., et al.: VideoLLaMA 3: frontier multimodal foundation models for image and video understanding (2025). arXiv: 2501.13106"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition \u2013 ICDAR 2025"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-04614-7_19","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,12]],"date-time":"2025-09-12T12:23:05Z","timestamp":1757679785000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-04614-7_19"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,13]]},"ISBN":["9783032046130","9783032046147"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-04614-7_19","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9,13]]},"assertion":[{"value":"13 September 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wuhan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 September 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21 September 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/iapr.org\/icdar2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}