{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,13]],"date-time":"2025-09-13T16:05:54Z","timestamp":1757779554735,"version":"3.41.0"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031928079","type":"print"},{"value":"9783031928086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-92808-6_4","type":"book-chapter","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:33Z","timestamp":1748361573000},"page":"56-67","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["ComiCap: A VLMs Pipeline for\u00a0Dense Captioning of\u00a0Comic Panels"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9971-8738","authenticated-orcid":false,"given":"Emanuele","family":"Vivoli","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1153-1651","authenticated-orcid":false,"given":"Niccol\u00f2","family":"Biondi","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1364-218X","authenticated-orcid":false,"given":"Marco","family":"Bertini","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8762-4454","authenticated-orcid":false,"given":"Dimosthenis","family":"Karatzas","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"4_CR1","unstructured":"Alayrac, J.B., et\u00a0al.: Flamingo: a visual language model for few-shot learning (2022). https:\/\/arxiv.org\/abs\/2204.14198"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering (2018). https:\/\/arxiv.org\/abs\/1707.07998","DOI":"10.1109\/CVPR.2018.00636"},{"key":"4_CR3","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Goldstein, J., Lavie, A., Lin, C.Y., Voss, C. (eds.) Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, Ann Arbor, Michigan, pp. 65\u201372. Association for Computational Linguistics (2005). https:\/\/aclanthology.org\/W05-0909"},{"key":"4_CR4","unstructured":"Beyer, L., et\u00a0al.: Paligemma: a versatile 3B VLM for transfer (2024). https:\/\/arxiv.org\/abs\/2407.07726"},{"key":"4_CR5","unstructured":"Caffagni, D., et al.: The revolution of multimodal large language models: a survey (2024). https:\/\/arxiv.org\/abs\/2402.12451"},{"key":"4_CR6","unstructured":"Dubey, A., et\u00a0al.: The llama 3 herd of models (2024). 
https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"4_CR7","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation (2014). https:\/\/arxiv.org\/abs\/1311.2524","DOI":"10.1109\/CVPR.2014.81"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Iyyer, M., et al.: The amazing mysteries of the gutter: drawing inferences between panels in comic book narratives (2017). https:\/\/arxiv.org\/abs\/1611.05118","DOI":"10.1109\/CVPR.2017.686"},{"key":"4_CR9","unstructured":"Jiang, A.Q., et\u00a0al.: Mistral 7B (2023). https:\/\/arxiv.org\/abs\/2310.06825"},{"key":"4_CR10","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: fully convolutional localization networks for dense captioning (2015). https:\/\/arxiv.org\/abs\/1511.07571","DOI":"10.1109\/CVPR.2016.494"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Krishna, R., Hata, K., Ren, F., Fei-Fei, L., Niebles, J.C.: Dense-captioning events in videos (2017). https:\/\/arxiv.org\/abs\/1705.00754","DOI":"10.1109\/ICCV.2017.83"},{"key":"4_CR12","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations (2016). https:\/\/arxiv.org\/abs\/1602.07332"},{"key":"4_CR13","unstructured":"Lauren\u00e7on, H., et\u00a0al.: Obelics: an open web-scale filtered dataset of interleaved image-text documents (2023). https:\/\/arxiv.org\/abs\/2306.16527"},{"key":"4_CR14","unstructured":"Lauren\u00e7on, H., Tronchon, L., Cord, M., Sanh, V.: What matters when building vision-language models? (2024). https:\/\/arxiv.org\/abs\/2405.02246"},{"key":"4_CR15","doi-asserted-by":"publisher","unstructured":"Li, Y., Aizawa, K., Matsui, Y.: Manga109Dialog A Large-scale Dialogue Dataset for Comics Speaker Detection. arXiv. https:\/\/doi.org\/10.48550\/arXiv.2306.17469. http:\/\/arxiv.org\/abs\/2306.17469","DOI":"10.48550\/arXiv.2306.17469"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., Lee, Y.J.: Improved baselines with visual instruction tuning (2024). https:\/\/arxiv.org\/abs\/2310.03744","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"4_CR17","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning (2023). https:\/\/arxiv.org\/abs\/2304.08485"},{"key":"4_CR18","unstructured":"OpenBMB: Minicpm-v: a GPT-4v level multimodal LLM on your phone (2023). https:\/\/github.com\/OpenBMB\/MiniCPM-V"},{"key":"4_CR19","unstructured":"Ramaprasad, R.: Comics for everyone: generating accessible text descriptions for comic strips. https:\/\/arxiv.org\/abs\/2310.00698"},{"key":"4_CR20","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks (2016). https:\/\/arxiv.org\/abs\/1506.01497"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Rigaud, C., Burie, J.C., Petit, S.: Toward accessible comics for blind and low vision readers (2024). https:\/\/arxiv.org\/abs\/2407.08248","DOI":"10.1007\/978-3-031-70645-5_13"},{"key":"4_CR22","doi-asserted-by":"crossref","unstructured":"Sachdeva, R., Shin, G., Zisserman, A.: Tails tell tales: chapter-wide manga transcriptions with character names (2024). 
https:\/\/arxiv.org\/abs\/2408.00298","DOI":"10.1007\/978-981-96-0908-6_4"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Sachdeva, R., Zisserman, A.: The manga whisperer: automatically generating transcriptions for comics. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12967\u201312976 (2024)","DOI":"10.1109\/CVPR52733.2024.01232"},{"key":"4_CR24","unstructured":"Sermanet, P., Eigen, D., Zhang, X., Mathieu, M., Fergus, R., LeCun, Y.: Overfeat: integrated recognition, localization and detection using convolutional networks (2014). https:\/\/arxiv.org\/abs\/1312.6229"},{"key":"4_CR25","unstructured":"Team, G., et\u00a0al.: Gemma: open models based on gemini research and technology (2024). https:\/\/arxiv.org\/abs\/2403.08295"},{"key":"4_CR26","doi-asserted-by":"crossref","unstructured":"Topal, B.B., Yuret, D., Sezgin, T.M.: Domain-adaptive self-supervised pre-training for face and body detection in drawings (2023). https:\/\/arxiv.org\/abs\/2211.10641","DOI":"10.24963\/ijcai.2023\/159"},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator (2015). https:\/\/arxiv.org\/abs\/1411.4555","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"4_CR28","unstructured":"Vivoli, E., Bertini, M., Karatzas, D.: Comix: a comprehensive benchmark for multi-task comic understanding (2024). https:\/\/arxiv.org\/abs\/2407.03550"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Vivoli, E., Campaioli, I., Nardoni, M., Biondi, N., Bertini, M., Karatzas, D.: Comics datasets framework: mix of comics datasets for detection benchmarking (2024). https:\/\/arxiv.org\/abs\/2407.03540","DOI":"10.1007\/978-3-031-70645-5_11"},{"key":"4_CR30","doi-asserted-by":"crossref","unstructured":"Xiao, B., et al.: Florence-2: advancing a unified representation for a variety of vision tasks (2023). https:\/\/arxiv.org\/abs\/2311.06242","DOI":"10.1109\/CVPR52733.2024.00461"},{"key":"4_CR31","unstructured":"Xu, R., et al.: LLaVA-UHD: an LMM perceiving any aspect ratio and high-resolution images (2024). https:\/\/arxiv.org\/abs\/2403.11703"},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Zhai, X., Mustafa, B., Kolesnikov, A., Beyer, L.: Sigmoid loss for language image pre-training (2023). https:\/\/arxiv.org\/abs\/2303.15343","DOI":"10.1109\/ICCV51070.2023.01100"},{"key":"4_CR33","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models (2023). 
https:\/\/arxiv.org\/abs\/2304.10592"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-92808-6_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:41Z","timestamp":1748361581000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-92808-6_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031928079","9783031928086"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-92808-6_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
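The record above follows the envelope returned by the public Crossref REST API for a single work (`{"status": ..., "message-type": "work", "message": {...}}`). Below is a minimal sketch of fetching and parsing such a record, assuming it is retrieved from the standard `https://api.crossref.org/works/{DOI}` endpoint; the specific fields read (`title`, `author`, `reference`, `container-title`) are those present in this record.

```python
# Minimal sketch: retrieve and parse a Crossref work record (stdlib only).
# Assumption: the record is served by the public Crossref REST API works
# endpoint, which returns the same {"status", "message-type", "message"}
# envelope shown above.
import json
import urllib.request

DOI = "10.1007/978-3-031-92808-6_4"
url = f"https://api.crossref.org/works/{DOI}"

with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

work = record["message"]  # the work metadata itself

# "title" and "container-title" are lists of strings in Crossref records.
print(work["title"][0])           # "ComiCap: A VLMs Pipeline for ..."
print(work["container-title"])    # LNCS series + proceedings title
print(work["DOI"], work["type"])  # "10.1007/978-3-031-92808-6_4" book-chapter

# Authors are objects with "given", "family", and optionally "ORCID".
for author in work.get("author", []):
    print(author.get("given", ""), author.get("family", ""))

# Deposited references carry a "key" plus "unstructured" and/or "DOI".
print(len(work.get("reference", [])))  # 33, matching "references-count"
```

Note the defensive `.get(...)` calls: Crossref fields such as `author` and `reference` are optional per record, so code that consumes many works should not assume they are present.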