{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T11:27:40Z","timestamp":1751887660395,"version":"3.41.0"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031928079","type":"print"},{"value":"9783031928086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-92808-6_10","type":"book-chapter","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:17Z","timestamp":1748361557000},"page":"155-172","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Unlocking Comics: The AI4VA Dataset for\u00a0Visual Understanding"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3290-9361","authenticated-orcid":false,"given":"Peter","family":"Gr\u00f6nquist","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0534-852X","authenticated-orcid":false,"given":"Deblina","family":"Bhattacharjee","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5202-5240","authenticated-orcid":false,"given":"Bahar","family":"Aydemir","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8512-3381","authenticated-orcid":false,"given":"Baran","family":"Ozaydin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5818-4285","authenticated-orcid":false,"given":"Tong","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8347-8637","authenticated-orcid":false,"given":"Mathieu","family":"Salzmann","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0441-6068","authenticated-orcid":false,"given":"Sabine","family":"S\u00fcsstrunk","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"10_CR1","unstructured":"Pupil Core - Eye Tracking Hardware. https:\/\/pupil-labs.com\/products\/core. Accessed 5 March 2024"},{"issue":"2","key":"10_CR2","doi-asserted-by":"publisher","first-page":"8","DOI":"10.1109\/mmul.2020.2987895","volume":"27","author":"K Aizawa","year":"2020","unstructured":"Aizawa, K., et al.: Building a manga dataset manga109 with annotations for multimedia applications. IEEE Multimedia 27(2), 8\u201318 (2020). https:\/\/doi.org\/10.1109\/mmul.2020.2987895","journal-title":"IEEE Multimedia"},{"key":"10_CR3","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning (2022)"},{"key":"10_CR4","doi-asserted-by":"crossref","unstructured":"Aydemir, B., Hoffstetter, L., Zhang, T., Salzmann, M., S\u00fcsstrunk, S.: TempSAL - uncovering temporal information for deep saliency prediction. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00625"},{"key":"10_CR5","unstructured":"Bhat, S.F., Birkl, R., Wofk, D., Wonka, P., M\u00fcller, M.: ZoeDepth: zero-shot transfer by combining relative and metric depth. arXiv preprint arXiv:2302.12288 (2023)"},{"key":"10_CR6","doi-asserted-by":"crossref","unstructured":"Bhattacharjee, D., Everaert, M., Salzmann, M., S\u00fcsstrunk, S.: Estimating image depth in the comics domain. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 2070\u20132079 (2022)","DOI":"10.1109\/WACV51458.2022.00118"},{"key":"10_CR7","doi-asserted-by":"crossref","unstructured":"Bhattacharjee, D., Kim, S., Vizier, G., Salzmann, M.: DUnit: detection-based unsupervised image-to-image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2020)","DOI":"10.1109\/CVPR42600.2020.00484"},{"issue":"1","key":"10_CR8","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1016\/0734-189X(85)90002-7","volume":"32","author":"I Biederman","year":"1985","unstructured":"Biederman, I.: Human image understanding: recent research and a theory. Comput. Vis. Graph. Image Process. 32(1), 29\u201373 (1985). https:\/\/doi.org\/10.1016\/0734-189X(85)90002-7","journal-title":"Comput. Vis. Graph. Image Process."},{"key":"10_CR9","unstructured":"Brown, T., et al.: Language models are few-shot learners. In: Larochelle, H., Ranzato, M., Hadsell, R., Balcan, M., Lin, H. (eds.) Advances in Neural Information Processing Systems, vol.\u00a033, pp. 1877\u20131901. Curran Associates, Inc. (2020). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"issue":"3","key":"10_CR10","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1109\/TPAMI.2018.2815601","volume":"41","author":"Z Bylinskii","year":"2019","unstructured":"Bylinskii, Z., Judd, T., Oliva, A., Torralba, A., Durand, F.: What do different evaluation metrics tell us about saliency models? IEEE Trans. Pattern Anal. Mach. Intell. 41(3), 740\u2013757 (2019). https:\/\/doi.org\/10.1109\/TPAMI.2018.2815601","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"2","key":"10_CR11","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1016\/j.inffus.2005.10.001","volume":"8","author":"H Chen","year":"2007","unstructured":"Chen, H., Varshney, P.K.: A human perception inspired quality metric for image fusion based on regional information. Inf. Fusion 8(2), 193\u2013207 (2007). https:\/\/doi.org\/10.1016\/j.inffus.2005.10.001","journal-title":"Inf. Fusion"},{"key":"10_CR12","doi-asserted-by":"publisher","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255 (2009). https:\/\/doi.org\/10.1109\/CVPR.2009.5206848","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"10_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"419","DOI":"10.1007\/978-3-030-58558-7_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"R Droste","year":"2020","unstructured":"Droste, R., Jiao, J., Noble, J.A.: Unified image and video saliency modeling. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12350, pp. 419\u2013435. Springer, Cham (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-58558-7_25"},{"key":"10_CR14","doi-asserted-by":"publisher","unstructured":"Engilberge, M., Collins, E., S\u00fcsstrunk, S.: Color representation in deep neural networks. In: 2017 IEEE International Conference on Image Processing (ICIP), pp. 2786\u20132790 (2017). https:\/\/doi.org\/10.1109\/ICIP.2017.8296790","DOI":"10.1109\/ICIP.2017.8296790"},{"key":"10_CR15","doi-asserted-by":"publisher","first-page":"167","DOI":"10.1146\/annurev.psych.58.110405.085632","volume":"59","author":"W Geisler","year":"2008","unstructured":"Geisler, W.: Visual perception and the statistical properties of natural scenes. Ann. Rev. Psychol. 59, 167\u201392 (2008). https:\/\/doi.org\/10.1146\/annurev.psych.58.110405.085632","journal-title":"Ann. Rev. Psychol."},{"key":"10_CR16","unstructured":"Gildenblat, J., et al.: PyTorch library for cam methods (2021). https:\/\/github.com\/jacobgil\/pytorch-grad-cam"},{"key":"10_CR17","doi-asserted-by":"crossref","unstructured":"Gu\u2019erin, C., et al.: eBDtheque: a representative database of comics. In: Proceedings of the 12th International Conference on Document Analysis and Recognition (ICDAR), pp. 1145\u20131149 (2013)","DOI":"10.1109\/ICDAR.2013.232"},{"key":"10_CR18","doi-asserted-by":"crossref","unstructured":"Inoue, N., Furuta, R., Yamasaki, T., Aizawa, K.: Cross-domain weakly-supervised object detection through progressive domain adaptation. In: The IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5001\u20135009 (2018)","DOI":"10.1109\/CVPR.2018.00525"},{"key":"10_CR19","doi-asserted-by":"publisher","first-page":"194","DOI":"10.1038\/35058500","volume":"2","author":"L Itti","year":"2001","unstructured":"Itti, L., Koch, C.: Computational modeling of visual attention. Nat. Rev. Neurosci. 2, 194\u2013203 (2001). https:\/\/doi.org\/10.1038\/35058500","journal-title":"Nat. Rev. Neurosci."},{"key":"10_CR20","doi-asserted-by":"crossref","unstructured":"Iyyer, M., et al.: The amazing mysteries of the gutter: drawing inferences between panels in comic book narratives. In: IEEE Conference on Computer Vision and Pattern Recognition (2017)","DOI":"10.1109\/CVPR.2017.686"},{"key":"10_CR21","doi-asserted-by":"crossref","unstructured":"Ji, Y., et al.: DDP: diffusion model for dense visual prediction. arXiv preprint arXiv:2303.17559 (2023)","DOI":"10.1109\/ICCV51070.2023.01987"},{"key":"10_CR22","doi-asserted-by":"publisher","unstructured":"Jiang, P.T., Zhang, C.B., Hou, Q., Cheng, M.M., Wei, Y.: LayerCam: exploring hierarchical class activation maps. IEEE Trans. Image Process. (2021). https:\/\/doi.org\/10.1109\/TIP.2021.3089943","DOI":"10.1109\/TIP.2021.3089943"},{"key":"10_CR23","doi-asserted-by":"crossref","unstructured":"Ke, B., Obukhov, A., Huang, S., Metzger, N., Daudt, R.C., Schindler, K.: Repurposing diffusion-based image generators for monocular depth estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) (2024)","DOI":"10.1109\/CVPR52733.2024.00907"},{"key":"10_CR24","doi-asserted-by":"publisher","first-page":"147808","DOI":"10.1109\/ACCESS.2020.3016008","volume":"8","author":"D Kim","year":"2020","unstructured":"Kim, D., Lee, S., Lee, J., Kim, J.: Leveraging contextual information for monocular depth estimation. IEEE Access 8, 147808\u2013147817 (2020). 
https:\/\/doi.org\/10.1109\/ACCESS.2020.3016008","journal-title":"IEEE Access"},{"key":"10_CR25","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: Pereira, F., Burges, C., Bottou, L., Weinberger, K. (eds.) Advances in Neural Information Processing Systems, vol.\u00a025. Curran Associates, Inc. (2012). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2012\/file\/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf"},{"issue":"5","key":"10_CR26","doi-asserted-by":"publisher","first-page":"7","DOI":"10.1167\/jov.22.5.7","volume":"22","author":"M K\u00fcmmerer","year":"2022","unstructured":"K\u00fcmmerer, M., Bethge, M., Wallis, T.S.A.: DeepGaze III: modeling free-viewing human scanpaths with deep learning. J. Vis. 22(5), 7 (2022). https:\/\/doi.org\/10.1167\/jov.22.5.7","journal-title":"J. Vis."},{"key":"10_CR27","unstructured":"K\u00fcmmerer, M., Theis, L., Bethge, M.: Deep Gaze I: boosting saliency prediction with feature maps trained on ImageNet. In: International Conference on Learning Representations (ICLR) Workshops (2015)"},{"issue":"10","key":"10_CR28","doi-asserted-by":"publisher","first-page":"1147","DOI":"10.1167\/17.10.1147","volume":"17","author":"M K\u00fcmmerer","year":"2017","unstructured":"K\u00fcmmerer, M., Wallis, T., Bethge, M.: DeepGaze II: predicting fixations from deep features over time and tasks. J. Vis. 17(10), 1147 (2017). https:\/\/doi.org\/10.1167\/17.10.1147","journal-title":"J. Vis."},{"key":"10_CR29","unstructured":"Lavreniuk, M., Bhat, S.F., M\u00fcller, M., Wonka, P.: EVP: enhanced visual perception using inverse multi-attentive feature refinement and regularized image-text alignment. arXiv preprint arXiv:2312.08548 (2023)"},{"issue":"7553","key":"10_CR30","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1038\/nature14539","volume":"521","author":"Y LeCun","year":"2015","unstructured":"LeCun, Y., Bengio, Y., Hinton, G.: Deep learning. Nature 521(7553), 436 (2015)","journal-title":"Nature"},{"key":"10_CR31","doi-asserted-by":"crossref","unstructured":"Li, Z., Chen, Z., Liu, X., Jiang, J.: DepthFormer: exploiting long-range correlation and local information for accurate monocular depth estimation. arXiv preprint arXiv:2203.14211 (2022)","DOI":"10.1007\/s11633-023-1458-0"},{"key":"10_CR32","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: Microsoft COCO: common objects in context. arXiv preprint arXiv:1405.0312 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"10_CR33","doi-asserted-by":"publisher","unstructured":"Linardos, A., Kummerer, M., Press, O., Bethge, M.: DeepGaze IIE: calibrated prediction in and out-of-domain for state-of-the-art saliency modeling. In: 2021 IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 12899\u201312908. IEEE Computer Society, Los Alamitos, CA, USA (2021). https:\/\/doi.org\/10.1109\/ICCV48922.2021.01268","DOI":"10.1109\/ICCV48922.2021.01268"},{"issue":"2","key":"10_CR34","doi-asserted-by":"publisher","first-page":"91","DOI":"10.1023\/B:VISI.0000029664.99615.94","volume":"60","author":"DG Lowe","year":"2004","unstructured":"Lowe, D.G.: Distinctive image features from scale-invariant keypoints. Int. J. Comput. Vis. 60(2), 91\u2013110 (2004). https:\/\/doi.org\/10.1023\/B:VISI.0000029664.99615.94","journal-title":"Int. J. Comput. 
Vis."},{"key":"10_CR35","doi-asserted-by":"publisher","unstructured":"Miyagawa, S., Lesure, C., N\u00f3brega, V.A.: Cross-modality information transfer: a hypothesis about the relationship among prehistoric cave paintings, symbolic thinking, and the emergence of language. Front. Psychol. 9 (2018). https:\/\/doi.org\/10.3389\/fpsyg.2018.00115","DOI":"10.3389\/fpsyg.2018.00115"},{"key":"10_CR36","doi-asserted-by":"publisher","unstructured":"Nguyen, N.V., Rigaud, C., Burie, J.C.: Digital comics image indexing based on deep learning. J. Imag. 4(7) (2018). https:\/\/doi.org\/10.3390\/jimaging4070089","DOI":"10.3390\/jimaging4070089"},{"key":"10_CR37","unstructured":"Omeiza, D., Speakman, S., Cintas, C., Weldemariam, K.: Smooth Grad-Cam++: an enhanced inference level visualization technique for deep convolutional neural network models. arXiv preprint arXiv:1908.01224 (2019)"},{"key":"10_CR38","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"10_CR39","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction. arXiv preprint (2021)","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"10_CR40","unstructured":"Ranftl, R., Lasinger, K., Hafner, D., Schindler, K., Koltun, V.: Towards robust monocular depth estimation: mixing datasets for zero-shot cross-dataset transfer. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) (2020)"},{"key":"10_CR41","doi-asserted-by":"crossref","unstructured":"Reddy, N., Jain, S., Yarlagadda, P., Gandhi, V.: Tidying deep saliency prediction architectures. In: International Conference on Intelligent Robots and Systems (IROS) (2020). https:\/\/arxiv.org\/abs\/2003.04942","DOI":"10.1109\/IROS45743.2020.9341574"},{"key":"10_CR42","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10_CR43","doi-asserted-by":"publisher","unstructured":"Sekachev, B., et al.: OpenCV\/CVAT: v1.1.0 (2020). https:\/\/doi.org\/10.5281\/zenodo.4009388","DOI":"10.5281\/zenodo.4009388"},{"key":"10_CR44","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D.: Grad-cam: visual explanations from deep networks via gradient-based localization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"10_CR45","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models (2023)"},{"key":"10_CR46","doi-asserted-by":"publisher","unstructured":"Viola, P., Jones, M.: Robust real-time face detection. In: Proceedings Eighth IEEE International Conference on Computer Vision. ICCV 2001, vol.\u00a02, p. 747 (2001). https:\/\/doi.org\/10.1109\/ICCV.2001.937709","DOI":"10.1109\/ICCV.2001.937709"},{"key":"10_CR47","doi-asserted-by":"publisher","unstructured":"Wagemans, J., Elder, J.H., Kubovy, M.: A century of gestalt psychology in visual perception: I. perceptual grouping and figure-ground organization. Psychol. Bull. (2012). https:\/\/doi.org\/10.1037\/a0029333","DOI":"10.1037\/a0029333"},{"key":"10_CR48","doi-asserted-by":"crossref","unstructured":"Yang, L., Kang, B., Huang, Z., Xu, X., Feng, J., Zhao, H.: Depth anything: unleashing the power of large-scale unlabeled data. 
In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00987"},{"key":"10_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, B., Khosla, A., Lapedriza, A., Oliva, A., Torralba, A.: Learning deep features for discriminative localization. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.319"},{"key":"10_CR50","doi-asserted-by":"publisher","unstructured":"Zoran, D., Isola, P., Krishnan, D., Freeman, W.T.: Learning ordinal relationships for mid-level vision. In: 2015 IEEE International Conference on Computer Vision (ICCV), pp. 388\u2013396 (2015). https:\/\/doi.org\/10.1109\/ICCV.2015.52","DOI":"10.1109\/ICCV.2015.52"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-92808-6_10","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T15:59:24Z","timestamp":1748361564000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-92808-6_10"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031928079","9783031928086"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-92808-6_10","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
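The record above is a standard Crossref REST API "work" response. As a minimal illustrative sketch only, the Python snippet below shows how such a record can be fetched and summarised with the standard library. The api.crossref.org/works/{DOI} endpoint and the message layout match the record shown; the mailto parameter (Crossref's optional "polite pool" hint), the placeholder address, and the output formatting are assumptions of this example, not part of the record.

import json
import urllib.request

# DOI taken from the record above.
DOI = "10.1007/978-3-031-92808-6_10"
# "mailto" is Crossref's optional polite-pool hint; the address is a placeholder.
url = f"https://api.crossref.org/works/{DOI}?mailto=you@example.org"

with urllib.request.urlopen(url) as resp:
    # Top level mirrors the record above: {"status": "ok", ..., "message": {...}}.
    work = json.load(resp)["message"]

title = work["title"][0]
authors = ", ".join(f"{a['given']} {a['family']}" for a in work.get("author", []))
venue = work.get("container-title", ["?"])[-1]  # e.g. the ECCV 2024 Workshops volume
year = work["issued"]["date-parts"][0][0]

print(f"{authors}: {title}.")
print(f"In: {venue}, pp. {work.get('page', '?')} ({year}).")
print(f"https://doi.org/{work['DOI']} - {work.get('references-count', 0)} references")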