{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,4,5]],"date-time":"2025-04-05T04:10:23Z","timestamp":1743826223830,"version":"3.40.3"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031887079","type":"print"},{"value":"9783031887086","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-88708-6_28","type":"book-chapter","created":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:49:23Z","timestamp":1743767363000},"page":"437-452","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Identity-Aware Cross-Modal Retrieval: A\u00a0Dataset and\u00a0a\u00a0Baseline"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3011-2487","authenticated-orcid":false,"given":"Nicola","family":"Messina","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7182-7038","authenticated-orcid":false,"given":"Lucia","family":"Vadicamo","sequence":"additional","affiliation":[]},{"given":"Leo","family":"Maltese","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3715-149X","authenticated-orcid":false,"given":"Claudio","family":"Gennaro","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,4,3]]},"reference":[{"key":"28_CR1","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Aberman, K., Fried, O., Cohen-Or, D., Lischinski, D.: Break-a-scene: extracting multiple concepts from a single image. In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201312 (2023)","DOI":"10.1145\/3610548.3618154"},{"key":"28_CR2","doi-asserted-by":"crossref","unstructured":"Cao, Q., Shen, L., Xie, W., Parkhi, O.M., Zisserman, A.: VGGFace2: a dataset for recognising faces across pose and age. In: 2018 13th IEEE International Conference on Automatic Face & Gesture Recognition (FG 2018), pp. 67\u201374. IEEE (2018)","DOI":"10.1109\/FG.2018.00020"},{"key":"28_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Z., Zhang, L., Weng, F., Pan, L., Lan, Z.: Tailored visions: enhancing text-to-image generation with personalized prompt rewriting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7727\u20137736 (2024)","DOI":"10.1109\/CVPR52733.2024.00738"},{"key":"28_CR4","doi-asserted-by":"publisher","unstructured":"Cohen, N., Gal, R., Meirom, E.A., Chechik, G., Atzmon, Y.: \u201cthis is my unicorn, fluffy\u201d: Personalizing frozen vision-language representations. In: European Conference on Computer Vision, pp. 558\u2013577. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-20044-1_32","DOI":"10.1007\/978-3-031-20044-1_32"},{"key":"28_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"264","DOI":"10.1007\/978-3-030-51310-8_24","volume-title":"Natural Language Processing and Information Systems","author":"S Dost","year":"2020","unstructured":"Dost, S., Serafini, L., Rospocher, M., Ballan, L., Sperduti, A.: Jointly linking visual and textual entity mentions with background knowledge. In: M\u00e9tais, E., Meziane, F., Horacek, H., Cimiano, P. (eds.) NLDB 2020. LNCS, vol. 12089, pp. 264\u2013276. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-51310-8_24"},{"key":"28_CR6","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: VSE++: improving visual-semantic embeddings with hard negatives (2018). https:\/\/github.com\/fartashf\/vsepp"},{"key":"28_CR7","unstructured":"Gal, R., et al.: An image is worth one word: Personalizing text-to-image generation using textual inversion. In: The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, 1-5 May 2023 (2023). https:\/\/openreview.net\/pdf?id=NAQvF08TcyG"},{"issue":"4","key":"28_CR8","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3592133","volume":"42","author":"R Gal","year":"2023","unstructured":"Gal, R., Arar, M., Atzmon, Y., Bermano, A.H., Chechik, G., Cohen-Or, D.: Encoder-based domain tuning for fast personalization of text-to-image models. ACM Trans. Graph. (TOG) 42(4), 1\u201313 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"28_CR9","doi-asserted-by":"crossref","unstructured":"Gatti, P., Penamakuri, A.S., Teotia, R., Mishra, A., Sengupta, S., Ramnani, R.: COFAR: commonsense and factual reasoning in image search. arXiv preprint arXiv:2210.08554 (2022)","DOI":"10.18653\/v1\/2022.aacl-main.87"},{"key":"28_CR10","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"28_CR11","doi-asserted-by":"publisher","unstructured":"Jia, M., Tang, L., Chen, B.C., Cardie, C., Belongie, S., Hariharan, B., Lim, S.N.: Visual prompt tuning. In: European Conference on Computer Vision, pp. 709\u2013727. Springer (2022). https:\/\/doi.org\/10.1007\/978-3-031-19827-4_41","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"28_CR12","unstructured":"Korbar, B., Zisserman, A.: Personalised clip or: how to find your vacation videos (2022)"},{"key":"28_CR13","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"28_CR14","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4654\u20134662 (2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"28_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"121","DOI":"10.1007\/978-3-030-58577-8_8","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Li","year":"2020","unstructured":"Li, X., et al.: Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 121\u2013137. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_8"},{"key":"28_CR16","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"28_CR17","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: VilBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"28_CR18","doi-asserted-by":"crossref","unstructured":"Messina, N., et al.: Aladin: Distilling fine-grained alignment scores for efficient image-text matching and retrieval. In: International Conference on Content-based Multimedia Indexing, pp. 64\u201370 (2022)","DOI":"10.1145\/3549555.3549576"},{"key":"28_CR19","unstructured":"Oord, A.v.d., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. arXiv preprint arXiv:1807.03748 (2018)"},{"key":"28_CR20","doi-asserted-by":"crossref","unstructured":"Pang, L., Yin, J., Xie, H., Wang, Q., Li, Q., Mao, X.: Cross initialization for face personalization of text-to-image models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8393\u20138403 (2024)","DOI":"10.1109\/CVPR52733.2024.00802"},{"key":"28_CR21","doi-asserted-by":"crossref","unstructured":"Qu, L., Liu, M., Cao, D., Nie, L., Tian, Q.: Context-aware multi-view summarization network for image-text matching. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1047\u20131055 (2020)","DOI":"10.1145\/3394171.3413961"},{"key":"28_CR22","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: Proceedings of the 38th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021). https:\/\/proceedings.mlr.press\/v139\/radford21a.html"},{"key":"28_CR23","doi-asserted-by":"publisher","unstructured":"Ramanathan, V., Joulin, A., Liang, P., Fei-Fei, L.: Linking people in videos with \u201ctheir\u201d names using coreference resolution. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, 6-12 September 2014, Proceedings, Part I 13, pp. 95\u2013110. Springer (2014). https:\/\/doi.org\/10.1007\/978-3-319-10590-1_7","DOI":"10.1007\/978-3-319-10590-1_7"},{"key":"28_CR24","doi-asserted-by":"crossref","unstructured":"Rosasco, A., et al.: ConCon-Chi: concept-context chimera benchmark for personalized vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22239\u201322248 (2024)","DOI":"10.1109\/CVPR52733.2024.02099"},{"key":"28_CR25","doi-asserted-by":"crossref","unstructured":"Sarafianos, N., Xu, X., Kakadiaris, I.A.: Adversarial representation learning for text-to-image matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5814\u20135824 (2019)","DOI":"10.1109\/ICCV.2019.00591"},{"key":"28_CR26","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"28_CR27","doi-asserted-by":"crossref","unstructured":"Stefanini, M., Cornia, M., Baraldi, L., Cucchiara, R.: A novel attention-based aggregation function to combine vision and language. In: 2020 25th International Conference on Pattern Recognition (ICPR), pp. 1212\u20131219. IEEE (2021)","DOI":"10.1109\/ICPR48806.2021.9413269"},{"key":"28_CR28","unstructured":"Su, W., et al.: Vl-BERT: pre-training of generic visual-linguistic representations. In: International Conference on Learning Representations (2020). https:\/\/openreview.net\/forum?id=SygXPaEYvH"},{"key":"28_CR29","doi-asserted-by":"crossref","unstructured":"Sun, W., Fan, Y., Guo, J., Zhang, R., Cheng, X.: Visual named entity linking: a new dataset and a baseline. arXiv preprint arXiv:2211.04872 (2022)","DOI":"10.18653\/v1\/2022.findings-emnlp.178"},{"key":"28_CR30","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Gal, R., Chechik, G., Atzmon, Y.: Key-locked rank one editing for text-to-image personalization. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591506"},{"key":"28_CR31","doi-asserted-by":"crossref","unstructured":"Vadicamo, L., et\u00a0al.: Evaluating performance and trends in interactive video retrieval: insights from the 12th VBS competition. IEEE Access 12, 79342\u201379366 (2024)","DOI":"10.1109\/ACCESS.2024.3405638"},{"key":"28_CR32","doi-asserted-by":"crossref","unstructured":"Wang, W., et al.: Image as a foreign language: BEiT pretraining for vision and vision-language tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"28_CR33","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: Elite: Encoding visual concepts into textual embeddings for customized text-to-image generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15943\u201315953 (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"issue":"7","key":"28_CR34","doi-asserted-by":"crossref","first-page":"2866","DOI":"10.1109\/TCSVT.2020.3030656","volume":"31","author":"K Wen","year":"2020","unstructured":"Wen, K., Gu, X., Cheng, Q.: Learning dual semantic relations with graph attention for image-text matching. IEEE Trans. Circ. Syst. Video Technol. 31(7), 2866\u20132879 (2020)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"28_CR35","doi-asserted-by":"crossref","unstructured":"Yeh, C.H., Russell, B., Sivic, J., Heilbron, F.C., Jenni, S.: Meta-personalizing vision-language models to find named instances in video. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19123\u201319132 (2023)","DOI":"10.1109\/CVPR52729.2023.01833"},{"key":"28_CR36","unstructured":"Yi, D., Lei, Z., Liao, S., Li, S.Z.: Learning face representation from scratch. arXiv preprint arXiv:1411.7923 (2014)"},{"key":"28_CR37","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"28_CR38","doi-asserted-by":"crossref","unstructured":"Zeng, Y., et al.: Jedi: Joint-image diffusion models for finetuning-free personalized text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6786\u20136795 (2024)","DOI":"10.1109\/CVPR52733.2024.00648"},{"issue":"10","key":"28_CR39","doi-asserted-by":"crossref","first-page":"1499","DOI":"10.1109\/LSP.2016.2603342","volume":"23","author":"K Zhang","year":"2016","unstructured":"Zhang, K., Zhang, Z., Li, Z., Qiao, Y.: Joint face detection and alignment using multitask cascaded convolutional networks. IEEE Sig. Process. Lett. 23(10), 1499\u20131503 (2016)","journal-title":"IEEE Sig. Process. Lett."},{"key":"28_CR40","doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: VinVL: revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5579\u20135588 (2021)","DOI":"10.1109\/CVPR46437.2021.00553"},{"issue":"1","key":"28_CR41","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1162\/dint_a_00114","volume":"4","author":"Q Zheng","year":"2022","unstructured":"Zheng, Q., Wen, H., Wang, M., Qi, G.: Visual entity linking via multi-modal learning. Data Intell. 4(1), 1\u201319 (2022)","journal-title":"Data Intell."}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-88708-6_28","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,4]],"date-time":"2025-04-04T11:49:48Z","timestamp":1743767388000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-88708-6_28"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031887079","9783031887086"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-88708-6_28","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"3 April 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"The authors have no competing interests to declare that are relevant to the content of this article.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Disclosure of Interests"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Lucca","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 April 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 April 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"47","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2025.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}