{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T18:40:24Z","timestamp":1743014424283,"version":"3.40.3"},"publisher-location":"Cham","reference-count":67,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031282409"},{"type":"electronic","value":"9783031282416"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-28241-6_5","type":"book-chapter","created":{"date-parts":[[2023,3,16]],"date-time":"2023-03-16T01:02:20Z","timestamp":1678928540000},"page":"68-85","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Scene-Centric vs. Object-Centric Image-Text Cross-Modal Retrieval: A\u00a0Reproducibility Study"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0314-2955","authenticated-orcid":false,"given":"Mariya","family":"Hendriksen","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5278-8886","authenticated-orcid":false,"given":"Svitlana","family":"Vakulenko","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8075-4894","authenticated-orcid":false,"given":"Ernst","family":"Kuiper","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1086-0202","authenticated-orcid":false,"given":"Maarten","family":"de Rijke","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,3,16]]},"reference":[{"unstructured":"ACM (2020) Artifact Review and Badging - Current. https:\/\/www.acm.org\/publications\/policies\/artifact-review-and-badging-current Accessed Aug 7 2022","key":"5_CR1"},{"unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. arXiv preprint arXiv:2204.14198 (2022)","key":"5_CR2"},{"key":"5_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"535","DOI":"10.1007\/978-3-030-99736-6_36","volume-title":"Advances in Information Retrieval","author":"M Bleeker","year":"2022","unstructured":"Bleeker, M., de Rijke, M.: Do lessons from\u00a0metric learning generalize to\u00a0image-caption retrieval? In: Hagen, M., et al. (eds.) ECIR 2022. LNCS, vol. 13185, pp. 535\u2013551. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-99736-6_36"},{"key":"5_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1007\/978-3-030-58545-7_39","volume-title":"Computer Vision \u2013 ECCV 2020","author":"A Brown","year":"2020","unstructured":"Brown, A., Xie, W., Kalogeiton, V., Zisserman, A.: Smooth-AP: smoothing the path towards large-scale image retrieval. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 677\u2013694. Springer, Cham (2020). 
https:\/\/doi.org\/10.1007\/978-3-030-58545-7_39"},{"doi-asserted-by":"crossref","unstructured":"Carvalho, M., Cad\u00e8ne, R., Picard, D., Soulier, L., Thome, N., Cord, M.: Cross-modal retrieval in the cooking context: Learning semantic text-image embeddings. In: The 41st International ACM SIGIR Conference on Research & Development in Information Retrieval, pp. 35\u201344 (2018)","key":"5_CR5","DOI":"10.1145\/3209978.3210036"},{"doi-asserted-by":"crossref","unstructured":"Chen, Y.C., et al.: Uniter: Learning universal image-text representations. In: Computer Vision - ECCV 2020, Springer International Publishing, pp. 104\u2013120 (2020)","key":"5_CR6","DOI":"10.1007\/978-3-030-58577-8_7"},{"doi-asserted-by":"crossref","unstructured":"Collins, J., et al.: Abo: Dataset and benchmarks for real-world 3d object understanding. CVPR (2022)","key":"5_CR7","DOI":"10.1109\/CVPR52688.2022.02045"},{"unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint (2018). arXiv:1810.04805","key":"5_CR8"},{"unstructured":"Dosovitskiy, A., et al. (2020) An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","key":"5_CR9"},{"doi-asserted-by":"crossref","unstructured":"Dou, Z.Y., et al.: An empirical study of training end-to-end vision-and-language transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18166\u201318176 (2022)","key":"5_CR10","DOI":"10.1109\/CVPR52688.2022.01763"},{"unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.: Vse++: Improving visual-semantic embeddings with hard negatives (2017). arXiv preprint arXiv:1707.05612","key":"5_CR11"},{"unstructured":"Frome, A.: Devise: A deep visual-semantic embedding model. In: Proceedings of the 26th International Conference on Neural Information Processing Systems - Volume 2, Curran Associates Inc., NIPS\u201913, pp 2121\u20132129 (2013)","key":"5_CR12"},{"doi-asserted-by":"crossref","unstructured":"Gao, D.: Fashionbert: Text and image matching with adaptive loss for cross-modal retrieval. In: Proceedings of the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 2251\u20132260 (2020)","key":"5_CR13","DOI":"10.1145\/3397271.3401430"},{"unstructured":"Goei, K., Hendriksen, M., de Rijke, M.: Tackling attribute fine-grainedness in cross-modal fashion search with multi-level features. In: SIGIR 2021 Workshop on eCommerce, ACM (2021)","key":"5_CR14"},{"key":"5_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1007\/978-3-319-10593-2_35","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Y Gong","year":"2014","unstructured":"Gong, Y., Wang, L., Hodosh, M., Hockenmaier, J., Lazebnik, S.: Improving image-sentence embeddings using large weakly annotated photo collections. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8692, pp. 529\u2013545. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10593-2_35"},{"doi-asserted-by":"crossref","unstructured":"Gu, J., Cai, J., Joty, S.R., Niu, L., Wang, G.: Look, imagine and match: Improving textual-visual cross-modal retrieval with generative models. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 
7181\u20137189 (2018)","key":"5_CR16","DOI":"10.1109\/CVPR.2018.00750"},{"doi-asserted-by":"crossref","unstructured":"Han, X., et al.: Automatic spatially-aware fashion concept discovery. In: Proceedings of the IEEE international Conference on Computer Vision, pp. 1463\u20131471 (2017)","key":"5_CR17","DOI":"10.1109\/ICCV.2017.163"},{"doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","key":"5_CR18","DOI":"10.1109\/CVPR.2016.90"},{"key":"5_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"505","DOI":"10.1007\/978-3-030-99739-7_62","volume-title":"Advances in Information Retrieval","author":"M Hendriksen","year":"2022","unstructured":"Hendriksen, M.: Multimodal retrieval in e-commerce. In: Hagen, M., et al. (eds.) ECIR 2022. LNCS, vol. 13186, pp. 505\u2013512. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-99739-7_62"},{"key":"5_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"289","DOI":"10.1007\/978-3-030-99736-6_20","volume-title":"Advances in Information Retrieval","author":"M Hendriksen","year":"2022","unstructured":"Hendriksen, M., Bleeker, M., Vakulenko, S., van Noord, N., Kuiper, E., de Rijke, M.: Extending CLIP for category-to-image retrieval in e-commerce. In: Hagen, M., et al. (eds.) ECIR 2022. LNCS, vol. 13185, pp. 289\u2013303. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-030-99736-6_20"},{"doi-asserted-by":"crossref","unstructured":"Herranz, L., Jiang, S., Li, X.: Scene recognition with cnns: objects, scales and dataset bias. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 571\u2013579 (2016)","key":"5_CR21","DOI":"10.1109\/CVPR.2016.68"},{"doi-asserted-by":"crossref","unstructured":"Hu, P., Zhen, L., Peng, D., Liu, P.: Scalable deep multimodal learning for cross-modal retrieval. In: Proceedings of the 42nd international ACM SIGIR conference on research and development in information retrieval, pp. 635\u2013644 (2019)","key":"5_CR22","DOI":"10.1145\/3331184.3331213"},{"unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, PMLR, pp. 4904\u20134916 (2021)","key":"5_CR23"},{"doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3128\u20133137 (2015)","key":"5_CR24","DOI":"10.1109\/CVPR.2015.7298932"},{"unstructured":"Kim, W., Son, B., Kim, I.: Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning, PMLR, pp. 5583\u20135594 (2021)","key":"5_CR25"},{"unstructured":"Klein, B., Lev, G., Sadeh, G., Wolf, L.: Fisher vectors derived from hybrid gaussian-laplacian mixture models for image annotation. (2014) arXiv preprint arXiv:1411.7399","key":"5_CR26"},{"doi-asserted-by":"crossref","unstructured":"Krause, J., Stark, M., Deng, J., Fei-Fei, L.: 3d object representations for fine-grained categorization. 
In: 4th International IEEE Workshop on 3D Representation and Recognition (3dRR-13), Sydney, Australia (2013)","key":"5_CR27","DOI":"10.1109\/ICCVW.2013.77"},{"unstructured":"Laenen, K.: Cross-modal representation learning for fashion search and recommendation. PhD thesis, KU Leuven (2022)","key":"5_CR28"},{"unstructured":"Laenen, K., Zoghbi, S., Moens, M.F.: Cross-modal search for fashion attributes. In: Proceedings of the KDD 2017 Workshop on Machine Learning Meets Fashion, ACM, vol 2017, pp. 1\u201310 (2017)","key":"5_CR29"},{"doi-asserted-by":"crossref","unstructured":"Laenen, K., Zoghbi, S., Moens, M.F.: Web search of fashion items with multimodal querying. In: Proceedings of the Eleventh ACM International Conference on Web Search and Data Mining, pp. 342\u2013350 (2018)","key":"5_CR30","DOI":"10.1145\/3159652.3159716"},{"doi-asserted-by":"crossref","unstructured":"Lee, K.H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 201\u2013216 (2018)","key":"5_CR31","DOI":"10.1007\/978-3-030-01225-0_13"},{"doi-asserted-by":"crossref","unstructured":"Li, A., Jabri, A., Joulin, A., Van Der Maaten, L.: Learning visual n-grams from web data. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4183\u20134192 (2017)","key":"5_CR32","DOI":"10.1109\/ICCV.2017.449"},{"key":"5_CR33","first-page":"11336","volume":"34","author":"G Li","year":"2020","unstructured":"Li, G., Duan, N., Fang, Y., Gong, M., Jiang, D.: Unicoder-vl: a universal encoder for vision and language by cross-modal pre-training. Proc. AAAI Conf. Artif. Intell. 34, 11336\u201311344 (2020)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"5_CR34","first-page":"9694","volume":"34","author":"J Li","year":"2021","unstructured":"Li, J., Selvaraju, R., Gotmare, A., Joty, S., Xiong, C., Hoi, S.C.H.: Align before fuse: vision and language representation learning with momentum distillation. Adv. Neural. Inf. Process. Syst. 34, 9694\u20139705 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, Z., Liu, A.A., Zhang, T., Wang, B., Zhang, Y.: Focus your attention: A bidirectional focal attention network for image-text matching. In: Proceedings of the 27th ACM International Conference on Multimedia, pp. 3\u201311 (2019)","key":"5_CR36","DOI":"10.1145\/3343031.3350869"},{"doi-asserted-by":"crossref","unstructured":"Liu, Z.: Swin transformer: Hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","key":"5_CR37","DOI":"10.1109\/ICCV48922.2021.00986"},{"doi-asserted-by":"crossref","unstructured":"Messina, N., Amato, G., Esuli, A., Falchi, F., Gennaro, C., Marchand-Maillet, S.: Fine-grained visual textual alignment for cross-modal retrieval using transformer encoders. 
ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM) 17(4), 1\u201323 (2021)","key":"5_CR38","DOI":"10.1145\/3451390"},{"doi-asserted-by":"crossref","unstructured":"Nam, H., Ha, J.W., Kim, J.: Dual attention networks for multimodal reasoning and matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 299\u2013307 (2017)","key":"5_CR39","DOI":"10.1109\/CVPR.2017.232"},{"doi-asserted-by":"crossref","unstructured":"Nilsback, M.E., Zisserman, A.: Automated flower classification over a large number of classes. In: Indian Conference on Computer Vision, Graphics and Image Processing (2008)","key":"5_CR40","DOI":"10.1109\/ICVGIP.2008.47"},{"doi-asserted-by":"crossref","unstructured":"Petrov, A., Macdonald, C.: A systematic review and replicability study of bert4rec for sequential recommendation. In: Proceedings of the 16th ACM Conference on Recommender Systems, pp. 436\u2013447 (2022)","key":"5_CR41","DOI":"10.1145\/3523227.3548487"},{"unstructured":"Qi, D., Su, L., Song, J., Cui, E., Bharti, T., Sacheti, A.: Imagebert: Cross-modal pre-training with large-scale weak-supervised image-text data. (2020) arXiv preprint arXiv:2001.07966","key":"5_CR42"},{"issue":"8","key":"5_CR43","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, PMLR, pp. 8748\u20138763 (2021)","key":"5_CR44"},{"doi-asserted-by":"crossref","unstructured":"Rao, J, et al.: Where does the performance improvement come from?: - A reproducibility concern about image-text retrieval. In: SIGIR \u201922: The 45th International ACM SIGIR Conference on Research and Development in Information Retrieval, Madrid, Spain, July 11\u201315, 2022, ACM, pp. 2727\u20132737 (2022)","key":"5_CR45","DOI":"10.1145\/3477495.3531715"},{"doi-asserted-by":"crossref","unstructured":"Reed, S., Akata, Z., Lee, H., Schiele, B.: Learning deep representations of fine-grained visual descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 49\u201358 (2016)","key":"5_CR46","DOI":"10.1109\/CVPR.2016.13"},{"doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., Soricut, R.: Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565 (2018)","key":"5_CR47","DOI":"10.18653\/v1\/P18-1238"},{"key":"5_CR48","doi-asserted-by":"publisher","first-page":"14","DOI":"10.1016\/j.imavis.2019.03.004","volume":"85","author":"ZY Shen","year":"2019","unstructured":"Shen, Z.Y., Han, S.Y., Fu, L.C., Hsiao, P.Y., Lau, Y.C., Chang, S.J.: Deep convolution neural network with scene-centric and object-centric information for object detection. Image Vis. Comput. 85, 14\u201325 (2019)","journal-title":"Image Vis. 
Comput."},{"issue":"9","key":"5_CR49","doi-asserted-by":"publisher","first-page":"105","DOI":"10.3390\/computers10090105","volume":"10","author":"S Sheng","year":"2021","unstructured":"Sheng, S., Laenen, K., Van Gool, L., Moens, M.F.: Fine-grained cross-modal retrieval for cultural items with focal attention and hierarchical encodings. Computers 10(9), 105 (2021)","journal-title":"Computers"},{"unstructured":"Song, J., Choi, S. Image-text alignment using adaptive cross-attention with transformer encoder for scene graphs (2021)","key":"5_CR50"},{"unstructured":"Tan, M., Le, Q.: Efficientnet: Rethinking model scaling for convolutional neural networks. In: International Conference on Machine Learning, PMLR, pp. 6105\u20136114 (2019)","key":"5_CR51"},{"doi-asserted-by":"crossref","unstructured":"Ueki, K.: Survey of visual-semantic embedding methods for zero-shot image retrieval. In: 2021 20th IEEE International Conference on Machine Learning and Applications (ICMLA), IEEE, pp. 628\u2013634 (2021)","key":"5_CR52","DOI":"10.1109\/ICMLA52953.2021.00105"},{"unstructured":"Varamesh, A., Diba, A., Tuytelaars, T., Van Gool, L.: Self-supervised ranking for representation learning. (2020) arXiv preprint arXiv:2010.07258","key":"5_CR53"},{"unstructured":"Vaswani, A.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)","key":"5_CR54"},{"key":"5_CR55","doi-asserted-by":"publisher","first-page":"355","DOI":"10.1007\/3-540-45691-0_34","volume-title":"Evaluation of Cross-Language Information Retrieval Systems","author":"EM Voorhees","year":"2002","unstructured":"Voorhees, E.M.: The philosophy of information retrieval evaluation. In: Evaluation of Cross-Language Information Retrieval Systems, pp. 355\u2013370. Springer, Berlin Heidelberg (2002)"},{"key":"5_CR56","doi-asserted-by":"publisher","first-page":"2515","DOI":"10.1109\/TMM.2021.3083109","volume":"24","author":"H Wang","year":"2021","unstructured":"Wang, H., et al.: Cross-modal food retrieval: learning a joint embedding of food images and recipes with semantic consistency and attention mechanism. IEEE Trans. Multimedia 24, 2515\u20132525 (2021)","journal-title":"IEEE Trans. Multimedia"},{"unstructured":"Wang, K., Yin, Q., Wang, W., Wu, S., Wang, L.: A comprehensive survey on cross-modal retrieval. arXiv preprint (2016). arXiv:1607.06215","key":"5_CR57"},{"doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5005\u20135013 (2016)","key":"5_CR58","DOI":"10.1109\/CVPR.2016.541"},{"unstructured":"Welinder, P.: Caltech-UCSD Birds 200. Tech. Rep. CNS-TR-2010-001, California Institute of Technology (2010)","key":"5_CR59"},{"unstructured":"Yu, J., Wang, Z., Vasudevan, V., Yeung, L., Seyedhosseini, M., Wu, Y.: Coca: Contrastive captioners are image-text foundation models. (2022). arXiv preprint arXiv:2205.01917","key":"5_CR60"},{"unstructured":"Zeng, Y., Zhang, X., Li, H.: Multi-grained vision language pre-training: Aligning texts with visual concepts. In: Chaudhuri K, Jegelka S, Song L, Szepesv\u00e1ri C, Niu G, Sabato S (eds) International Conference on Machine Learning, ICML 2022, 17\u201323 July 2022, Baltimore, Maryland, USA, PMLR, Proceedings of Machine Learning Research, vol 162, pp. 
25994\u201326009 (2022)","key":"5_CR61"},{"doi-asserted-by":"crossref","unstructured":"Zhang, C., et al.: Mosaicos: a simple and effective use of object-centric images for long-tailed object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 417\u2013427 (2021)","key":"5_CR62","DOI":"10.1109\/ICCV48922.2021.00047"},{"doi-asserted-by":"crossref","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y.: Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15661\u201315670 (2022)","key":"5_CR63","DOI":"10.1109\/CVPR52688.2022.01521"},{"doi-asserted-by":"crossref","unstructured":"Zhang, P., et al.: Vinvl: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5579\u20135588 (2021)","key":"5_CR64","DOI":"10.1109\/CVPR46437.2021.00553"},{"unstructured":"Zhang, Y., Jiang, H., Miura, Y., Manning, C.D., Langlotz, C.P.: Contrastive learning of medical visual representations from paired images and text (2020). arXiv preprint arXiv:2010.00747","key":"5_CR65"},{"unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., Oliva, A.: Learning deep features for scene recognition using places database. In: Advances in Neural Information Processing Systems vol. 27 (2014)","key":"5_CR66"},{"doi-asserted-by":"crossref","unstructured":"Zhuge, M., et al.: Kaleido-bert: Vision-language pre-training on fashion domain. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12647\u201312657 (2021)","key":"5_CR67","DOI":"10.1109\/CVPR46437.2021.01246"}],"container-title":["Lecture Notes in Computer Science","Advances in Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-28241-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,5]],"date-time":"2024-03-05T13:54:03Z","timestamp":1709646843000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-28241-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031282409","9783031282416"],"references-count":67,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-28241-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"16 March 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECIR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Information Retrieval","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Dublin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Ireland","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2 April 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 April 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"45","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ecir2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ecir2023.org\/index.html?v=1.0","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"489","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"77","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"83","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"16% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}