{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T07:26:39Z","timestamp":1768375599611,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819556786","type":"print"},{"value":"9789819556793","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5679-3_1","type":"book-chapter","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T18:36:38Z","timestamp":1768329398000},"page":"3-16","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Reawakening Intra-modality Discrimination for\u00a0Image-Text Matching"],"prefix":"10.1007","author":[{"given":"Jianfei","family":"Liu","sequence":"first","affiliation":[]},{"given":"Yi","family":"Li","sequence":"additional","affiliation":[]},{"given":"Fuxin","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Haiyan","family":"Fu","sequence":"additional","affiliation":[]},{"given":"Yanqing","family":"Guo","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,14]]},"reference":[{"key":"1_CR1","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15789-15798(2021)"},{"key":"1_CR2","doi-asserted-by":"crossref","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 4171-4186(2019)","DOI":"10.18653\/v1\/N19-1423"},{"key":"1_CR3","doi-asserted-by":"crossref","unstructured":"Fu, Z., Mao, Z., Song, Y., Zhang, Y.: Learning semantic relationship among instances for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15159-15168(2023)","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"1_CR4","first-page":"6704","volume":"35","author":"S Goel","year":"2022","unstructured":"Goel, S., et al.: Cyclip: cyclic contrastive language-image pretraining. Adv. Neural. Inf. Process. Syst. 35, 6704\u20136719 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR5","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 770-778(2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Jiang, Q., et al.: Understanding and constructing latent modality structures in multi-modal representation learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7661-7671(2023)","DOI":"10.1109\/CVPR52729.2023.00740"},{"key":"1_CR7","unstructured":"Kipf, T., Welling, M.: Semi-supervised classification with graph convolutional networks(2016). arXiv:1609.02907"},{"key":"1_CR8","doi-asserted-by":"crossref","unstructured":"Laplante, P., et al.: Comprehensive dictionary of electrical engineering. 
CRC Press(2018)","DOI":"10.1201\/9781420037807"},{"key":"1_CR9","doi-asserted-by":"crossref","unstructured":"Lee, K., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision, pp. 201-216(2018)","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"1_CR10","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4654-4662(2019)","DOI":"10.1109\/ICCV.2019.00475"},{"key":"1_CR11","doi-asserted-by":"crossref","unstructured":"Li, Z., Guo, C., Feng, Z., Hwang, J., Xue, X.: Multi-view visual semantic embedding. In: International Joint Conference on Artificial Intelligence(2022)","DOI":"10.24963\/ijcai.2022\/158"},{"key":"1_CR12","first-page":"17612","volume":"35","author":"V Liang","year":"2022","unstructured":"Liang, V., Zhang, Y., Kwon, Y., Yeung, S., Zou, J.: Mind the gap: Understanding the modality gap in multi-modal contrastive representation learning. Adv. Neural. Inf. Process. Syst. 35, 17612\u201317625 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR13","doi-asserted-by":"crossref","unstructured":"Lin, T., et al.: Microsoft COCO: Common Objects in Context. In: Proceedings of the European Conference on Computer Vision, pp. 740-755(2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1_CR14","doi-asserted-by":"crossref","unstructured":"Mansuy, R., Yor, M.: Aspects of brownian motion. Springer Science and Business Media(2008)","DOI":"10.1007\/978-3-540-49966-4"},{"key":"1_CR15","doi-asserted-by":"crossref","unstructured":"Peng, Y., Huang, X., Zhao, Y.: An overview of cross-media retrieval: concepts, methodologies, benchmarks, and challenges. IEEE transactions on circuits and systems for video technology,pp. 
2372-2385 (2015)","DOI":"10.1109\/TCSVT.2017.2705068"},{"key":"1_CR16","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.: Glove: global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, pp. 1532-1543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Pham, K., Huynh, C., Lim, S., Shrivastava, A.: Composing object relations and attributes for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14354-14363(2024)","DOI":"10.1109\/CVPR52733.2024.01361"},{"key":"1_CR18","first-page":"2969239","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Adv. Neural. Inf. Process. Syst. 28, 2969239\u20132969250 (2015)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR19","doi-asserted-by":"crossref","unstructured":"Schuster, M., Paliwal, K.: Bidirectional recurrent neural networks. IEEE transactions on Signal Processing,pp. 2673-2681 (1997)","DOI":"10.1109\/78.650093"},{"key":"1_CR20","unstructured":"Wang, R., Durmus, E., Goodman, N., Hashimoto, T.: Language modeling via stochastic processes(2022). arXiv:2203.11370"},{"key":"1_CR21","doi-asserted-by":"crossref","unstructured":"Wang, Y., Yang, M., Cao, R.: Fine-grained Semantic Alignment with Transferred Person-SAM for Text-based Person Retrieval. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 5432-5441(2024)","DOI":"10.1145\/3664647.3681553"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Wu, C., Manmatha, R., Smola, A., Krahenbuhl, P.: Sampling matters in deep embedding learning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 
2840-2848(2017)","DOI":"10.1109\/ICCV.2017.309"},{"key":"1_CR23","unstructured":"Ye, T., et al.: Differential transformer (2024). arXiv:2410.05258"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. MIT Press(2014)","DOI":"10.1162\/tacl_a_00166"},{"key":"1_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y.: Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15661-15670(2022)","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128-3137(2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1_CR27","first-page":"32897","volume":"35","author":"H Bao","year":"2022","unstructured":"Bao, H., et al.: Vlmo: unified vision-language pre-training with mixture-of-modality-experts. Adv. Neural. Inf. Process. Syst. 35, 32897\u201332912 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1_CR28","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748-8763(2021)"},{"key":"1_CR29","doi-asserted-by":"crossref","unstructured":"Li, X., Yu, J., Li, Z., Lu, H., Yuan, R.: Dr.clip: clip-driven universal framework for zero-shot sketch image retrieval. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 
9554-9562(2024)","DOI":"10.1145\/3664647.3680702"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5679-3_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T18:36:42Z","timestamp":1768329402000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5679-3_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819556786","9789819556793"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5679-3_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"14 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 
2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}