{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T12:41:48Z","timestamp":1776775308565,"version":"3.51.2"},"reference-count":61,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFC3304104"],"award-info":[{"award-number":["2023YFC3304104"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neurocomputing"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.neucom.2026.133504","type":"journal-article","created":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T08:02:08Z","timestamp":1775030528000},"page":"133504","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["UIID: Unified intra-modal and inter-modal distillation for image-text retrieval"],"prefix":"10.1016","volume":"683","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5396-0003","authenticated-orcid":false,"given":"Song","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8560-9231","authenticated-orcid":false,"given":"Nan","family":"Zheng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2464-8149","authenticated-orcid":false,"given":"Liang","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuemei","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifeng","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"key":"10.1016\/j.neucom.2026.133504_bib0005","series-title":"2024 11th International Conference on Behavioural and Social Computing (BESC)","first-page":"1","article-title":"Text-to-image retrieval based on zero-shot transfer learning with CLIP model and vector database","author":"Xie","year":"2024"},{"key":"10.1016\/j.neucom.2026.133504_bib0010","series-title":"2024 International Joint Conference on Neural Networks (IJCNN)","first-page":"1","article-title":"Semantic information reasoning and multi-step cross-modal interaction network for image-text retrieval","author":"Ma","year":"2024"},{"key":"10.1016\/j.neucom.2026.133504_bib0015","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"7190","article-title":"Bidirectional attentive fusion with context gating for dense video captioning","author":"Wang","year":"2018"},{"key":"10.1016\/j.neucom.2026.133504_bib0020","first-page":"2970","article-title":"Efficient image and sentence matching","volume":"45","author":"Huang","year":"2023","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133504_bib0025","doi-asserted-by":"crossref","first-page":"3088","DOI":"10.1109\/TPAMI.2019.2920899","article-title":"Reconstruct and represent video contents for captioning via reinforcement learning","volume":"42","author":"Zhang","year":"2019","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.neucom.2026.133504_bib0030","doi-asserted-by":"crossref","first-page":"959","DOI":"10.1109\/TCSS.2023.3244068","article-title":"Multimodal fake news analysis based on image\u2013text similarity","volume":"11","author":"Zhang","year":"2023","journal-title":"IEEE Trans. Comput. Soc. Syst."},{"key":"10.1016\/j.neucom.2026.133504_bib0035","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"5005","article-title":"Learning deep structure-preserving image-text embeddings","author":"Wang","year":"2016"},{"key":"10.1016\/j.neucom.2026.133504_bib0040","series-title":"British Machine Vision Conference","article-title":"VSE++: improving visual-semantic embeddings with hard negatives","author":"Faghri","year":"2018"},{"key":"10.1016\/j.neucom.2026.133504_bib0045","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3383184","article-title":"Dual-path convolutional image-text embeddings with instance loss","volume":"16","author":"Zheng","year":"2020","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl."},{"key":"10.1016\/j.neucom.2026.133504_bib0050","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"201","article-title":"Stacked cross attention for image-text matching","author":"Lee","year":"2018"},{"key":"10.1016\/j.neucom.2026.133504_bib0055","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12655","article-title":"IMRAM: iterative matching with recurrent attention memory for cross-modal image-text retrieval","author":"Chen","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0060","author":"Hinton"},{"key":"10.1016\/j.neucom.2026.133504_bib0065","article-title":"Devise: a deep visual-semantic embedding model","volume":"26","author":"Frome","year":"2013","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133504_bib0070","author":"Kiros"},{"key":"10.1016\/j.neucom.2026.133504_bib0075","series-title":"Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part VI 14","first-page":"833","article-title":"RNN fisher vectors for action recognition and image annotation","author":"Lev","year":"2016"},{"key":"10.1016\/j.neucom.2026.133504_bib0080","series-title":"2007 IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1","article-title":"Fisher kernels on visual vocabularies for image categorization","author":"Perronnin","year":"2007"},{"key":"10.1016\/j.neucom.2026.133504_bib0085","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"4654","article-title":"Visual semantic reasoning for image-text matching","author":"Li","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0090","doi-asserted-by":"crossref","first-page":"3781","DOI":"10.1109\/TMM.2025.3535373","article-title":"Dynamic visual semantic sub-embeddings and fast re-ranking for image-text retrieval","volume":"27","author":"Wei","year":"2025","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.neucom.2026.133504_bib0095","doi-asserted-by":"crossref","DOI":"10.1016\/j.knosys.2024.112912","article-title":"Hierarchical and complementary experts transformer with momentum invariance for image-text retrieval","volume":"309","author":"Zhang","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.neucom.2026.133504_bib0100","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.neucom.2026.133504_bib0105","series-title":"Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021","first-page":"743","article-title":"Hashing based efficient inference for image-text matching","author":"Tu","year":"2021"},{"key":"10.1016\/j.neucom.2026.133504_bib0110","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2025.129411","article-title":"Federated training of gnns with similarity graph reasoning for text\u2013image retrieval","volume":"623","author":"Yan","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133504_bib0115","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10921","article-title":"Graph structured network for image-text matching","author":"Liu","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0120","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"12313","article-title":"Adaptive cross-modal embeddings for image-text alignment","volume":"vol. 34","author":"Wehrmann","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0125","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.03.057","article-title":"Efficient text-image semantic search: a multi-modal vision-language approach for fashion retrieval","volume":"538","author":"Moro","year":"2023","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neucom.2026.133504_bib0130","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"10.1016\/j.neucom.2026.133504_bib0135","series-title":"European Conference on Computer Vision","first-page":"104","article-title":"Uniter: universal image-text representation learning","author":"Chen","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0140","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"11336","article-title":"Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training","volume":"vol. 34","author":"Li","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0145","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX 16","first-page":"121","article-title":"Oscar: object-semantics aligned pre-training for vision-language tasks","author":"Li","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0150","article-title":"Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133504_bib0155","series-title":"Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"2556","article-title":"Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning","author":"Sharma","year":"2018"},{"key":"10.1016\/j.neucom.2026.133504_bib0160","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"5774","article-title":"ACMM: aligned cross-modal memory for few-shot image and sentence matching","author":"Huang","year":"2019"},{"issue":"7","key":"10.1016\/j.neucom.2026.133504_bib0165","doi-asserted-by":"crossref","first-page":"6542","DOI":"10.1109\/TCSVT.2024.3358411","article-title":"Fast, accurate, and lightweight memory-enhanced embedding learning framework for image-text retrieval","volume":"34","author":"Li","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.neucom.2026.133504_bib0170","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19275","article-title":"Fine-grained image-text matching by cross-modal hard aligning network","author":"Pan","year":"2023"},{"key":"10.1016\/j.neucom.2026.133504_bib0175","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15651","article-title":"Multi-modal alignment using representation codebook","author":"Duan","year":"2022"},{"key":"10.1016\/j.neucom.2026.133504_bib0180","series-title":"Findings of the Association for Computational Linguistics: EMNLP 2020","first-page":"4163","article-title":"Tinybert: distilling BERT for natural language understanding","author":"Jiao","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0185","series-title":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","first-page":"2158","article-title":"Mobilebert: a compact task-agnostic BERT for resource-limited devices","author":"Sun","year":"2020"},{"key":"10.1016\/j.neucom.2026.133504_bib0190","article-title":"Parameter-efficient and student-friendly knowledge distillation","author":"Rao","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.neucom.2026.133504_bib0195","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"4125","article-title":"Unsupervised image captioning","author":"Feng","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0200","article-title":"Learning efficient object detection models with knowledge distillation","volume":"30","author":"Chen","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133504_bib0205","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"1187","article-title":"Distilled person re-identification: towards a more scalable system","author":"Wu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0210","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7096","article-title":"Knowledge distillation via instance relationship graph","author":"Liu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0215","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2907","article-title":"Learning metrics from teachers: compact networks for image embedding","author":"Yu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0220","first-page":"9694","article-title":"Align before fuse: vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.neucom.2026.133504_bib0225","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"10377","article-title":"DistilVPR: cross-modal knowledge distillation for visual place recognition","volume":"vol. 38","author":"Wang","year":"2024"},{"key":"10.1016\/j.neucom.2026.133504_bib0230","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"18298","article-title":"Cross-modal and uni-modal soft-label alignment for image-text retrieval","volume":"vol. 38","author":"Huang","year":"2024"},{"key":"10.1016\/j.neucom.2026.133504_bib0235","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"770","article-title":"Deep residual learning for image recognition","author":"He","year":"2016"},{"key":"10.1016\/j.neucom.2026.133504_bib0240","series-title":"Proceedings of naacL-HLT","first-page":"2","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume":"vol. 1","author":"Kenton","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0245","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"8489","article-title":"Few-shot image and sentence matching via gated visual-semantic embedding","volume":"vol. 33","author":"Huang","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0250","series-title":"Proceedings of the 27th ACM International Conference on Multimedia","first-page":"2088","article-title":"Learning fragment self-attention embeddings for image-text matching","author":"Wu","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0255","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6077","article-title":"Bottom-up and top-down attention for image captioning and visual question answering","author":"Anderson","year":"2018"},{"key":"10.1016\/j.neucom.2026.133504_bib0260","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1492","article-title":"Aggregated residual transformations for deep neural networks","author":"Xie","year":"2017"},{"key":"10.1016\/j.neucom.2026.133504_bib0265","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3967","article-title":"Relational knowledge distillation","author":"Park","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0270","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"4539","article-title":"Local dense logit relations for enhanced knowledge distillation","author":"Xu","year":"2025"},{"key":"10.1016\/j.neucom.2026.133504_bib0275","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2641","article-title":"Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models","author":"Plummer","year":"2015"},{"key":"10.1016\/j.neucom.2026.133504_bib0280","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014"},{"key":"10.1016\/j.neucom.2026.133504_bib0285","author":"Kingma"},{"key":"10.1016\/j.neucom.2026.133504_bib0290","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1365","article-title":"Similarity-preserving knowledge distillation","author":"Tung","year":"2019"},{"key":"10.1016\/j.neucom.2026.133504_bib0295","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1218","article-title":"Similarity reasoning and filtration for image-text matching","volume":"vol. 35","author":"Diao","year":"2021"},{"key":"10.1016\/j.neucom.2026.133504_bib0300","series-title":"Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision","first-page":"3459","article-title":"Gradual: graph-based dual-modal representation for image-text matching","author":"Long","year":"2022"},{"key":"10.1016\/j.neucom.2026.133504_bib0305","doi-asserted-by":"crossref","first-page":"2322","DOI":"10.1109\/TIP.2023.3266887","article-title":"Plug-and-play regulators for image-text matching","volume":"32","author":"Diao","year":"2023","journal-title":"IEEE Trans. Image Process."}],"container-title":["Neurocomputing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S092523122600901X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S092523122600901X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T11:47:45Z","timestamp":1776772065000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S092523122600901X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":61,"alternative-id":["S092523122600901X"],"URL":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133504","relation":{},"ISSN":["0925-2312"],"issn-type":[{"value":"0925-2312","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"UIID: Unified intra-modal and inter-modal distillation for image-text retrieval","name":"articletitle","label":"Article Title"},{"value":"Neurocomputing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neucom.2026.133504","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"133504"}}