{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T04:59:12Z","timestamp":1781585952430,"version":"3.54.5"},"reference-count":36,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2022,6,20]],"date-time":"2022-06-20T00:00:00Z","timestamp":1655683200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2022,6,20]],"date-time":"2022-06-20T00:00:00Z","timestamp":1655683200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Sci. China Inf. Sci."],"published-print":{"date-parts":[[2022,7]]},"DOI":"10.1007\/s11432-021-3367-y","type":"journal-article","created":{"date-parts":[[2022,6,25]],"date-time":"2022-06-25T03:21:31Z","timestamp":1656127291000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":14,"title":["Heterogeneous memory enhanced graph reasoning network for cross-modal retrieval"],"prefix":"10.1007","volume":"65","author":[{"given":"Zhong","family":"Ji","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kexin","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuqing","family":"He","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yanwei","family":"Pang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuelong","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2022,6,20]]},"reference":[{"key":"3367_CR1","doi-asserted-by":"publisher","first-page":"4057","DOI":"10.1109\/TIP.2021.3068825","volume":"30","author":"Y Chen","year":"2021","unstructured":"Chen Y, Huang R, Chang H, et al. Cross-modal knowledge adaptation for language-based person search. IEEE Trans Image Process, 2021, 30: 4057\u20134069","journal-title":"IEEE Trans Image Process"},{"key":"3367_CR2","doi-asserted-by":"publisher","first-page":"128","DOI":"10.1109\/TMM.2017.2723841","volume":"20","author":"L Zhang","year":"2018","unstructured":"Zhang L, Ma B, Li G, et al. Generalized semi-supervised and structured subspace learning for cross-modal retrieval. IEEE Trans Multimedia, 2018, 20: 128\u2013141","journal-title":"IEEE Trans Multimedia"},{"key":"3367_CR3","doi-asserted-by":"publisher","first-page":"1086","DOI":"10.1109\/TCYB.2020.2985716","volume":"52","author":"Z Ji","year":"2022","unstructured":"Ji Z, Wang H, Han J, et al. SMAN: stacked multimodal attention network for cross-modal image-text retrieval. IEEE Trans Cybern, 2022, 52: 1086\u20131097","journal-title":"IEEE Trans Cybern"},{"key":"3367_CR4","doi-asserted-by":"publisher","first-page":"120101","DOI":"10.1007\/s11432-020-3032-8","volume":"64","author":"Z Ji","year":"2021","unstructured":"Ji Z, Yan J T, Wang Q, et al. Triple discriminator generative adversarial network for zero-shot image classification. Sci China Inf Sci, 2021, 64: 120101","journal-title":"Sci China Inf Sci"},{"key":"3367_CR5","doi-asserted-by":"publisher","first-page":"120112","DOI":"10.1007\/s11432-019-2721-0","volume":"63","author":"Z H Wang","year":"2020","unstructured":"Wang Z H, Liu X, Lin J W, et al. Multi-attention based cross-domain beauty product image retrieval. Sci China Inf Sci, 2020, 63: 120112","journal-title":"Sci China Inf Sci"},{"key":"3367_CR6","doi-asserted-by":"crossref","unstructured":"Karpathy A, Li F-F. Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2015. 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"3367_CR7","unstructured":"Faghri F, Fleet D J, Kiros J R, et al. VSE++: improving visual-semantic embeddings with hard negatives. 2017. ArXiv:1707.05612"},{"key":"3367_CR8","doi-asserted-by":"crossref","unstructured":"Lee K H, Chen X, Hua G, et al. Stacked cross attention for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), 2018. 201\u2013216","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"3367_CR9","doi-asserted-by":"crossref","unstructured":"Hu Z, Luo Y, Lin J, et al. Multi-level visual-semantic alignments with relation-wise dual attention network for image and text matching. In: Proceedings of International Joint Conference on Artificial Intelligence, 2019. 789\u2013795","DOI":"10.24963\/ijcai.2019\/111"},{"key":"3367_CR10","unstructured":"Frome A, Corrado G, Shlens J, et al. DeViSE: a deep visual-semantic embedding model. In: Proceedings of the 26th International Conference on Neural Information Processing Systems, 2013"},{"key":"3367_CR11","doi-asserted-by":"crossref","unstructured":"Ma L, Lu Z, Shang L, et al. Multimodal convolutional neural networks for matching image and sentence. In: Proceedings of the IEEE International Conference on Computer Vision, 2015. 2623\u20132631","DOI":"10.1109\/ICCV.2015.301"},{"key":"3367_CR12","unstructured":"Kiros R, Salakhutdinov R, Zemel R S. Unifying visual-semantic embeddings with multimodal neural language models. 2014. ArXiv:1411.2539"},{"key":"3367_CR13","doi-asserted-by":"crossref","unstructured":"Wei X, Zhang T, Li Y, et al. Multi-modality cross attention network for image and sentence matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020. 10941\u201310950","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"3367_CR14","doi-asserted-by":"crossref","unstructured":"Mithun N C, Li J, Metze F, et al. Learning joint embedding with multimodal cues for cross-modal video-text retrieval. In: Proceedings of the 2018 ACM on International Conference on Multimedia Retrieval, 2018. 19\u201327","DOI":"10.1145\/3206025.3206064"},{"key":"3367_CR15","doi-asserted-by":"crossref","unstructured":"Song Y, Soleymani M. Polysemous visual-semantic embedding for cross-modal retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2019. 1979\u20131988","DOI":"10.1109\/CVPR.2019.00208"},{"key":"3367_CR16","unstructured":"Li Y, Tarlow D, Brockschmidt M, et al. Gated graph sequence neural networks. 2015. ArXiv:1511.05493"},{"key":"3367_CR17","doi-asserted-by":"crossref","unstructured":"Jiang J, Wei Y, Feng Y, et al. Dynamic hypergraph neural networks. In: Proceedings of International Joint Conference on Artificial Intelligence, 2019. 2635\u20132641","DOI":"10.24963\/ijcai.2019\/366"},{"key":"3367_CR18","unstructured":"Veli\u010dkovi\u0107 P, Cucurull G, Casanova A, et al. Graph attention networks. 2017. ArXiv:1710.10903"},{"key":"3367_CR19","doi-asserted-by":"crossref","unstructured":"Li K, Zhang Y, Li K, et al. Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019. 4654\u20134662","DOI":"10.1109\/ICCV.2019.00475"},{"key":"3367_CR20","unstructured":"Graves A, Wayne G, Danihelka I. Neural turing machines. 2014. ArXiv:1410.5401"},{"key":"3367_CR21","first-page":"2440","volume":"2","author":"S Sukhbaatar","year":"2015","unstructured":"Sukhbaatar S, Szlam A, Weston J, et al. End-to-end memory networks. In: Proceedings of the 28th International Conference on Neural Information Processing Systems, 2015. 2: 2440\u20132448","journal-title":"Proceedings of the 28th International Conference on Neural Information Processing Systems"},{"key":"3367_CR22","unstructured":"Xiong C, Merity S, Socher R. Dynamic memory networks for visual and textual question answering. In: Proceedings of International Conference on Machine Learning, 2016. 2397\u20132406"},{"key":"3367_CR23","doi-asserted-by":"crossref","unstructured":"Fan C, Zhang X, Zhang S, et al. Heterogeneous memory enhanced multimodal attention model for video question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2019. 1999\u20132007","DOI":"10.1109\/CVPR.2019.00210"},{"key":"3367_CR24","doi-asserted-by":"crossref","unstructured":"Huang Y, Wang L. ACMM: aligned cross-modal memory for few-shot image and sentence matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019. 5774\u20135783","DOI":"10.1109\/ICCV.2019.00587"},{"key":"3367_CR25","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, et al. Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018. 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"3367_CR26","unstructured":"Devlin J, Chang M W, Lee K, et al. BERT: pre-training of deep bidirectional transformers for language understanding. 2018. ArXiv:1810.04805"},{"key":"3367_CR27","doi-asserted-by":"crossref","unstructured":"Liu C, Mao Z, Zhang T, et al. Graph structured network for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020. 10921\u201310930","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"3367_CR28","doi-asserted-by":"publisher","first-page":"1261","DOI":"10.1109\/TMM.2018.2877122","volume":"21","author":"G Song","year":"2019","unstructured":"Song G, Wang D, Tan X. Deep memory network for cross-modal retrieval. IEEE Trans Multimedia, 2019, 21: 1261\u20131275","journal-title":"IEEE Trans Multimedia"},{"key":"3367_CR29","doi-asserted-by":"crossref","unstructured":"Sarafianos N, Xu X, Kakadiaris I A. Adversarial representation learning for text-to-image matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019. 5814\u20135824","DOI":"10.1109\/ICCV.2019.00591"},{"key":"3367_CR30","doi-asserted-by":"crossref","unstructured":"Huang Y, Wu Q, Song C, et al. Learning semantic concepts and order for image and sentence matching. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018. 6163\u20136171","DOI":"10.1109\/CVPR.2018.00645"},{"key":"3367_CR31","doi-asserted-by":"crossref","unstructured":"Wang Z, Liu X, Li H, et al. CAMP: cross-modal adaptive message passing for text-image retrieval. In: Proceedings of Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019. 5764\u20135773","DOI":"10.1109\/ICCV.2019.00586"},{"key":"3367_CR32","doi-asserted-by":"crossref","unstructured":"Wu Y, Wang S, Song G, et al. Learning fragment self-attention embeddings for image-text matching. In: Proceedings of the 27th ACM International Conference on Multimedia, 2019. 2088\u20132096","DOI":"10.1145\/3343031.3350940"},{"key":"3367_CR33","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3383184","volume":"16","author":"Z Zheng","year":"2020","unstructured":"Zheng Z, Zheng L, Garrett M, et al. Dual-path convolutional image-text embeddings with instance loss. ACM Trans Multimedia Comput Commun Appl, 2020, 16: 1\u201323","journal-title":"ACM Trans Multimedia Comput Commun Appl"},{"key":"3367_CR34","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Liu X, et al. IMRAM: iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020. 12655\u201312663","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"3367_CR35","unstructured":"Vendrov I, Kiros R, Fidler S, et al. Order-embeddings of images and language. 2015. ArXiv:1511.06361"},{"key":"3367_CR36","doi-asserted-by":"crossref","unstructured":"Feng F, Wang X, Li R. Cross-modal retrieval with correspondence autoencoder. In: Proceedings of the 22nd ACM International Conference on Multimedia, 2014. 7\u201316","DOI":"10.1145\/2647868.2654902"}],"container-title":["Science China Information Sciences"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-021-3367-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11432-021-3367-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11432-021-3367-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,4]],"date-time":"2023-08-04T21:02:56Z","timestamp":1691182976000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11432-021-3367-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,20]]},"references-count":36,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2022,7]]}},"alternative-id":["3367"],"URL":"https:\/\/doi.org\/10.1007\/s11432-021-3367-y","relation":{},"ISSN":["1674-733X","1869-1919"],"issn-type":[{"value":"1674-733X","type":"print"},{"value":"1869-1919","type":"electronic"}],"subject":[],"published":{"date-parts":[[2022,6,20]]},"assertion":[{"value":"11 April 2021","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 July 2021","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 October 2021","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 June 2022","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"172104"}}