{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,26]],"date-time":"2025-12-26T16:27:30Z","timestamp":1766766450992},"reference-count":38,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T00:00:00Z","timestamp":1718409600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T00:00:00Z","timestamp":1718409600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2024,8]]},"DOI":"10.1007\/s00530-024-01383-z","type":"journal-article","created":{"date-parts":[[2024,6,15]],"date-time":"2024-06-15T19:01:37Z","timestamp":1718478097000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Multi-view and region reasoning semantic enhancement for image-text retrieval"],"prefix":"10.1007","volume":"30","author":[{"given":"Wengang","family":"Cheng","sequence":"first","affiliation":[]},{"given":"Ziyi","family":"Han","sequence":"additional","affiliation":[]},{"given":"Di","family":"He","sequence":"additional","affiliation":[]},{"given":"Lifang","family":"Wu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,6,15]]},"reference":[{"key":"1383_CR1","doi-asserted-by":"publisher","first-page":"2639","DOI":"10.1162\/0899766042321814","volume":"16","author":"DR Hardoon","year":"2004","unstructured":"Hardoon, D.R., Szedmak, S., Shawe-Taylor, J.: Canonical correlation analysis: an overview with application to learning methods. Neural Comput. 16, 2639\u20132664 (2004). https:\/\/doi.org\/10.1162\/0899766042321814","journal-title":"Neural Comput."},{"key":"1383_CR2","doi-asserted-by":"publisher","unstructured":"Qian, S., Xue, D., Zhang, H., Fang, Q., Xu, C.: Dual adversarial graph neural networks for multi-label cross-modal retrieval. In: Proceedings of the AAAI Conference on Artificial Intelligence (AAAI), vol. 35, pp. 2440\u20132448 (2021). https:\/\/doi.org\/10.1609\/aaai.v35i3.16345","DOI":"10.1609\/aaai.v35i3.16345"},{"issue":"4","key":"1383_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3499027","volume":"18","author":"Y Cheng","year":"2022","unstructured":"Cheng, Y., Zhu, X., Qian, J., Wen, F., Liu, P.: Cross-modal graph matching network for image-text retrieval. ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM) 18(4), 1\u201323 (2022). https:\/\/doi.org\/10.1145\/3499027","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM)"},{"key":"1383_CR4","doi-asserted-by":"publisher","unstructured":"Liu, C., Mao, Z., Zhang, T., Xie, H., Wang, B., Zhang, Y.: Graph structured network for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10921\u201310930. IEEE, Seattle, WA, USA (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.01093","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"1383_CR5","doi-asserted-by":"publisher","unstructured":"Li, X., Yin, X., Li, C., Zhang, P., Hu, X., Zhang, L., Wang, L., Hu, H., Dong, L., Wei, F., et al.: Oscar: Object-semantics aligned pre-training for vision-language tasks. In: Proceedings of the European Conference on Computer Vision (ECCV), Glasgow, UK, pp. 121\u2013137. Springer (2020). https:\/\/doi.org\/10.48550\/arXiv.2004.06165","DOI":"10.48550\/arXiv.2004.06165"},{"key":"1383_CR6","first-page":"13","volume":"32","author":"J Lu","year":"2019","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32, 13\u201323 (2019)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1383_CR7","unstructured":"Bao, H., Dong, L., Piao, S., Wei, F.: Beit: Bert pre-training of image transformers (2021). arXiv preprint arXiv:2106.08254"},{"key":"1383_CR8","doi-asserted-by":"publisher","first-page":"201","DOI":"10.48550\/arXiv.1803.08024","volume":"11208","author":"K-H Lee","year":"2018","unstructured":"Lee, K.-H., Chen, X., Hua, G., Hu, H., He, X.: Stacked cross attention for image-text matching. Proc. Eur. Conf. Comput. Vis. (ECCV) 11208, 201\u2013216 (2018). https:\/\/doi.org\/10.48550\/arXiv.1803.08024","journal-title":"Proc. Eur. Conf. Comput. Vis. (ECCV)"},{"key":"1383_CR9","doi-asserted-by":"publisher","unstructured":"Zhang, K., Mao, Z., Wang, Q., Zhang, Y.: Negative-aware attention framework for image-text matching. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR), pp. 15661\u201315670. IEEE, New Orleans, LA, USA (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01521","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"1383_CR10","first-page":"2121","volume":"2","author":"A Frome","year":"2013","unstructured":"Frome, A., Corrado, G.S., Shlens, J., Bengio, S., Dean, J., Ranzato, M., Mikolov, T.: Devise: a deep visual-semantic embedding model. Adv. Neural Inf. Process. Syst. 2, 2121\u20132129 (2013)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1383_CR11","unstructured":"Faghri, F., Fleet, D.J., Kiros, J.R., Fidler, S.V.: Improving visual-semantic embeddings with hard negatives. arXiv preprint 7161\u20137170 (2017) arXiv:1707.05612"},{"key":"1383_CR12","doi-asserted-by":"publisher","unstructured":"Gu, J., Cai, J., Joty, S.R., Niu, L., Wang, G.: Look, imagine and match: Improving textual-visual cross-modal retrieval with generative models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7181\u20137189. IEEE, Salt Lake City, UT, USA (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00750","DOI":"10.1109\/CVPR.2018.00750"},{"issue":"4","key":"1383_CR13","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3451390","volume":"17","author":"N Messina","year":"2021","unstructured":"Messina, N., Amato, G., Esuli, A., Falchi, F., Gennaro, C., Marchand-Maillet, S.: Fine-grained visual textual alignment for cross-modal retrieval using transformer encoders. ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM) 17(4), 1\u201323 (2021). https:\/\/doi.org\/10.1145\/3451390","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM)"},{"key":"1383_CR14","doi-asserted-by":"publisher","unstructured":"Yang, J., Duan, J., Tran, S., Xu, Y., Chanda, S., Chen, L., Zeng, B., Chilimbi, T., Huang, J.: Vision-language pre-training with triple contrastive learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15671\u201315680 (2022). https:\/\/doi.org\/10.1109\/CVPR52688.2022.01522","DOI":"10.1109\/CVPR52688.2022.01522"},{"key":"1383_CR15","doi-asserted-by":"crossref","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Visual semantic reasoning for image-text matching. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision(ICCV), pp. 4654\u20134662. IEEE, Milan, Italy (2019). https:\/\/doi.org\/1051-4651","DOI":"10.1109\/ICCV.2019.00475"},{"key":"1383_CR16","doi-asserted-by":"publisher","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., Zhang, L.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 6077\u20136086. IEEE, Salt Lake City, UT, USA (2018). https:\/\/doi.org\/10.1109\/CVPR.2018.00636","DOI":"10.1109\/CVPR.2018.00636"},{"key":"1383_CR17","doi-asserted-by":"publisher","unstructured":"Chen, H., Ding, G., Liu, X., Lin, Z., Liu, J., Han, J.: Imram: Iterative matching with recurrent attention memory for cross-modal image-text retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition(CVPR), pp. 12655\u201312663. IEEE, Seattle, WA, USA (2020). https:\/\/doi.org\/10.1109\/CVPR42600.2020.01267","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"1383_CR18","doi-asserted-by":"publisher","unstructured":"Qu, L., Liu, M., Wu, J., Gao, Z., Nie, L.: Dynamic modality interaction modeling for image-text retrieval. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval(SIGIR), pp. 1104\u20131113 (2021). https:\/\/doi.org\/10.1145\/3404835.3462829","DOI":"10.1145\/3404835.3462829"},{"issue":"7","key":"1383_CR19","doi-asserted-by":"publisher","first-page":"2866","DOI":"10.1109\/TCSVT.2020.3030656","volume":"31","author":"K Wen","year":"2020","unstructured":"Wen, K., Gu, X., Cheng, Q.: Learning dual semantic relations with graph attention for image-text matching. IEEE Trans. Circuits Syst. Video Technol. (TCSVT) 31(7), 2866\u20132879 (2020). https:\/\/doi.org\/10.1109\/TCSVT.2020.3030656","journal-title":"IEEE Trans. Circuits Syst. Video Technol. (TCSVT)"},{"key":"1383_CR20","doi-asserted-by":"publisher","unstructured":"Wang, H., Zhang, Y., Ji, Z., Pang, Y., Ma, L.: Consensus-aware visual-semantic embedding for image-text matching. In: Proceedings of the European Conference on Computer Vision (ECCV), vol. 12369. Glasgow, UK, pp. 18\u201334 (2020). https:\/\/doi.org\/10.48550\/arXiv.2007.08883","DOI":"10.48550\/arXiv.2007.08883"},{"issue":"5","key":"1383_CR21","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3580501","volume":"19","author":"D Feng","year":"2023","unstructured":"Feng, D., He, X., Peng, Y.: Mkvse: multimodal knowledge enhanced visual-semantic embedding for image-text retrieval. ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM) 19(5), 1\u201321 (2023). https:\/\/doi.org\/10.1145\/3580501","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM)"},{"key":"1383_CR22","doi-asserted-by":"publisher","unstructured":"Long, S., Han, S.C., Wan, X., Poon, J.: Gradual: graph-based dual-modal representation for image-text matching. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 3459\u20133468. IEEE, Waikoloa, HI, USA (2022). https:\/\/doi.org\/10.1109\/WACV51458.2022.00252","DOI":"10.1109\/WACV51458.2022.00252"},{"key":"1383_CR23","doi-asserted-by":"publisher","first-page":"1323","DOI":"10.1609\/aaai.v36i2.20020","volume":"36","author":"J Li","year":"2022","unstructured":"Li, J., Niu, L., Zhang, L.: Action-aware embedding enhancement for image-text retrieval. Proc. AAAI Conf. Artif. Intell. (AAAI) 36, 1323\u20131331 (2022). https:\/\/doi.org\/10.1609\/aaai.v36i2.20020","journal-title":"Proc. AAAI Conf. Artif. Intell. (AAAI)"},{"key":"1383_CR24","doi-asserted-by":"publisher","first-page":"9193","DOI":"10.1109\/TIP.2021.3123553","volume":"30","author":"J Li","year":"2021","unstructured":"Li, J., Liu, L., Niu, L., Zhang, L.: Memorize, associate and match: embedding enhancement via fine-grained alignment for image-text retrieval. IEEE Trans. Image Process. (TIP) 30, 9193\u20139207 (2021). https:\/\/doi.org\/10.1109\/TIP.2021.3123553","journal-title":"IEEE Trans. Image Process. (TIP)"},{"key":"1383_CR25","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Ji, Z., Wang, D., Pang, Y., Li, X.: User: unified semantic enhancement with momentum contrast for image-text retrieval. arXiv preprint (2023). arXiv:2301.06844","DOI":"10.1109\/TIP.2023.3348297"},{"key":"1383_CR26","unstructured":"Wei, W., Gui, Z., Wu, C., Zhao, A., Wang, X., Wu, H.: Uncertainty-aware multi-view visual semantic embedding. arXiv preprint (2023). arXiv:2309.08154"},{"key":"1383_CR27","first-page":"7","volume":"2","author":"Z Li","year":"2022","unstructured":"Li, Z., Guo, C., Feng, Z., Hwang, J.-N., Xue, X.: Multi-view visual semantic embedding. Int. Jt. Conf. Artif. Intell. (IJCAI) 2, 7 (2022)","journal-title":"Int. Jt. Conf. Artif. Intell. (IJCAI)"},{"key":"1383_CR28","doi-asserted-by":"publisher","unstructured":"Chen, J., Hu, H., Wu, H., Jiang, Y., Wang, C.: Learning the best pooling strategy for visual semantic embedding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 15789\u201315798 (2021). https:\/\/doi.org\/10.48550\/arXiv.2011.04305","DOI":"10.48550\/arXiv.2011.04305"},{"key":"1383_CR29","first-page":"6000","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141., Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30, 6000\u20136010 (2017)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"1383_CR30","doi-asserted-by":"crossref","unstructured":"Ge, X., Chen, F., Xu, S., Tao, F., Jose, J.M.: Cross-modal semantic enhanced interaction for image-sentence retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision(WACV), pp. 1022\u20131031 (2023)","DOI":"10.1109\/WACV56688.2023.00108"},{"key":"1383_CR31","doi-asserted-by":"publisher","first-page":"1218","DOI":"10.1609\/aaai.v35i2.16209","volume":"35","author":"H Diao","year":"2021","unstructured":"Diao, H., Zhang, Y., Ma, L., Lu, H.: Similarity reasoning and filtration for image-text matching. Proc. AAAI Conf. Artif. Intell. (AAAI) 35, 1218\u20131226 (2021). https:\/\/doi.org\/10.1609\/aaai.v35i2.16209","journal-title":"Proc. AAAI Conf. Artif. Intell. (AAAI)"},{"key":"1383_CR32","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6\u201312, 2014, Proceedings, Part V 13, pp. 740\u2013755 (2014). Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1383_CR33","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. (TACL) 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist. (TACL)"},{"key":"1383_CR34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031","author":"S Ren","year":"2016","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) (2016). https:\/\/doi.org\/10.1109\/TPAMI.2016.2577031","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI)"},{"issue":"4","key":"1383_CR35","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3572844","volume":"19","author":"S Yang","year":"2023","unstructured":"Yang, S., Li, Q., Li, W., Li, X.-Y., Jin, R., Lv, B., Wang, R., Liu, A.: Semantic completion and filtration for image-text retrieval. ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM) 19(4), 1\u201320 (2023). https:\/\/doi.org\/10.1145\/3572844","journal-title":"ACM Trans. Multimed. Comput. Commun. Appl. (ACM TOMM)"},{"issue":"1","key":"1383_CR36","doi-asserted-by":"publisher","first-page":"641","DOI":"10.1109\/TPAMI.2022.3148470","volume":"45","author":"K Li","year":"2022","unstructured":"Li, K., Zhang, Y., Li, K., Li, Y., Fu, Y.: Image-text embedding learning via visual and textual semantic reasoning. IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI) 45(1), 641\u2013656 (2022). https:\/\/doi.org\/10.1109\/TPAMI.2022.3148470","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (TPAMI)"},{"key":"1383_CR37","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014). https:\/\/doi.org\/10.1162\/tacl_a_00166","journal-title":"Trans. Assoc. Comput. Linguist."},{"issue":"11","key":"1383_CR38","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Signal Process. (TSP) 45(11), 2673\u20132681 (1997). https:\/\/doi.org\/10.1109\/78.650093","journal-title":"IEEE Trans. Signal Process. (TSP)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01383-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-024-01383-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-024-01383-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,22]],"date-time":"2024-08-22T08:17:08Z","timestamp":1724314628000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-024-01383-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,15]]},"references-count":38,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,8]]}},"alternative-id":["1383"],"URL":"https:\/\/doi.org\/10.1007\/s00530-024-01383-z","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2024,6,15]]},"assertion":[{"value":"23 November 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 May 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"15 June 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}],"article-number":"179"}}