{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,11]],"date-time":"2026-02-11T18:33:17Z","timestamp":1770834797640,"version":"3.50.1"},"reference-count":42,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,4,6]],"date-time":"2024-04-06T00:00:00Z","timestamp":1712361600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,4,6]],"date-time":"2024-04-06T00:00:00Z","timestamp":1712361600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Department of Education of Zhejiang Province of China","award":["Y202147706"],"award-info":[{"award-number":["Y202147706"]}]},{"DOI":"10.13039\/501100003093","name":"Ministry of Higher Education, Malaysia","doi-asserted-by":"publisher","award":["PRGS\/1\/2021\/ICT02\/USM\/02\/1"],"award-info":[{"award-number":["PRGS\/1\/2021\/ICT02\/USM\/02\/1"]}],"id":[{"id":"10.13039\/501100003093","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Ministry of Europe and Foreign Affairs, and Ministry of Higher Education Malaysia","award":["MyPAIR\/1\/2020\/ICT02\/USM\/\/1"],"award-info":[{"award-number":["MyPAIR\/1\/2020\/ICT02\/USM\/\/1"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SIViP"],"published-print":{"date-parts":[[2024,7]]},"DOI":"10.1007\/s11760-024-03126-z","type":"journal-article","created":{"date-parts":[[2024,4,6]],"date-time":"2024-04-06T07:02:05Z","timestamp":1712386925000},"page":"4915-4923","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":7,"title":["Pretrained models for cross-modal retrieval: experiments and improvements"],"prefix":"10.1007","volume":"18","author":[{"given":"Kun","family":"Zhou","sequence":"first","affiliation":[]},{"given":"Fadratul Hafinaz","family":"Hassan","sequence":"additional","affiliation":[]},{"given":"Keng Hoon","family":"Gan","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,4,6]]},"reference":[{"issue":"01","key":"3126_CR1","first-page":"58","volume":"4","author":"M Alkhawlani","year":"2015","unstructured":"Alkhawlani, M., Elmogy, M., El Bakry, H.: Text-based, content-based, and semantic-based image retrievals: a survey. Int. J. Comput. Inf. Technol 4(01), 58\u201366 (2015)","journal-title":"Int. J. Comput. Inf. Technol"},{"issue":"12","key":"3126_CR2","doi-asserted-by":"publisher","first-page":"2639","DOI":"10.1162\/0899766042321814","volume":"16","author":"DR Hardoon","year":"2004","unstructured":"Hardoon, D.R., Szedmak, S., Shawe-Taylor, J.: Canonical correlation analysis: an overview with application to learning methods. Neural Comput. 16(12), 2639\u20132664 (2004). https:\/\/doi.org\/10.1162\/0899766042321814","journal-title":"Neural Comput."},{"key":"3126_CR3","unstructured":"Wang, W., Livescu, K.: Large-scale approximate kernel canonical correlation analysis. Preprint at arXiv:1511.04773 (2015)"},{"key":"3126_CR4","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02166-7","author":"K Bayoudh","year":"2021","unstructured":"Bayoudh, K., Knani, R., Hamdaoui, F., Mtibaa, A.: A survey on deep multimodal learning for computer vision: advances, trends, applications, and datasets. Vis. Comput. (2021). https:\/\/doi.org\/10.1007\/s00371-021-02166-7","journal-title":"Vis. Comput."},{"key":"3126_CR5","doi-asserted-by":"publisher","unstructured":"Jayagopal, A., Aiswarya, A.M., Garg, A., Nandakumar, S.K.: Multimodal representation learning with text and images. Accessed 29 Apr 2022. https:\/\/doi.org\/10.48550\/arXiv.2205.00142","DOI":"10.48550\/arXiv.2205.00142"},{"key":"3126_CR6","doi-asserted-by":"publisher","unstructured":"Radford, A., Metz, L., Chintala, S.: Unsupervised representation learning with deep convolutional generative adversarial networks. Accessed 07 Jan 2016. https:\/\/doi.org\/10.48550\/arXiv.1511.06434","DOI":"10.48550\/arXiv.1511.06434"},{"issue":"15","key":"3126_CR7","doi-asserted-by":"publisher","first-page":"9255","DOI":"10.1007\/s11042-016-3380-8","volume":"75","author":"C Wang","year":"2016","unstructured":"Wang, C., Yang, H., Meinel, C.: A deep semantic framework for multimodal representation learning. Multimedia Tools Appl. 75(15), 9255\u20139276 (2016). https:\/\/doi.org\/10.1007\/s11042-016-3380-8","journal-title":"Multimedia Tools Appl."},{"key":"3126_CR8","doi-asserted-by":"crossref","unstructured":"Zhang, S.-F., Zhai, J.-H., Xie, B.-J., Zhan, Y., Wang, X.: Multimodal representation learning: advances, trends and challenges. In: 2019 International Conference on Machine Learning and Cybernetics (ICMLC), IEEE, pp. 1\u20136 (2019)","DOI":"10.1109\/ICMLC48188.2019.8949228"},{"key":"3126_CR9","unstructured":"Devlin, J., Chang, M.-W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv 24 May 2019. Accessed 15 June 15 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/1810.04805"},{"key":"3126_CR10","unstructured":"Liu, Y., et al.: Roberta: a robustly optimized bert pretraining approach. Preprint at arXiv:1907.11692 (2019)"},{"key":"3126_CR11","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"3126_CR12","doi-asserted-by":"crossref","unstructured":"Wang, A., Singh, A., Michael, J., Hill, F., Levy, O., Bowman, S.R.: GLUE: a multitask benchmark and analysis platform for natural language understanding. Preprint at arXiv:1804.07461 (2018)","DOI":"10.18653\/v1\/W18-5446"},{"key":"3126_CR13","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J.J., Gao, J.: Unified vision-language pre-training for image captioning and VQA. arXiv 04 Dec 2019. Accessed 18 June 18 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/1909.11059"},{"key":"3126_CR14","doi-asserted-by":"crossref","unstructured":"Zhen, L., Hu, P., Wang, X., Peng, D.: Deep supervised cross-modal retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10394\u201310403 (2019)","DOI":"10.1109\/CVPR.2019.01064"},{"key":"3126_CR15","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, PMLR, pp. 8748\u20138763 (2021)"},{"key":"3126_CR16","unstructured":"Zeng, Z., Mao, W.: A comprehensive empirical study of vision-language pre-trained model for supervised cross-modal retrieval. arXiv, 17 Apr 2022. Accessed 01 Nov 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/2201.02772"},{"issue":"7947","key":"3126_CR17","doi-asserted-by":"publisher","first-page":"224","DOI":"10.1038\/d41586-023-00288-7","volume":"614","author":"EA van Dis","year":"2023","unstructured":"van Dis, E.A., Bollen, J., Zuidema, W., van Rooij, R., Bockting, C.L.: ChatGPT: five priorities for research. Nature 614(7947), 224\u2013226 (2023). https:\/\/doi.org\/10.1038\/d41586-023-00288-7","journal-title":"Nature"},{"key":"3126_CR18","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv 10 Apr 2015. Accessed 21 June 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/1409.1556"},{"key":"3126_CR19","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. arXiv 16 Sep 2014. Accessed 21 June 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/1409.4842"},{"key":"3126_CR20","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. arXiv 10 Dec 2015. Accessed 21 June 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/1512.03385"},{"key":"3126_CR21","doi-asserted-by":"publisher","unstructured":"Sanderson, E., Matuszewski, B.J.: FCN-transformer feature fusion for polyp segmentation, pp. 892\u2013907 (2022). https:\/\/doi.org\/10.1007\/978-3-031-12053-4_65","DOI":"10.1007\/978-3-031-12053-4_65"},{"key":"3126_CR22","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. Preprint at arXiv:2010.11929 (2020)"},{"key":"3126_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"3126_CR24","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, PMLR, pp. 8821\u20138831 (2021)"},{"key":"3126_CR25","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. Preprint at arXiv:2204.06125 (2022)"},{"key":"3126_CR26","doi-asserted-by":"publisher","unstructured":"Wu, J., Lin, Z., Zha, H.: Joint latent subspace learning and regression for cross-modal retrieval. In: Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 917\u2013920 (2017). https:\/\/doi.org\/10.1145\/3077136.3080678","DOI":"10.1145\/3077136.3080678"},{"issue":"10","key":"3126_CR27","doi-asserted-by":"publisher","first-page":"2010","DOI":"10.1109\/TPAMI.2015.2505311","volume":"38","author":"K Wang","year":"2015","unstructured":"Wang, K., He, R., Wang, L., Wang, W., Tan, T.: Joint feature selection and subspace learning for cross-modal retrieval. IEEE Trans. Pattern Anal. Mach. Intell. 38(10), 2010\u20132023 (2015). https:\/\/doi.org\/10.1109\/TPAMI.2015.2505311","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3126_CR28","unstructured":"Peng, Y., Huang, X., Qi, J.: Cross-media shared representation by hierarchical learning with multiple deep networks. In: IJCAI, p. 3853 (2016)"},{"issue":"2","key":"3126_CR29","doi-asserted-by":"publisher","first-page":"405","DOI":"10.1109\/TMM.2017.2742704","volume":"20","author":"Y Peng","year":"2017","unstructured":"Peng, Y., Qi, J., Huang, X., Yuan, Y.: CCL: cross-modal correlation learning with multigrained fusion by hierarchical network. IEEE Trans. Multimedia 20(2), 405\u2013420 (2017). https:\/\/doi.org\/10.1109\/TMM.2017.2742704","journal-title":"IEEE Trans. Multimedia"},{"issue":"11","key":"3126_CR30","doi-asserted-by":"publisher","first-page":"5585","DOI":"10.1109\/TIP.2018.2852503","volume":"27","author":"Y Peng","year":"2018","unstructured":"Peng, Y., Qi, J., Yuan, Y.: Modality-specific cross-modal similarity measurement with recurrent attention network. IEEE Trans. Image Process. 27(11), 5585\u20135599 (2018). https:\/\/doi.org\/10.1109\/TIP.2018.2852503","journal-title":"IEEE Trans. Image Process."},{"issue":"1","key":"3126_CR31","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3284750","volume":"15","author":"Y Peng","year":"2019","unstructured":"Peng, Y., Qi, J.: CM-GANs: cross-modal generative adversarial networks for common representation learning. ACM Trans. Multimedia Comput. Commun. Appl. (TOMM) 15(1), 1\u201324 (2019). https:\/\/doi.org\/10.1145\/3284750","journal-title":"ACM Trans. Multimedia Comput. Commun. Appl. (TOMM)"},{"key":"3126_CR32","doi-asserted-by":"publisher","unstructured":"Wang, B., Yang, Y., Xu, X., Hanjalic, A., Shen, H.T.: Adversarial cross-modal retrieval. In Proceedings of the 25th ACM international conference on Multimedia, pp. 154\u2013162 (2017). https:\/\/doi.org\/10.1145\/3123266.3123326","DOI":"10.1145\/3123266.3123326"},{"key":"3126_CR33","doi-asserted-by":"publisher","unstructured":"Zeng, Z., Sun, Y., Mao, W.: MCCN: multimodal coordinated clustering network for large-scale cross-modal retrieval. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 5427\u20135435 (2021). https:\/\/doi.org\/10.1145\/3474085.3475670","DOI":"10.1145\/3474085.3475670"},{"key":"3126_CR34","doi-asserted-by":"publisher","unstructured":"Zeng, Z., Wang, S., Xu, N., Mao, W.: Pan: prototype-based adaptive network for robust cross-modal retrieval. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 1125\u20131134 (2021). https:\/\/doi.org\/10.1145\/3404835.3462867","DOI":"10.1145\/3404835.3462867"},{"key":"3126_CR35","doi-asserted-by":"publisher","unstructured":"Wang, J., Gong, T., Zeng, Z., Sun, C., Yan, Y.: C 3 CMR: cross-modality cross-instance contrastive learning for cross-media retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, Lisboa Portugal: ACM, pp. 4300\u20134308 (2022). https:\/\/doi.org\/10.1145\/3503161.3548263","DOI":"10.1145\/3503161.3548263"},{"key":"3126_CR36","doi-asserted-by":"publisher","unstructured":"Rasiwasia, N., et al.: A new approach to cross-modal multimedia retrieval. In: Proceedings of the 18th ACM International Conference on Multimedia, pp. 251\u2013260 (2010). https:\/\/doi.org\/10.1145\/1873951.1873987","DOI":"10.1145\/1873951.1873987"},{"key":"3126_CR37","doi-asserted-by":"publisher","unstructured":"Chua, T.-S., Tang, J., Hong, R., Li, H., Luo, Z., Zheng, Y.: Nus-wide: a real-world web image database from national university of Singapore. In: Proceedings of the ACM international conference on image and video retrieval, pp. 1\u20139 (2009). https:\/\/doi.org\/10.1145\/1646396.1646452","DOI":"10.1145\/1646396.1646452"},{"key":"3126_CR38","unstructured":"Rashtchian, C., Young, P., Hodosh, M., Hockenmaier, J.: Collecting image annotations using amazon\u2019s mechanical turk. In: Proceedings of the NAACL HLT 2010 workshop on creating speech and language data with Amazon\u2019s Mechanical Turk, pp. 139\u2013147 (2010)"},{"key":"3126_CR39","unstructured":"Zeiler, M.D., Fergus, R.: Visualizing and understanding convolutional networks. arXiv, Nov. 28, 2013. Accessed 21 June 21 2022. [Online]. Preprint at http:\/\/arxiv.org\/abs\/1311.2901"},{"key":"3126_CR40","doi-asserted-by":"publisher","unstructured":"Feng, F., Wang, X., Li, R.: Cross-modal retrieval with correspondence autoencoder. In: Proceedings of the 22nd ACM international conference on Multimedia, pp. 7\u201316 (2014). https:\/\/doi.org\/10.1145\/2647868.2654902","DOI":"10.1145\/2647868.2654902"},{"issue":"6","key":"3126_CR41","doi-asserted-by":"publisher","first-page":"965","DOI":"10.1109\/TCSVT.2013.2276704","volume":"24","author":"X Zhai","year":"2013","unstructured":"Zhai, X., Peng, Y., Xiao, J.: Learning cross-media joint representation with sparse and semisupervised regularization. IEEE Trans. Circuits Syst. Video Technol. 24(6), 965\u2013978 (2013). https:\/\/doi.org\/10.1109\/TCSVT.2013.2276704","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"8","key":"3126_CR42","doi-asserted-by":"publisher","first-page":"1341","DOI":"10.1109\/JPROC.2018.2848209","volume":"106","author":"A Zare","year":"2018","unstructured":"Zare, A., Ozdemir, A., Iwen, M.A., Aviyente, S.: Extension of PCA to higher order data structures: an introduction to tensors, tensor decompositions, and tensor PCA. Proc. IEEE 106(8), 1341\u20131358 (2018). https:\/\/doi.org\/10.1109\/JPROC.2018.2848209","journal-title":"Proc. IEEE"}],"container-title":["Signal, Image and Video Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03126-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11760-024-03126-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11760-024-03126-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,5,23]],"date-time":"2024-05-23T13:19:47Z","timestamp":1716470387000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11760-024-03126-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,6]]},"references-count":42,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2024,7]]}},"alternative-id":["3126"],"URL":"https:\/\/doi.org\/10.1007\/s11760-024-03126-z","relation":{},"ISSN":["1863-1703","1863-1711"],"issn-type":[{"value":"1863-1703","type":"print"},{"value":"1863-1711","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,4,6]]},"assertion":[{"value":"11 June 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 February 2024","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 March 2024","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 April 2024","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"Not applicable.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"Not applicable.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}]}}