{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,27]],"date-time":"2026-03-27T05:42:38Z","timestamp":1774590158629,"version":"3.50.1"},"reference-count":58,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T00:00:00Z","timestamp":1687737600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T00:00:00Z","timestamp":1687737600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2024,1]]},"DOI":"10.1007\/s11042-023-15798-9","type":"journal-article","created":{"date-parts":[[2023,6,26]],"date-time":"2023-06-26T06:02:21Z","timestamp":1687759341000},"page":"12363-12377","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["SAM: cross-modal semantic alignments module for image-text retrieval"],"prefix":"10.1007","volume":"83","author":[{"given":"Pilseo","family":"Park","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Soojin","family":"Jang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yunsung","family":"Cho","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Youngbin","family":"Kim","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,6,26]]},"reference":[{"key":"15798_CR1","unstructured":"Ba J, Kiros JR, Hinton GE (2016) Layer normalization. ArXiv arXiv:1607.06450"},{"key":"15798_CR2","doi-asserted-by":"crossref","unstructured":"Bi B, Li C, Wu C, et\u00a0al (2020) Palm: pre-training an autoencodingautoregressive language model for context-conditioned generation. In: Conference on empirical methods in natural language processing","DOI":"10.18653\/v1\/2020.emnlp-main.700"},{"key":"15798_CR3","first-page":"9959","volume":"2020","author":"S Chen","year":"2020","unstructured":"Chen S, Jin Q, Wang P et al (2020) Say as you wish: fine-grained control of image caption generation with abstract scene graphs. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 2020:9959\u20139968","journal-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"15798_CR4","doi-asserted-by":"publisher","first-page":"3096","DOI":"10.1109\/ICCVW54120.2021.00345","volume":"2021","author":"T Chen","year":"2021","unstructured":"Chen T, Tian R, Ding Z (2021) Visual reasoning using graph convolutional networks for predicting pedestrian crossing intention. IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW) 2021:3096\u20133102","journal-title":"IEEE\/CVF International Conference on Computer Vision Workshops (ICCVW)"},{"key":"15798_CR5","unstructured":"Chen YC, Li L, Yu L, et\u00a0al (2019) Uniter: learning universal image-text representations. ArXiv arXiv:1909.11740"},{"key":"15798_CR6","unstructured":"Dai Z, Lai G, Yang Y, et\u00a0al (2020) Funnel-transformer: filtering out sequential redundancy for efficient language processing. ArXiv arXiv:2006.03236"},{"key":"15798_CR7","unstructured":"Devlin J, Chang MW, Lee K, et\u00a0al (2019) Bert: pre-training of deep bidirectional transformers for language understanding. ArXiv arXiv:1810.04805"},{"key":"15798_CR8","doi-asserted-by":"crossref","unstructured":"Diao H, Zhang Y, Ma L, et\u00a0al (2021) Similarity reasoning and filtration for image-text matching. ArXiv arXiv:2101.01368","DOI":"10.1609\/aaai.v35i2.16209"},{"key":"15798_CR9","doi-asserted-by":"crossref","unstructured":"Dong X, Long C, Xu W, et\u00a0al (2021) Dual graph convolutional networks with transformer and curriculum learning for image captioning. Proceedings of the 29th ACM International Conference on Multimedia","DOI":"10.1145\/3474085.3475439"},{"key":"15798_CR10","unstructured":"Faghri F, Fleet DJ, Kiros JR, et\u00a0al (2017) Vse++: improving visual-semantic embeddings with hard negatives. In: British machine vision conference"},{"key":"15798_CR11","doi-asserted-by":"crossref","unstructured":"Frisoni G, Mizutani M, Moro G, et\u00a0al (2022) Bioreader: a retrieval-enhanced text-to-text transformer for biomedical literature. In: Proceedings of the 2022 conference on empirical methods in natural language processing, pp 5770\u20135793","DOI":"10.18653\/v1\/2022.emnlp-main.390"},{"issue":"1","key":"15798_CR12","doi-asserted-by":"publisher","first-page":"336","DOI":"10.1109\/TCSS.2021.3102591","volume":"9","author":"H Gao","year":"2021","unstructured":"Gao H, Xu K, Cao M et al (2021) The deep features and attention mechanism-based method to dish healthcare under social iot systems: an empirical study with a hand-deep local-global net. IEEE Transactions on Computational Social Systems 9(1):336\u2013347","journal-title":"IEEE Transactions on Computational Social Systems"},{"key":"15798_CR13","doi-asserted-by":"crossref","unstructured":"Gao H, Fang D, Xiao J, et\u00a0al (2022a) Camrl: a joint method of channel attention and multidimensional regression loss for 3d object detection in automated vehicles. IEEE Trans Intell Transp Syst","DOI":"10.1109\/TITS.2022.3219474"},{"key":"15798_CR14","doi-asserted-by":"crossref","unstructured":"Gao H, Xiao J, Yin Y, et\u00a0al (2022b) A mutually supervised graph attention network for few-shot segmentation: the perspective of fully utilizing limited samples. IEEE Transactions on Neural Networks and Learning Systems","DOI":"10.1109\/TNNLS.2022.3155486"},{"key":"15798_CR15","unstructured":"Guo D, Xu C, Tao D (2021) Bilinear graph networks for visual question answering. IEEE Transactions on Neural Networks and Learning Systems PP"},{"key":"15798_CR16","doi-asserted-by":"crossref","unstructured":"Guo J, Lu S, Cai H, et\u00a0al (2018) Long text generation via adversarial training with leaked information. In: Proceedings of the AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v32i1.11957"},{"key":"15798_CR17","unstructured":"Hamilton WL, Ying Z, Leskovec J (2017) Inductive representation learning on large graphs. In: NIPS"},{"key":"15798_CR18","doi-asserted-by":"crossref","unstructured":"Henderson M, Casanueva I, Mrkvsi\u2019c N, et\u00a0al (2019) Convert: efficient and accurate conversational representations from transformers. ArXiv arXiv:1911.03688","DOI":"10.18653\/v1\/2020.findings-emnlp.196"},{"key":"15798_CR19","doi-asserted-by":"crossref","unstructured":"Ji Z, Chen K, Wang H (2021) Step-wise hierarchical alignment network for image-text matching. ArXiv arXiv:2106.06509","DOI":"10.24963\/ijcai.2021\/106"},{"key":"15798_CR20","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3128\u20133137","DOI":"10.1109\/CVPR.2015.7298932"},{"issue":"21\u201323","key":"15798_CR21","doi-asserted-by":"publisher","first-page":"32763","DOI":"10.1007\/s11042-021-11217-z","volume":"80","author":"N Kayhan","year":"2021","unstructured":"Kayhan N, Fekri-Ershad S (2021) Content based image retrieval based on weighted fusion of texture and color features derived from modified local binary patterns and local neighborhood difference patterns. Multimed Tools Appl 80(21\u201323):32763\u201332790","journal-title":"Multimed Tools Appl"},{"key":"15798_CR22","doi-asserted-by":"crossref","unstructured":"Kim J, Yoon S, Kim D, et\u00a0al (2021a) Structured co-reference graph attention for video-grounded dialogue. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 1789\u20131797","DOI":"10.1609\/aaai.v35i2.16273"},{"key":"15798_CR23","unstructured":"Kim W, Son B, Kim I (2021b) Vilt: Vision-and-language transformer without convolution or region supervision. In: International Conference on Machine Learning"},{"key":"15798_CR24","unstructured":"Kipf T, Welling M (2017) Semi-supervised classification with graph convolutional networks. ArXiv arXiv:1609.02907"},{"issue":"1","key":"15798_CR25","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, Zhu Y, Groth O et al (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis 123(1):32\u201373","journal-title":"Int J Comput Vis"},{"key":"15798_CR26","doi-asserted-by":"crossref","unstructured":"Lee KH, Chen X, Hua G, et\u00a0al (2018) Stacked cross attention for image-text matching. ArXiv arXiv:1803.08024","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"15798_CR27","unstructured":"Li LH, Yatskar M, Yin D, et\u00a0al (2019) Visualbert: a simple and performant baseline for vision and language. ArXiv arXiv:1908.03557"},{"key":"15798_CR28","doi-asserted-by":"crossref","unstructured":"Li X, Yin X, Li C, et\u00a0al (2020) Oscar: object-semantics aligned pre-training for vision-language tasks. In: European conference on computer vision","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"15798_CR29","first-page":"1","volume-title":"ACM SIGIR Forum","author":"J Lin","year":"2022","unstructured":"Lin J (2022) A proposed conceptual framework for a representational approach to information retrieval. ACM SIGIR Forum. ACM, New York, pp 1\u201329"},{"key":"15798_CR30","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie SJ, et\u00a0al (2014) Microsoft coco: common objects in context. In: European conference on computer vision","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"15798_CR31","first-page":"16331","volume":"34","author":"H Ling","year":"2021","unstructured":"Ling H, Kreis K, Li D et al (2021) Editgan: high-precision semantic image editing. Adv Neural Inf Proces Syst 34:16331\u201316345","journal-title":"Adv Neural Inf Proces Syst"},{"key":"15798_CR32","unstructured":"Loshchilov I, Hutter F (2017) Decoupled weight decay regularization. In: International conference on learning representations"},{"key":"15798_CR33","unstructured":"Lu J, Batra D, Parikh D, et\u00a0al (2019) Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Neural information processing systems"},{"key":"15798_CR34","first-page":"10434","volume":"2020","author":"J Lu","year":"2020","unstructured":"Lu J, Goswami V, Rohrbach M et al (2020) 12-in-1: multi-task vision and language representation learning. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 2020:10434\u201310443","journal-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"15798_CR35","doi-asserted-by":"crossref","unstructured":"Lu X, Zhao T, Lee K (2021) Visualsparta: an embarrassingly simple approach to large-scale text-to-image search with weighted bag-of-words. In: ACL","DOI":"10.18653\/v1\/2021.acl-long.389"},{"key":"15798_CR36","unstructured":"Maas AL, Hannun AY, Ng AY, et\u00a0al (2013) Rectifier nonlinearities improve neural network acoustic models. In: Proc. Icml, Atlanta, p\u00a03"},{"key":"15798_CR37","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3451390","volume":"17","author":"N Messina","year":"2021","unstructured":"Messina N, Amato G, Esuli A et al (2021) Fine-grained visual textual alignment for cross-modal retrieval using transformer encoders. ACM Trans Multimed Comput Commun Appl (TOMM) 17:1\u201323","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"key":"15798_CR38","doi-asserted-by":"crossref","unstructured":"Plummer BA, Wang L, Cervantes CM, et\u00a0al (2015) Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision, pp 2641\u20132649","DOI":"10.1109\/ICCV.2015.303"},{"key":"15798_CR39","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2015","unstructured":"Ren S, He K, Girshick RB et al (2015) Faster r-cnn: towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39:1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"15798_CR40","first-page":"1","volume":"2020","author":"R Shah","year":"2020","unstructured":"Shah R, Bhatti N, Akhtar N et al (2020) Random patterns clothing image retrieval using convolutional neural network. International Conference on Emerging Trends in Smart Technologies (ICETST) 2020:1\u20136","journal-title":"International Conference on Emerging Trends in Smart Technologies (ICETST)"},{"key":"15798_CR41","unstructured":"Song K, Tan X, Qin T, et\u00a0al (2020) Mpnet: masked and permuted pre-training for language understanding. ArXiv arXiv:2004.09297"},{"key":"15798_CR42","doi-asserted-by":"crossref","unstructured":"Song X, Jing L, Lin D, et\u00a0al (2022) V2p: vision-to-prompt based multi-modal product summary generation. In: Proceedings of the 45th International ACM SIGIR conference on research and development in information retrieval, pp 992\u20131001","DOI":"10.1145\/3477495.3532076"},{"key":"15798_CR43","unstructured":"Su W, Zhu X, Cao Y, et\u00a0al (2020) Vl-bert: pre-training of generic visual-linguistic representations. ArXiv arXiv:1908.08530"},{"key":"15798_CR44","unstructured":"Takase S, Kiyono S (2021) Lessons on parameter sharing across layers in transformers. arXiv preprint arXiv:2104.06022"},{"key":"15798_CR45","doi-asserted-by":"crossref","unstructured":"Tan HH, Bansal M (2019) Lxmert: learning cross-modality encoder representations from transformers. ArXiv arXiv:1908.07490","DOI":"10.18653\/v1\/D19-1514"},{"key":"15798_CR46","first-page":"6484","volume":"2021","author":"A Toker","year":"2021","unstructured":"Toker A, Zhou Q, Maximov M et al (2021) Coming down to earth: satellite-to-street view synthesis for geo-localization. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 2021:6484\u20136493","journal-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"},{"key":"15798_CR47","unstructured":"Vaswani A, Shazeer N, Parmar N, et\u00a0al (2017) Attention is all you need. Adv Neural Inf Proces Syst 30"},{"key":"15798_CR48","unstructured":"Velickovic P, Cucurull G, Casanova A, et\u00a0al (2017) Graph attention networks. ArXiv arXiv:1710.10903"},{"key":"15798_CR49","doi-asserted-by":"crossref","unstructured":"Wang W, Zheng H, Lin Z (2020) Self-attention and retrieval enhanced neural networks for essay generation. ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP) 8199\u20138203","DOI":"10.1109\/ICASSP40776.2020.9052954"},{"key":"15798_CR50","doi-asserted-by":"crossref","unstructured":"Wang Y, Yang H, Qian X, et\u00a0al (2019a) Position focused attention network for image-text matching. ArXiv arXiv:1907.09748","DOI":"10.24963\/ijcai.2019\/526"},{"key":"15798_CR51","doi-asserted-by":"publisher","first-page":"5763","DOI":"10.1109\/ICCV.2019.00586","volume":"2019","author":"Z Wang","year":"2019","unstructured":"Wang Z, Liu X, Li H et al (2019) Camp: cross-modal adaptive message passing for text-image retrieval. IEEE\/CVF International Conference on Computer Vision (ICCV) 2019:5763\u20135772","journal-title":"IEEE\/CVF International Conference on Computer Vision (ICCV)"},{"key":"15798_CR52","doi-asserted-by":"crossref","unstructured":"Yang D, Wu D, Zhang W, et\u00a0al (2020) Deep semantic-alignment hashing for unsupervised cross-modal retrieval. Proceedings of the 2020 International conference on multimedia retrieval","DOI":"10.1145\/3372278.3390673"},{"key":"15798_CR53","unstructured":"Yang Z, Dai Z, Yang Y, et\u00a0al (2019) Xlnet: generalized autoregressive pretraining for language understanding. Adv Neural Inf Proces Syst 32"},{"key":"15798_CR54","doi-asserted-by":"crossref","unstructured":"Ye Y, Ji S (2021) Sparse graph attention networks. IEEE Trans Knowl Data Eng","DOI":"10.1109\/TKDE.2021.3072345"},{"key":"15798_CR55","doi-asserted-by":"crossref","unstructured":"Yu T, Liu J, Jin Z, et\u00a0al (2022) Multi-scale multi-modal dictionary bert for effective text-image retrieval in multimedia advertising. In: Proceedings of the 31st ACM International Conference on Information & Knowledge Management, pp 4655\u20134660","DOI":"10.1145\/3511808.3557653"},{"key":"15798_CR56","doi-asserted-by":"crossref","unstructured":"Zhang S, Dinan E, Urbanek J, et\u00a0al (2018) Personalizing dialogue agents: I have a dog, do you have pets too? In: Annual meeting of the association for computational linguistics","DOI":"10.18653\/v1\/P18-1205"},{"key":"15798_CR57","doi-asserted-by":"publisher","first-page":"617","DOI":"10.1109\/TIP.2020.3038354","volume":"30","author":"Y Zhang","year":"2020","unstructured":"Zhang Y, Zhou W, Wang M et al (2020) Deep relation embedding for cross-modal retrieval. IEEE Trans Image Process 30:617\u2013627","journal-title":"IEEE Trans Image Process"},{"issue":"2","key":"15798_CR58","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3383184","volume":"16","author":"Z Zheng","year":"2020","unstructured":"Zheng Z, Zheng L, Garrett M et al (2020) Dual-path convolutional image-text embeddings with instance loss. ACM Trans Multimed Comput Commun Appl (TOMM) 16(2):1\u201323","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-15798-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-023-15798-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-15798-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,10]],"date-time":"2024-01-10T09:38:08Z","timestamp":1704879488000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-023-15798-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,26]]},"references-count":58,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2024,1]]}},"alternative-id":["15798"],"URL":"https:\/\/doi.org\/10.1007\/s11042-023-15798-9","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,6,26]]},"assertion":[{"value":"10 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"1 May 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 May 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 June 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}