{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T11:11:49Z","timestamp":1778843509335,"version":"3.51.4"},"reference-count":50,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,11,1]],"date-time":"2026-11-01T00:00:00Z","timestamp":1793491200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Information Fusion"],"published-print":{"date-parts":[[2026,11]]},"DOI":"10.1016\/j.inffus.2026.104447","type":"journal-article","created":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T21:31:25Z","timestamp":1777930285000},"page":"104447","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Division-to-fusion: Foreground-background semantics decoupling and coupling for image-text retrieval"],"prefix":"10.1016","volume":"135","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-2790-4707","authenticated-orcid":false,"given":"Xiaolin","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6846-5782","authenticated-orcid":false,"given":"Zheng","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junhao","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuxiang","family":"Meng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhini","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"78","reference":[{"issue":"9","key":"10.1016\/j.inffus.2026.104447_bib0001","doi-asserted-by":"crossref","first-page":"9502","DOI":"10.1109\/TCSVT.2025.3558996","article-title":"Matryoshka learning with metric transfer for image-Text matching","volume":"35","author":"Wang","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"5","key":"10.1016\/j.inffus.2026.104447_bib0002","first-page":"427","article-title":"Interpretation of image in poetry from a perspective of figure-ground theory","volume":"12","author":"LI","year":"2022","journal-title":"J. Lit. Art Stud."},{"key":"10.1016\/j.inffus.2026.104447_bib0003","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"5754","article-title":"Saliency-guided attention network for image-sentence matching","author":"Ji","year":"2019"},{"issue":"1","key":"10.1016\/j.inffus.2026.104447_bib0004","doi-asserted-by":"crossref","first-page":"388","DOI":"10.1109\/TCSVT.2021.3060713","article-title":"Region reinforcement network with topic constraint for image-text matching","volume":"32","author":"Wu","year":"2021","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104447_bib0005","series-title":"2023IEEE International Conference on Multimedia and Expo (ICME)","first-page":"1967","article-title":"Image-text retrieval via preserving main semantics of vision","author":"Zhang","year":"2023"},{"key":"10.1016\/j.inffus.2026.104447_bib0006","doi-asserted-by":"crossref","first-page":"9189","DOI":"10.1109\/TMM.2023.3248160","article-title":"HGAN: Hierarchical graph alignment network for image-text retrieval","volume":"25","author":"Guo","year":"2023","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.inffus.2026.104447_bib0007","series-title":"Proceedings of the 27th ACM International Conference on Multimedia","first-page":"12","article-title":"Matching images and text with multi-modal tensor fusion and re-ranking","author":"Wang","year":"2019"},{"key":"10.1016\/j.inffus.2026.104447_bib0008","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15784","article-title":"Learning the best pooling strategy for visual semantic embedding","author":"Chen","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0009","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2025.103142","article-title":"Multi-label guided graph similarity learning for cross-modal retrieval","volume":"121","author":"Zhu","year":"2025","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104447_bib0010","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12655","article-title":"IMRAM: Iterative matching with recurrent attention memory for cross-modal image-text retrieval","author":"Chen","year":"2020"},{"key":"10.1016\/j.inffus.2026.104447_bib0011","series-title":"Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","first-page":"1104","article-title":"Dynamic modality interaction modeling for image-text retrieval","author":"Qu","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0012","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"19275","article-title":"Fine-grained image-text matching by cross-modal hard aligning network","author":"Pan","year":"2023"},{"issue":"7","key":"10.1016\/j.inffus.2026.104447_bib0013","doi-asserted-by":"crossref","first-page":"6590","DOI":"10.1109\/TCSVT.2024.3369656","article-title":"Improving image-text matching with bidirectional consistency of cross-modal alignment","volume":"34","author":"Li","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104447_bib0014","doi-asserted-by":"crossref","DOI":"10.1016\/j.inffus.2023.102084","article-title":"Global-local fusion based on adversarial sample generation for image-text matching","volume":"103","author":"Huang","year":"2024","journal-title":"Inf. Fusion"},{"key":"10.1016\/j.inffus.2026.104447_bib0015","article-title":"Redundancy mitigation: towards accurate and efficient image-text retrieval","author":"Wang","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104447_bib0016","first-page":"1","article-title":"Fractional fourier-enhanced fusion network based on pareto optimization for hyperspectral and LiDAR data classification","volume":"63","author":"Feng","year":"2025","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"issue":"10","key":"10.1016\/j.inffus.2026.104447_bib0017","doi-asserted-by":"crossref","first-page":"17856","DOI":"10.1109\/TNNLS.2025.3586714","article-title":"An adaptive weighted metric learning network based on fractional domain decoupling for hyperspectral change detection","volume":"36","author":"Feng","year":"2025","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"10.1016\/j.inffus.2026.104447_bib0018","first-page":"1","article-title":"Language-enhanced dual-level contrastive learning network for open-set hyperspectral image classification","volume":"63","author":"Qin","year":"2025","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.inffus.2026.104447_bib0019","doi-asserted-by":"crossref","first-page":"2526","DOI":"10.1109\/TMM.2021.3086618","article-title":"MFFENet: multiscale feature fusion and enhancement network for RGB\u2013thermal urban road scene parsing","volume":"24","author":"Zhou","year":"2021","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.inffus.2026.104447_bib0020","doi-asserted-by":"crossref","first-page":"3483","DOI":"10.1109\/TMM.2022.3161852","article-title":"PGDENet: progressive guided fusion and depth enhancement network for RGB-D indoor scene parsing","volume":"25","author":"Zhou","year":"2022","journal-title":"IEEE Trans. Multimed."},{"issue":"12","key":"10.1016\/j.inffus.2026.104447_bib0021","doi-asserted-by":"crossref","first-page":"7096","DOI":"10.1109\/TCSVT.2023.3275314","article-title":"MMSMCNet: modal memory sharing and morphological complementary networks for RGB-T urban scene semantic segmentation","volume":"33","author":"Zhou","year":"2023","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104447_bib0022","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"3571","article-title":"Edge-aware guidance fusion network for rgb\u2013thermal scene parsing","volume":"36","author":"Zhou","year":"2022"},{"key":"10.1016\/j.inffus.2026.104447_bib0023","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"7105","article-title":"Identification of necessary semantic undertakers in the causal view for image-text matching","volume":"38","author":"Zhang","year":"2024"},{"issue":"4","key":"10.1016\/j.inffus.2026.104447_bib0024","doi-asserted-by":"crossref","first-page":"3222","DOI":"10.1109\/TCSVT.2024.3508058","article-title":"MDNet: mamba-effective diffusion-distillation network for RGB-thermal urban dense prediction","volume":"35","author":"Zhou","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104447_bib0025","first-page":"1","article-title":"Graph attention guidance network with knowledge distillation for semantic segmentation of remote sensing images","volume":"61","author":"Zhou","year":"2023","journal-title":"IEEE Trans. Geosci. Remote Sens."},{"key":"10.1016\/j.inffus.2026.104447_bib0026","series-title":"Proceedings of the 28th ACM International Conference on Multimedia","first-page":"1047","article-title":"Context-aware multi-view summarization network for image-text matching","author":"Qu","year":"2020"},{"key":"10.1016\/j.inffus.2026.104447_bib0027","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3533","article-title":"Context-aware attention network for image-text retrieval","author":"Zhang","year":"2020"},{"key":"10.1016\/j.inffus.2026.104447_bib0028","series-title":"International Conference on Machine Learning","first-page":"5583","article-title":"Vilt: vision-and-language transformer without convolution or region supervision","author":"Kim","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0029","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12976","article-title":"Seeing out of the box: end-to-end pre-training for vision-language representation learning","author":"Huang","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0030","series-title":"European Conference on Computer Vision","first-page":"104","article-title":"Uniter: universal image-text representation learning","author":"Chen","year":"2020"},{"key":"10.1016\/j.inffus.2026.104447_bib0031","series-title":"Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)","first-page":"2592","article-title":"Unimo: towards unified-modal understanding and generation via cross-modal contrastive learning","author":"Li","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0032","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0033","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"issue":"6","key":"10.1016\/j.inffus.2026.104447_bib0034","doi-asserted-by":"crossref","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","article-title":"Faster R-CNN: towards real-time object detection with region proposal networks","volume":"39","author":"Ren","year":"2016","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"10.1016\/j.inffus.2026.104447_bib0035","series-title":"Proceedings of the COLING\/ACL 2006 Interactive Presentation Sessions","first-page":"69","article-title":"NLTK: The natural language toolkit","author":"Bird","year":"2006"},{"key":"10.1016\/j.inffus.2026.104447_bib0036","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15661","article-title":"Negative-aware attention framework for image-text matching","author":"Zhang","year":"2022"},{"key":"10.1016\/j.inffus.2026.104447_bib0037","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","volume":"27","author":"Karpathy","year":"2014","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.inffus.2026.104447_bib0038","series-title":"Proceedings of the European Conference on Computer Vision (ECCV)","first-page":"201","article-title":"Stacked cross attention for image-text matching","author":"Lee","year":"2018"},{"key":"10.1016\/j.inffus.2026.104447_bib0039","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"4654","article-title":"Visual semantic reasoning for image-text matching","author":"Li","year":"2019"},{"key":"10.1016\/j.inffus.2026.104447_bib0040","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"5763","article-title":"Camp: cross-modal adaptive message passing for text-image retrieval","author":"Wang","year":"2019"},{"key":"10.1016\/j.inffus.2026.104447_bib0041","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10921","article-title":"Graph structured network for image-text matching","author":"Liu","year":"2020"},{"key":"10.1016\/j.inffus.2026.104447_bib0042","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"1218","article-title":"Similarity reasoning and filtration for image-text matching","volume":"35","author":"Diao","year":"2021"},{"key":"10.1016\/j.inffus.2026.104447_bib0043","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"15159","article-title":"Learning semantic relationship among instances for image-text matching","author":"Fu","year":"2023"},{"key":"10.1016\/j.inffus.2026.104447_bib0044","doi-asserted-by":"crossref","first-page":"7555","DOI":"10.1109\/TMM.2024.3369968","article-title":"A mutually textual and visual refinement network for image-text matching","volume":"26","author":"Pang","year":"2024","journal-title":"IEEE Trans. Multimed."},{"issue":"10","key":"10.1016\/j.inffus.2026.104447_bib0045","doi-asserted-by":"crossref","first-page":"9678","DOI":"10.1109\/TCSVT.2024.3392619","article-title":"Reference-aware adaptive network for image-text matching","volume":"34","author":"Xiong","year":"2024","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"issue":"11","key":"10.1016\/j.inffus.2026.104447_bib0046","doi-asserted-by":"crossref","first-page":"11627","DOI":"10.1109\/TCSVT.2025.3571731","article-title":"Multi-scale feature fusion based on piecewise polynomial activation function for image-text matching","volume":"35","author":"Ji","year":"2025","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"10.1016\/j.inffus.2026.104447_bib0047","series-title":"Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)","first-page":"31769","article-title":"Maximal matching matters: preventing representation collapse for robust cross-Modal retrieval","author":"Alomari","year":"2025"},{"key":"10.1016\/j.inffus.2026.104447_bib0048","doi-asserted-by":"crossref","first-page":"9723","DOI":"10.1109\/TMM.2025.3618550","article-title":"Similarity shuffled criss-cross transformer with angle loss for image-text matching","volume":"27","author":"Chen","year":"2025","journal-title":"IEEE Trans. Multimed."},{"key":"10.1016\/j.inffus.2026.104447_bib0049","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"24993","article-title":"DH-Set: improving vision-language alignment with diverse and hybrid set-embeddings learning","author":"Zhang","year":"2025"},{"key":"10.1016\/j.inffus.2026.104447_bib0050","series-title":"Proceedings of the 38th International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume":"139","author":"Radford","year":"2021"}],"container-title":["Information Fusion"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253526003271?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1566253526003271?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T10:52:06Z","timestamp":1778842326000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1566253526003271"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,11]]},"references-count":50,"alternative-id":["S1566253526003271"],"URL":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104447","relation":{},"ISSN":["1566-2535"],"issn-type":[{"value":"1566-2535","type":"print"}],"subject":[],"published":{"date-parts":[[2026,11]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Division-to-fusion: Foreground-background semantics decoupling and coupling for image-text retrieval","name":"articletitle","label":"Article Title"},{"value":"Information Fusion","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.inffus.2026.104447","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"104447"}}