{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:28:30Z","timestamp":1763191710430,"version":"3.45.0"},"reference-count":52,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,30]],"date-time":"2025-06-30T00:00:00Z","timestamp":1751241600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1109\/ijcnn64981.2025.11227998","type":"proceedings-article","created":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T18:46:15Z","timestamp":1763145975000},"page":"1-8","source":"Crossref","is-referenced-by-count":0,"title":["Enhancing Image-Text Retrieval with Phrase-aware and Modality Difference-aware Embeddings"],"prefix":"10.1109","author":[{"given":"Meng","family":"Meng","sequence":"first","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology, CAS,Shenzhen Key Laboratory of Computer Vision and Pattern Recognition"}]},{"given":"Shifeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sangfor Technologies Inc."}]},{"given":"Xiaofan","family":"Chen","sequence":"additional","affiliation":[{"name":"Sangfor Technologies Inc."}]},{"given":"Shifeng","family":"Chen","sequence":"additional","affiliation":[{"name":"Shenzhen Institutes of Advanced Technology, CAS,Shenzhen Key Laboratory of Computer Vision and Pattern Recognition"}]},{"given":"Xu","family":"Zhou","sequence":"additional","affiliation":[{"name":"Sangfor Technologies Inc."}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00565"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01073"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02110"},{"key":"ref4","first-page":"5825","article-title":"Multi-modality latent interaction network for visual question answering","volume-title":"ICCV","author":"Gao"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3392619"},{"key":"ref6","article-title":"Devise: A deep visual-semantic embedding model","volume-title":"NIPS","volume":"26","author":"Frome"},{"article-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)","year":"2014","author":"Mao","key":"ref7"},{"key":"ref8","article-title":"Faster r-cnn: Towards real-time object detection with region proposal networks","volume-title":"NIPS","volume":"28","author":"Ren"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","year":"2018","author":"Devlin","key":"ref11"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29789"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01267"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462829"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73668-1_10"},{"key":"ref17","article-title":"Sinkhorn distances: Lightspeed computation of optimal transport","volume-title":"NIPS","volume":"26","author":"Cuturi"},{"article-title":"Unifying visual-semantic embeddings with multimodal neural language models","year":"2014","author":"Kiros","key":"ref18"},{"article-title":"Vse++: Improving visual-semantic embeddings with hard negatives","year":"2017","author":"Faghri","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01361"},{"article-title":"Semi-supervised classification with graph convolutional networks","year":"2016","author":"Kipf","key":"ref22"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01302"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7006"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2021.3139210"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_33"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01553"},{"key":"ref28","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","volume-title":"NIPS","volume":"27","author":"Karpathy"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00585"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6915"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00108"},{"article-title":"Loopitr: Combining dual and cross encoder architectures for image-text retrieval","year":"2022","author":"Lei","key":"ref32"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021\/106"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01095"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01455"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i16.29738"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-71246-8_45"},{"key":"ref39","first-page":"6996","article-title":"Adaptive distribution calibration for few-shot learning with hierarchical optimal transport","volume-title":"NIPS","volume":"35","author":"Guo"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2022\/158"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01521"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28538"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350940"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413961"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148470"},{"key":"ref48","article-title":"Unsupervised learning of visual features by contrasting cluster assignments","volume-title":"NIPS","volume":"33","author":"Caron"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.4324\/9781410618894-47"},{"article-title":"Automatic differentiation in pytorch","year":"2017","author":"Paszke","key":"ref52"}],"event":{"name":"2025 International Joint Conference on Neural Networks (IJCNN)","start":{"date-parts":[[2025,6,30]]},"location":"Rome, Italy","end":{"date-parts":[[2025,7,5]]}},"container-title":["2025 International Joint Conference on Neural Networks (IJCNN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11227166\/11227148\/11227998.pdf?arnumber=11227998","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:26:04Z","timestamp":1763191564000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11227998\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":52,"URL":"https:\/\/doi.org\/10.1109\/ijcnn64981.2025.11227998","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]}}}