{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T00:55:01Z","timestamp":1773708901282,"version":"3.50.1"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,7,4]],"date-time":"2023-07-04T00:00:00Z","timestamp":1688428800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,7,4]],"date-time":"2023-07-04T00:00:00Z","timestamp":1688428800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,7,4]]},"DOI":"10.1109\/icufn57995.2023.10200668","type":"proceedings-article","created":{"date-parts":[[2023,8,7]],"date-time":"2023-08-07T17:52:13Z","timestamp":1691430733000},"page":"1-7","source":"Crossref","is-referenced-by-count":1,"title":["Multi-modal Fine-grained Retrieval with Local and Global Cross-Attention"],"prefix":"10.1109","author":[{"given":"Qiaosong","family":"Chen","sequence":"first","affiliation":[{"name":"Chongqing University of Posts and Telecommunications,Key Laboratory of Data Engineering and Visual Computing,Chongqing,P.R.China,400065"}]},{"given":"Ye","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications,Key Laboratory of Data Engineering and Visual Computing,Chongqing,P.R.China,400065"}]},{"given":"Junzhuo","family":"Liu","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications,Key Laboratory of Data Engineering and Visual Computing,Chongqing,P.R.China,400065"}]},{"given":"Zhixiang","family":"Wang","sequence":"additional","affiliation":[{"name":"GROW-School Maastricht University Medical Centre&#x002B;,Department of Radiation Oncology (Maastro),Maastricht,The Netherlands"}]},{"given":"Xin","family":"Deng","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications,Key Laboratory of Data Engineering and Visual Computing,Chongqing,P.R.China,400065"}]},{"given":"Jin","family":"Wang","sequence":"additional","affiliation":[{"name":"Chongqing University of Posts and Telecommunications,Key Laboratory of Data Engineering and Visual Computing,Chongqing,P.R.China,400065"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1561\/0600000105"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-021-00825-2"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2018.2879846"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2013.2276704"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350974"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-60639-8_16"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1016\/j.dsp.2018.02.009"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2892802"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.282"},{"key":"ref16","first-page":"3846","article-title":"Cross-media shared representation by hierarchical learning with multiple deep networks","author":"peng","year":"2016","journal-title":"IJCAI"},{"key":"ref19","article-title":"Attention is all you need","volume":"30","author":"vaswani","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123326"},{"key":"ref24","first-page":"213","article-title":"End-to-end object detection with transformers","author":"carion","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref26","first-page":"15908","article-title":"Transformer in transformer","volume":"34","author":"han","year":"2021","journal-title":"Advances in neural information processing systems"},{"key":"ref25","first-page":"1691","article-title":"Generative pretraining from pixels","author":"chen","year":"2020","journal-title":"International Conference on Machine Learning"},{"key":"ref20","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"arXiv preprint arXiv 1810 04805"},{"key":"ref22","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2020","journal-title":"arXiv preprint arXiv 2010 11419"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00338"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00060"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref8","article-title":"Fine-grained image analysis with deep learning: A survey","author":"wei","year":"2021","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"ref7","first-page":"489","article-title":"Survey of research on deep learning image-text cross-modal retrieval","volume":"16","author":"ying","year":"2022","journal-title":"Journal of Frontiers of Computer Science & Technology"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3469877.3490590"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2847248"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/151"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2018.05.028"},{"key":"ref5","first-page":"436","article-title":"Extracting privileged information for enhancing classifier learning","volume":"28","year":"2018","journal-title":"IEEE Transactions on Image Processing"}],"event":{"name":"2023 Fourteenth International Conference on Ubiquitous and Future Networks (ICUFN)","location":"Paris, France","start":{"date-parts":[[2023,7,4]]},"end":{"date-parts":[[2023,7,7]]}},"container-title":["2023 Fourteenth International Conference on Ubiquitous and Future Networks (ICUFN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10199107\/10199171\/10200668.pdf?arnumber=10200668","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,28]],"date-time":"2023-08-28T17:43:02Z","timestamp":1693244582000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10200668\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,4]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/icufn57995.2023.10200668","relation":{},"subject":[],"published":{"date-parts":[[2023,7,4]]}}}