{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T02:28:23Z","timestamp":1705544903874},"reference-count":33,"publisher":"Springer Science and Business Media LLC","issue":"2","license":[{"start":{"date-parts":[[2023,8,3]],"date-time":"2023-08-03T00:00:00Z","timestamp":1691020800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,8,3]],"date-time":"2023-08-03T00:00:00Z","timestamp":1691020800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Chongqing Natural Science Foundation of China","award":["CSTB2022NSCQ-MSX1417"],"award-info":[{"award-number":["CSTB2022NSCQ-MSX1417"]}]},{"name":"the Science and Technology Research Program of Chongqing Municipal Education Commission","award":["KJZD-K202200513"],"award-info":[{"award-number":["KJZD-K202200513"]}]},{"name":"Chongqing Normal University Fund","award":["22XLB003"],"award-info":[{"award-number":["22XLB003"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Multimed Info Retr"],"published-print":{"date-parts":[[2023,12]]},"DOI":"10.1007\/s13735-023-00287-4","type":"journal-article","created":{"date-parts":[[2023,8,3]],"date-time":"2023-08-03T13:01:41Z","timestamp":1691067701000},"update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Modal interaction-enhanced prompt learning by transformer decoder for vision-language models"],"prefix":"10.1007","volume":"12","author":[{"given":"Mingyue","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Honggang","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Longfei","family":"Ma","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingyong","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,8,3]]},"reference":[{"key":"287_CR1","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, Sun J (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"issue":"6","key":"287_CR2","doi-asserted-by":"publisher","first-page":"84","DOI":"10.1145\/3065386","volume":"60","author":"A Krizhevsky","year":"2017","unstructured":"Krizhevsky A, Sutskever I, Hinton GE (2017) Imagenet classification with deep convolutional neural networks. Commun ACM 60(6):84\u201390","journal-title":"Commun ACM"},{"key":"287_CR3","doi-asserted-by":"crossref","unstructured":"Carion N, Massa F, Synnaeve G, Usunier N, Kirillov A, Zagoruyko S (2020) End-to-end object detection with transformers. In: European conference on computer vision, Springer, pp 213\u2013229","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"287_CR4","doi-asserted-by":"crossref","unstructured":"Gao P, Zheng M, Wang X, Dai J, Li H (2021) Fast convergence of detr with spatially modulated co-attention. In: Proceedings of the IEEE\/CVF international conference on computer vision, pp 3621\u20133630","DOI":"10.1109\/ICCV48922.2021.00360"},{"key":"287_CR5","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster r-cnn: towards real-time object detection with region proposal networks. Adv Neural Inform Process Syst 28"},{"key":"287_CR6","doi-asserted-by":"crossref","unstructured":"Long J, Shelhamer E, Darrell T (2015) Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3431\u20133440","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"287_CR7","unstructured":"Devlin J, Chang M-W, Lee K, Toutanova K (2018) Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"287_CR8","unstructured":"Radford A, Narasimhan K, Salimans T, Sutskever I et al (2018) Improving language understanding by generative pre-training"},{"key":"287_CR9","unstructured":"Radford A, Kim JW, Hallacy C, Ramesh A, Goh G, Agarwal S, Sastry G, Askell A, Mishkin P, Clark J (2021) Learning transferable visual models from natural language supervision. In: International conference on machine learning, PMLR, pp 8748\u20138763"},{"key":"287_CR10","unstructured":"Gao P, Geng S, Zhang R, Ma T, Fang R, Zhang Y, Li H, Qiao Y (2021) Clip-adapter: better vision-language models with feature adapters. arXiv preprint arXiv:2110.04544"},{"key":"287_CR11","unstructured":"Zhang R, Fang R, Gao P, Zhang W, Li K, Dai J, Qiao Y, Li H (2021) Tip-adapter: training-free clip-adapter for better vision-language modeling. arXiv preprint arXiv:2111.03930"},{"issue":"9","key":"287_CR12","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","volume":"130","author":"K Zhou","year":"2022","unstructured":"Zhou K, Yang J, Loy CC, Liu Z (2022) Learning to prompt for vision-language models. Int J Comput Vis 130(9):2337\u20132348","journal-title":"Int J Comput Vis"},{"key":"287_CR13","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser \u0141, Polosukhin I (2017) Attention is all you need. Adv Neural Inform Process Syst 30"},{"key":"287_CR14","doi-asserted-by":"crossref","unstructured":"Zhou K, Yang J, Loy CC, Liu Z (2022) Conditional prompt learning for vision-language models. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16816\u201316825","DOI":"10.1109\/CVPR52688.2022.01631"},{"key":"287_CR15","doi-asserted-by":"crossref","unstructured":"Rao Y, Zhao W, Chen G, Tang Y, Zhu Z, Huang G, Zhou J, Lu J (2022) Denseclip: language-guided dense prediction with context-aware prompting. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 18082\u201318091","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"287_CR16","doi-asserted-by":"crossref","unstructured":"Li M, Xu R, Wang S, Zhou L, Lin X, Zhu C, Zeng M, Ji H, Chang S-F (2022) Clip-event: connecting text and images with event structures. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 16420\u201316429","DOI":"10.1109\/CVPR52688.2022.01593"},{"key":"287_CR17","doi-asserted-by":"crossref","unstructured":"Ge C, Huang R, Xie M, Lai Z, Song S, Li S, Huang G (2022) Domain adaptation via prompt learning. arXiv preprint arXiv:2202.06687","DOI":"10.1109\/TNNLS.2023.3327962"},{"key":"287_CR18","doi-asserted-by":"crossref","unstructured":"Jia M, Tang L, Chen B-C, Cardie C, Belongie S, Hariharan B, Lim S-N (2022) Visual prompt tuning. arXiv preprint arXiv:2203.12119","DOI":"10.1007\/978-3-031-19827-4_41"},{"key":"287_CR19","doi-asserted-by":"crossref","unstructured":"Deng J, Dong W, Socher R, Li L-J, Li K, Fei-Fei L (2009) Imagenet: A large-scale hierarchical image database. In: 2009 IEEE conference on computer vision and pattern recognition, IEEE, pp 248\u2013255","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"287_CR20","doi-asserted-by":"crossref","unstructured":"Fei-Fei L, Fergus R, Perona P (2004) Learning generative visual models from few training examples: an incremental Bayesian approach tested on 101 object categories. In: 2004 conference on computer vision and pattern recognition workshop, IEEE, pp 178\u2013178","DOI":"10.1109\/CVPR.2004.383"},{"key":"287_CR21","doi-asserted-by":"crossref","unstructured":"Parkhi OM, Vedaldi A, Zisserman A, Jawahar C (2012) Cats and dogs. In: 2012 IEEE conference on computer vision and pattern recognition, IEEE, pp 3498\u20133505","DOI":"10.1109\/CVPR.2012.6248092"},{"key":"287_CR22","doi-asserted-by":"crossref","unstructured":"Krause J, Stark M, Deng J, Fei-Fei L (2013) 3D object representations for fine-grained categorization. In: Proceedings of the IEEE international conference on computer vision workshops, pp 554\u2013561","DOI":"10.1109\/ICCVW.2013.77"},{"key":"287_CR23","doi-asserted-by":"crossref","unstructured":"Nilsback M-E, Zisserman A (2008) Automated flower classification over a large number of classes. In: 2008 Sixth Indian conference on computer vision, graphics & image processing, IEEE, pp 722\u2013729","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"287_CR24","doi-asserted-by":"crossref","unstructured":"Bossard L, Guillaumin M, Gool LV (2014) Food-101\u2013mining discriminative components with random forests. In: European conference on computer vision, Springer, pp 446\u2013461","DOI":"10.1007\/978-3-319-10599-4_29"},{"key":"287_CR25","unstructured":"Maji S, Rahtu E, Kannala J, Blaschko M, Vedaldi A (2013) Fine-grained visual classification of aircraft. arXiv preprint arXiv:1306.5151"},{"key":"287_CR26","doi-asserted-by":"crossref","unstructured":"Xiao J, Hays J, Ehinger KA, Oliva A, Torralba A (2010) Sun database: large-scale scene recognition from abbey to zoo. In: 2010 IEEE computer society conference on computer vision and pattern recognition, IEEE, pp 3485\u20133492","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"287_CR27","unstructured":"Soomro K, Zamir AR, Shah M (2012) Ucf101: a dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"287_CR28","doi-asserted-by":"crossref","unstructured":"Cimpoi M, Maji S, Kokkinos I, Mohamed S, Vedaldi A (2014) Describing textures in the wild. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 3606\u20133613","DOI":"10.1109\/CVPR.2014.461"},{"issue":"7","key":"287_CR29","doi-asserted-by":"publisher","first-page":"2217","DOI":"10.1109\/JSTARS.2019.2918242","volume":"12","author":"P Helber","year":"2019","unstructured":"Helber P, Bischke B, Dengel A, Borth D (2019) Eurosat: a novel dataset and deep learning benchmark for land use and land cover classification. IEEE J Select Top Appl Earth Observ Remote Sens 12(7):2217\u20132226","journal-title":"IEEE J Select Top Appl Earth Observ Remote Sens"},{"key":"287_CR30","unstructured":"Recht B, Roelofs R, Schmidt L, Shankar V (2019) Do imagenet classifiers generalize to imagenet? In: International conference on machine learning, PMLR, pp 5389\u20135400"},{"key":"287_CR31","unstructured":"Wang H, Ge S, Lipton Z, Xing EP (2019) Learning robust global representations by penalizing local predictive power. Adv Neural Inform Process Syst 32"},{"key":"287_CR32","doi-asserted-by":"crossref","unstructured":"Hendrycks D, Zhao K, Basart S, Steinhardt J, Song D (2021) Natural adversarial examples. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 15262\u201315271","DOI":"10.1109\/CVPR46437.2021.01501"},{"key":"287_CR33","unstructured":"Hendrycks D, Basart S, Mu N, Kadavath S, Wang F, Dorundo E, Desai R, Zhu T, Parajuli S, Guo M et al. The many faces of robustness: a critical analysis of out-of-distribution generalization supplementary material"}],"container-title":["International Journal of Multimedia Information Retrieval"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00287-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s13735-023-00287-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s13735-023-00287-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,2]],"date-time":"2023-12-02T14:11:29Z","timestamp":1701526289000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s13735-023-00287-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,3]]},"references-count":33,"journal-issue":{"issue":"2","published-print":{"date-parts":[[2023,12]]}},"alternative-id":["287"],"URL":"https:\/\/doi.org\/10.1007\/s13735-023-00287-4","relation":{},"ISSN":["2192-6611","2192-662X"],"issn-type":[{"value":"2192-6611","type":"print"},{"value":"2192-662X","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,8,3]]},"assertion":[{"value":"2 March 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 May 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 July 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 August 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that there are no conflict of interest regarding the publication of this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}},{"value":"No ethical approval involved regarding the publication of this paper.","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical approval"}}],"article-number":"19"}}