{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T11:02:02Z","timestamp":1776942122302,"version":"3.51.4"},"reference-count":64,"publisher":"Tech Science Press","issue":"1","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":88,"URL":"https:\/\/doi.org\/10.32604\/TSP-CROSSMARKPOLICY"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["CMC"],"published-print":{"date-parts":[[2025]]},"DOI":"10.32604\/cmc.2025.059745","type":"journal-article","created":{"date-parts":[[2025,1,31]],"date-time":"2025-01-31T01:24:51Z","timestamp":1738286691000},"page":"219-238","update-policy":"https:\/\/doi.org\/10.32604\/tsp-crossmarkpolicy","source":"Crossref","is-referenced-by-count":2,"title":["UniTrans: Unified Parameter-Efficient Transfer Learning and Multimodal Alignment for Large Multimodal Foundation Model"],"prefix":"10.32604","volume":"83","author":[{"given":"Jiakang","family":"Sun","sequence":"first","affiliation":[]},{"given":"Ke","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xinyang","family":"He","sequence":"additional","affiliation":[]},{"given":"Xu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Ke","family":"Li","sequence":"additional","affiliation":[]},{"given":"Cheng","family":"Peng","sequence":"additional","affiliation":[]}],"member":"17807","published-online":{"date-parts":[[2025]]},"reference":[{"key":"ref1","unstructured":"Raji\u010d F, Ke L, Tai YW, Tang CK, Danelljan M, Yu F. Segment anything meets point tracking. arXiv:230701197. 2023. doi:10.48550\/arXiv.2307.01197."},{"key":"ref2","unstructured":"Touvron H, Lavril T, Izacard G, Martinet X, Lachaux MA, Lacroix T, et al. LLaMA: open and efficient foundation language models. arXiv:230213971. 2023. doi:10.48550\/arXiv.2302.13971."},{"key":"ref3","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"4015","article-title":"Segment anything","author":"Kirillov","year":"2023"},{"key":"ref4","series-title":"International Conference on Machine Learning","first-page":"12888","article-title":"BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation","author":"Li","year":"2022"},{"key":"ref5","unstructured":"Brown TB. Language models are few-shot learners. arXiv:2005.14165. 2020. doi:10.48550\/arXiv.2005.14165."},{"key":"ref6","doi-asserted-by":"crossref","first-page":"110851","DOI":"10.1016\/j.knosys.2023.110851","article-title":"Deep transfer learning for automatic speech recognition: towards better generalization","volume":"277","author":"Kheddar","year":"2023","journal-title":"Knowl Based Syst"},{"key":"ref7","doi-asserted-by":"crossref","first-page":"40","DOI":"10.3390\/technologies11020040","article-title":"A review of deep transfer learning and recent advancements","volume":"11","author":"Iman","year":"2023","journal-title":"Technologies"},{"key":"ref8","unstructured":"Hu EJ, Shen Y, Wallis P, Allen-Zhu Z, Li Y, Wang S, et al. LoRA: low-rank adaptation of large language models. arXiv:2106.09685. 2021. doi:10.48550\/arXiv.2106.09685."},{"key":"ref9","series-title":"International Conference on Machine Learning","first-page":"2790","article-title":"Parameter-efficient transfer learning for NLP","author":"Houlsby","year":"2019"},{"key":"ref10","unstructured":"Li XL, Liang P. 
Prefix-tuning: optimizing continuous prompts for generation. arXiv:2101.00190. 2021. doi:10.48550\/arXiv.2101.00190."},{"key":"ref11","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"4113","article-title":"CAT-Seg: cost aggregation for open-vocabulary semantic segmentation","author":"Cho","year":"2024"},{"key":"ref12","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"18082","article-title":"DenseCLIP: language-guided dense prediction with context-aware prompting","author":"Rao","year":"2022"},{"key":"ref13","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5227","article-title":"Vl-adapter: parameter-efficient transfer learning for vision-and-language tasks","author":"Sung","year":"2022"},{"key":"ref14","unstructured":"Lu H, Huo Y, Yang G, Lu Z, Zhan W, Tomizuka M, et al. UniAdapter: unified parameter-efficient transfer learning for cross-modal modeling. arXiv:2302.06605. 2023. doi:10.48550\/arXiv.2302.06605."},{"key":"ref15","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"ref16","first-page":"25971","article-title":"Segment anything in 3D with NeRFs","volume":"36","author":"Cen","year":"2023","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref17","doi-asserted-by":"crossref","unstructured":"Kheddar H. Transformers and large language models for efficient intrusion detection systems: a comprehensive survey. arXiv:2408.07583. 2024. doi:10.48550\/arXiv.2408.07583.","DOI":"10.1016\/j.inffus.2025.103347"},{"key":"ref18","unstructured":"Devlin J. BERT: pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805. 2018. 
doi:10.48550\/arXiv.1810.04805."},{"key":"ref19","series-title":"International Conference on Machine Learning","first-page":"19730","article-title":"BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref20","first-page":"34892","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2024","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref21","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12655","article-title":"IMRAM: iterative matching with recurrent attention memory for cross-modal image-text retrieval","author":"Chen","year":"2020"},{"key":"ref22","series-title":"Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","first-page":"1104","article-title":"Dynamic modality interaction modeling for image-text retrieval","author":"Qu","year":"2021"},{"key":"ref23","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"21","article-title":"Stacked attention networks for image question answering","author":"Yang","year":"2016"},{"key":"ref24","first-page":"11336","article-title":"Unicoder-VL: a universal encoder for vision and language by cross-modal pre-training","volume":"34","author":"Li","year":"2020","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref25","first-page":"5180","article-title":"Context-I2W: mapping images to context-dependent words for accurate zero-shot composed image retrieval","volume":"38","author":"Tang","year":"2024","journal-title":"Proc AAAI Conf Artif Intell"},{"key":"ref26","series-title":"European Conference on Computer Vision","first-page":"38","article-title":"LocVTP: video-text pre-training for temporal localization","author":"Cao","year":"2022"},{"key":"ref27","doi-asserted-by":"crossref","first-page":"1361","DOI":"10.1109\/TMM.2023.3280734","article-title":"Towards fast and accurate image-text retrieval with self-supervised fine-grained alignment","volume":"26","author":"Zhuang","year":"2023","journal-title":"IEEE Trans Multimed"},{"key":"ref28","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3024","article-title":"Dense contrastive learning for self-supervised visual pre-training","author":"Wang","year":"2021"},{"key":"ref29","unstructured":"Tang Y, Yu J, Gai K, Wang Y, Hu Y, Xiong G, et al. Align before Search: aligning ads image to text for accurate cross-modal sponsored search. arXiv:2309.16141. 2023. doi:10.48550\/arXiv.2309.16141."},{"key":"ref30","doi-asserted-by":"crossref","unstructured":"Lester B, Al-Rfou R, Constant N. The power of scale for parameter-efficient prompt tuning. arXiv:2104.08691. 2021. doi:10.48550\/arXiv.2104.08691.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"208","DOI":"10.1016\/j.aiopen.2023.08.012","article-title":"GPT understands, too","volume":"5","author":"Liu","year":"2024","journal-title":"AI Open"},{"key":"ref32","doi-asserted-by":"crossref","unstructured":"Pfeiffer J, Kamath A, R\u00fcckl\u00e9 A, Cho K, Gurevych I. Adapterfusion: non-destructive task composition for transfer learning. arXiv:2005.00247. 2020. 
doi:10.48550\/arXiv.2005.00247.","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"ref33","doi-asserted-by":"crossref","unstructured":"R\u00fcckl\u00e9 A, Geigle G, Glockner M, Beck T, Pfeiffer J, Reimers N, et al. AdapterDrop: on the efficiency of adapters in transformers. arXiv:2010.11918. 2020. doi:10.48550\/arXiv.2010.11918.","DOI":"10.18653\/v1\/2021.emnlp-main.626"},{"key":"ref34","unstructured":"Zaken EB, Ravfogel S, Goldberg Y. BitFit: simple parameter-efficient fine-tuning for transformer-based masked language-models. arXiv:2106.10199. 2021. doi:10.48550\/arXiv.2106.10199."},{"key":"ref35","doi-asserted-by":"crossref","unstructured":"Guo D, Rush AM, Kim Y. Parameter-efficient transfer learning with diff pruning. arXiv:2012.07463. 2020. doi:10.48550\/arXiv.2012.07463.","DOI":"10.18653\/v1\/2021.acl-long.378"},{"key":"ref36","unstructured":"Zhang Q, Chen M, Bukharin A, Karampatziakis N, He P, Cheng Y, et al. AdaLoRA: adaptive budget allocation for parameter-efficient fine-tuning. arXiv:2303.10512. 2023. doi:10.48550\/arXiv.2303.10512."},{"key":"ref37","doi-asserted-by":"crossref","unstructured":"Renduchintala A, Konuk T, Kuchaiev O. Tied-LoRA: enhancing parameter efficiency of lora with weight tying. arXiv:2311.09578. 2023. doi:10.48550\/arXiv.2311.09578.","DOI":"10.18653\/v1\/2024.naacl-long.481"},{"key":"ref38","unstructured":"Liu SY, Wang CY, Yin H, Molchanov P, Wang YCF, Cheng KT, et al. DoRA: weight-decomposed low-rank adaptation. arXiv:2402.09353. 2024. doi:10.48550\/arXiv.2402.09353."},{"key":"ref39","unstructured":"Hyeon-Woo N, Ye-Bin M, Oh TH. Fedpara: low-rank hadamard product for communication-efficient federated learning. arXiv:2108.06098. 2021. doi:10.48550\/arXiv.2108.06098."},{"key":"ref40","unstructured":"Dosovitskiy A. An image is worth 16x16 words: transformers for image recognition at scale. arXiv:2010.11929. 2020. doi:10.48550\/arXiv.2010.11929."},{"key":"ref41","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference","first-page":"13","article-title":"Microsoft COCO: common objects in context","author":"Lin","year":"2014 Sep 6\u201312"},{"key":"ref42","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"2641","article-title":"Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models","author":"Plummer","year":"2015"},{"key":"ref43","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"5803","article-title":"Localizing moments in video with natural language","author":"Anne Hendricks","year":"2017"},{"key":"ref44","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"6904","article-title":"Making the V in VQA matter: elevating the role of image understanding in visual question answering","author":"Goyal","year":"2017"},{"key":"ref45","series-title":"Proceedings of the 25th ACM International Conference on Multimedia","first-page":"1645","article-title":"Video question answering via gradually refined attention over appearance and motion","author":"Xu","year":"2017"},{"key":"ref46","series-title":"European Conference on Computer Vision","first-page":"104","article-title":"Uniter: universal image-text representation learning","author":"Chen","year":"2020"},{"key":"ref47","doi-asserted-by":"crossref","unstructured":"Li W, Gao C, Niu G, Xiao X, Liu H, Liu J, et al. UNIMO: towards unified-modal understanding and generation via cross-modal contrastive learning. 
arXiv:2012.15409. 2020. doi:10.18653\/v1\/2021.acl-long.202.","DOI":"10.18653\/v1\/2021.acl-long.202"},{"key":"ref48","series-title":"International Conference on Machine Learning","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"Jia","year":"2021"},{"key":"ref49","first-page":"9694","article-title":"Align before fuse: vision and language representation learning with momentum distillation","volume":"34","author":"Li","year":"2021","journal-title":"Adv Neural Inf Process Syst"},{"key":"ref50","series-title":"Computer Vision\u2013ECCV 2020: 16th European Conference","first-page":"121","article-title":"Oscar: object-semantics aligned pre-training for vision-language tasks","author":"Li","year":"2020 Aug 23\u201328"},{"key":"ref51","series-title":"International Conference on Machine Learning","first-page":"1931","article-title":"Unifying vision-and-language tasks via text generation","author":"Cho","year":"2021"},{"key":"ref52","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"12976","article-title":"Seeing out of the box: end-to-end pre-training for vision-language representation learning","author":"Huang","year":"2021"},{"key":"ref53","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"3558","article-title":"Conceptual 12m: pushing web-scale image-text pre-training to recognize long-tail visual concepts","author":"Changpinyo","year":"2021"},{"key":"ref54","doi-asserted-by":"crossref","unstructured":"Zhang P, Li X, Hu X, Yang J, Zhang L, Wang L, et al. VinVL: making visual representations matter in vision-language models. arXiv:2101.00529. 2021. doi:10.48550\/arXiv.2101.00529.","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref55","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"17980","article-title":"Scaling up vision-language pre-training for image captioning","author":"Hu","year":"2022"},{"key":"ref56","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7331","article-title":"Less is more: CLIPBERT for video-and-language learning via sparse sampling","author":"Lei","year":"2021"},{"key":"ref57","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1728","article-title":"Frozen in time: a joint video and image encoder for end-to-end retrieval","author":"Bain","year":"2021"},{"key":"ref58","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"16877","article-title":"Look before you speak: visually contextualized utterances","author":"Seo","year":"2021"},{"key":"ref59","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"4953","article-title":"Align and prompt: video-and-language pre-training with entity prompts","author":"Li","year":"2022"},{"key":"ref60","unstructured":"Fu TJ, Li L, Gan Z, Lin K, Wang WY, Wang L, et al. VIOLET: end-to-end video-language transformers with masked visual-token modeling. arXiv:2111.12681. 2021. 
doi:10.48550\/arXiv.2111.12681."},{"key":"ref61","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"1686","article-title":"Just ask: learning to answer questions from millions of narrated videos","author":"Yang","year":"2021"},{"key":"ref62","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6598","article-title":"All in one: exploring unified video-language pre-training","author":"Wang","year":"2023"},{"key":"ref63","doi-asserted-by":"crossref","first-page":"293","DOI":"10.1016\/j.neucom.2022.07.028","article-title":"Clip4Clip: an empirical study of clip for end to end video clip retrieval and captioning","volume":"508","author":"Luo","year":"2022","journal-title":"Neurocomputing"},{"key":"ref64","first-page":"23634","article-title":"MERLOT: multimodal neural script knowledge models","volume":"34","author":"Zellers","year":"2021","journal-title":"Adv Neural Inf Process Syst"}],"container-title":["Computers, Materials & Continua"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/cdn.techscience.cn\/files\/cmc\/2025\/TSP_CMC-83-1\/TSP_CMC_59745\/TSP_CMC_59745.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,14]],"date-time":"2025-11-14T06:34:48Z","timestamp":1763102088000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.techscience.com\/cmc\/v83n1\/60076"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":64,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025]]},"published-print":{"date-parts":[[2025]]}},"URL":"https:\/\/doi.org\/10.32604\/cmc.2025.059745","relation":{},"ISSN":["1546-2226"],"issn-type":[{"value":"1546-2226","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"2024-10-16","order":0,"name":"received","label":"Received","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-01-08","order":1,"name":"accepted","label":"Accepted","group":{"name":"publication_history","label":"Publication History"}},{"value":"2025-03-26","order":2,"name":"published","label":"Published Online","group":{"name":"publication_history","label":"Publication History"}}]}}