{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T15:13:49Z","timestamp":1769181229019,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819555666","type":"print"},{"value":"9789819555673","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5567-3_11","type":"book-chapter","created":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:14:23Z","timestamp":1769116463000},"page":"151-164","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["STDC: Sparse Transformer Deep Collaboration Prompt Tuning for\u00a0Industrial Multimodal Large Models"],"prefix":"10.1007","author":[{"given":"Yijun","family":"Bei","sequence":"first","affiliation":[]},{"given":"Ke","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Bin","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"11_CR1","doi-asserted-by":"crossref","unstructured":"Ainslie, J., et al.: Encoding long and structured inputs in transformers. arXiv preprint arXiv:2004.08483 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.19"},{"key":"11_CR2","first-page":"11079","volume":"35","author":"A Bulatov","year":"2022","unstructured":"Bulatov, A., Kuratov, Y., Burtsev, M.: Recurrent memory transformer. Adv. Neural. Inf. Process. Syst. 35, 11079\u201311091 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"3","key":"11_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3641289","volume":"15","author":"Y Chang","year":"2024","unstructured":"Chang, Y., et al.: A survey on evaluation of large language models. ACM Trans. Intell. Syst. Technol. 15(3), 1\u201345 (2024)","journal-title":"ACM Trans. Intell. Syst. Technol."},{"key":"11_CR4","doi-asserted-by":"crossref","unstructured":"Chavan, V., Koch, P., Schl\u00fcter, M., Briese, C.: Towards realistic evaluation of industrial continual learning scenarios with an emphasis on energy consumption and computational footprint. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11506\u201311518 (2023)","DOI":"10.1109\/ICCV51070.2023.01057"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chen, Y.C., Li, W.H., Sun, C., Wang, Y.C.F., Chen, C.S.: SAM4MLLM: enhance multi-modal large language model for referring expression segmentation (2024). https:\/\/arxiv.org\/abs\/2409.10542","DOI":"10.1007\/978-3-031-73004-7_19"},{"key":"11_CR6","unstructured":"Child, R., Gray, S., Radford, A., Sutskever, I.: Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509 (2019)"},{"key":"11_CR7","unstructured":"Desai, A., et al.: HashAttention: semantic sparsity for faster inference. 
arXiv preprint arXiv:2412.14468 (2024)"},{"key":"11_CR8","doi-asserted-by":"crossref","unstructured":"Doris, A.C., et al.: DesignQA: a multimodal benchmark for evaluating large language models\u2019 understanding of engineering documentation. arXiv preprint arXiv:2404.07917 (2024)","DOI":"10.1115\/1.4067333"},{"key":"11_CR9","unstructured":"Team GLM, et\u00a0al.: ChatGLM: a family of large language models from GLM-130B to GLM-4 all tools. arXiv preprint arXiv:2406.12793 (2024)"},{"key":"11_CR10","unstructured":"Gu, Z., Zhu, B., Zhu, G., Chen, Y., Tang, M., Wang, J.: AnomalyGPT: detecting industrial anomalies using large vision-language models (2023). https:\/\/arxiv.org\/abs\/2308.15366"},{"key":"11_CR11","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. ICLR 1(2), 3 (2022)"},{"key":"11_CR12","unstructured":"Jie, S., Tang, Y., Ding, N., Deng, Z.H., Han, K., Wang, Y.: Memory-space visual prompting for efficient vision-language fine-tuning (2024). https:\/\/arxiv.org\/abs\/2405.05615"},{"issue":"3","key":"11_CR13","doi-asserted-by":"publisher","first-page":"197","DOI":"10.1007\/s40171-021-00272-y","volume":"22","author":"M Johnson","year":"2021","unstructured":"Johnson, M., et al.: Impact of Big Data and artificial intelligence on industry: developing a workforce roadmap for a data driven economy. Glob. J. Flex. Syst. Manag. 22(3), 197\u2013217 (2021)","journal-title":"Glob. J. Flex. Syst. Manag."},{"key":"11_CR14","doi-asserted-by":"crossref","unstructured":"Khattak, M.U., Rasheed, H., Maaz, M., Khan, S., Khan, F.S.: MaPLE: multi-modal prompt learning (2023). https:\/\/arxiv.org\/abs\/2210.03117","DOI":"10.1109\/CVPR52729.2023.01832"},{"key":"11_CR15","doi-asserted-by":"crossref","unstructured":"Lee, S.I., et al.: Vision transformer models for mobile\/edge devices: a survey. Multimedia Syst. 30(2), 109 (2024)","DOI":"10.1007\/s00530-024-01312-0"},{"key":"11_CR16","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. arXiv preprint arXiv:2104.08691 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"11_CR17","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: bootstrapping language-image pre-training with frozen image encoders and large language models. In: International Conference on Machine Learning, pp. 19730\u201319742. PMLR (2023)"},{"key":"11_CR18","unstructured":"Liu, T., Wu, Z., Xiong, W., Chen, J., Jiang, Y.G.: Unified multimodal pre-training and prompt-based tuning for vision-language understanding and generation. arXiv preprint arXiv:2112.05587 (2021)"},{"key":"11_CR19","unstructured":"Lozupone, G., Bria, A., Fontanella, F., Meijer, F.J., De\u00a0Stefano, C.: Axial: attention-based explainability for interpretable Alzheimer\u2019s localized diagnosis using 2D CNNs on 3D MRI brain scans. arXiv preprint arXiv:2407.02418 (2024)"},{"key":"11_CR20","unstructured":"Van\u00a0der Maaten, L., Hinton, G.: Visualizing data using t-SNE. J. Mach. Learn. Res. 9(11) (2008)"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Miao, Y., Li, S., Tang, J., Wang, T.: MuDPT: multi-modal deep-symphysis prompt tuning for large pre-trained vision-language models. In: 2023 IEEE International Conference on Multimedia and Expo (ICME), pp. 25\u201330. 
IEEE (2023)","DOI":"10.1109\/ICME55011.2023.00013"},{"key":"11_CR22","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1016\/j.inffus.2021.12.003","volume":"81","author":"A Rahate","year":"2022","unstructured":"Rahate, A., Walambe, R., Ramanna, S., Kotecha, K.: Multimodal co-learning: challenges, applications with datasets, recent advances and future directions. Inf. Fus. 81, 203\u2013239 (2022)","journal-title":"Inf. Fus."},{"key":"11_CR23","unstructured":"Shen, Y., Song, K., Tan, X., Li, D., Lu, W., Zhuang, Y.: HuggingGPT: solving ai tasks with ChatGPT and its friends in Hugging Face (2023). https:\/\/arxiv.org\/abs\/2303.17580"},{"key":"11_CR24","first-page":"1","volume":"71","author":"Y Song","year":"2022","unstructured":"Song, Y., Liu, Z., Ling, S., Tang, R., Duan, G., Tan, J.: Coarse-to-fine few-shot defect recognition with dynamic weighting and joint metric. IEEE Trans. Instrum. Meas. 71, 1\u201310 (2022)","journal-title":"IEEE Trans. Instrum. Meas."},{"key":"11_CR25","doi-asserted-by":"crossref","unstructured":"Tian, X., Zou, S., Yang, Z., Zhang, J.: ArGue: attribute-guided prompt tuning for vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 28578\u201328587 (2024)","DOI":"10.1109\/CVPR52733.2024.02700"},{"key":"11_CR26","unstructured":"Wang, H., et al.: Parameter-efficient tuning of large-scale multimodal foundation model (2023). https:\/\/arxiv.org\/abs\/2305.08381"},{"key":"11_CR27","unstructured":"Wang, P., et\u00a0al.: Qwen2-VL: enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191 (2024)"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Xiong, W., et al.: Simple local attentions remain competitive for long-context tasks. arXiv preprint arXiv:2112.07210 (2021)","DOI":"10.18653\/v1\/2022.naacl-main.144"},{"key":"11_CR29","unstructured":"Zhao, G., Lin, J., Zhang, Z., Ren, X., Su, Q., Sun, X.: Explicit sparse transformer: concentrated attention through explicit selection. 
arXiv preprint arXiv:1912.11637 (2019)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5567-3_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T21:14:29Z","timestamp":1769116469000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5567-3_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819555666","9789819555673"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5567-3_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shanghai","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15 October 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18 October 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/2025.prcv.cn\/index.asp","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
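The object above is a Crossref REST API work record (message-type "work") for the STDC chapter. As a minimal sketch of how such a record can be retrieved and the fields shown above read back, the Python below queries the public Crossref endpoint https://api.crossref.org/works/{doi} with the DOI from the record; it assumes the third-party `requests` package is installed, and all field names are taken directly from the record itself.

import requests

DOI = "10.1007/978-981-95-5567-3_11"  # "DOI" field from the record above

resp = requests.get(f"https://api.crossref.org/works/{DOI}", timeout=30)
resp.raise_for_status()
payload = resp.json()

# A successful lookup mirrors the envelope above:
# {"status": "ok", "message-type": "work", "message": {...}}
work = payload["message"]
print(work["title"][0])           # chapter title
print(work["page"])               # "151-164"
print(work["references-count"])   # 29

# Every bibliography entry carries a "key"; "DOI" and "unstructured"
# are optional per entry, so read them defensively.
for ref in work.get("reference", []):
    print(ref["key"], ref.get("DOI", "-"), ref.get("unstructured", "")[:70])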