{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,27]],"date-time":"2025-11-27T10:54:38Z","timestamp":1764240878678,"version":"build-2065373602"},"reference-count":219,"publisher":"Springer Science and Business Media LLC","issue":"10","license":[{"start":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:00:00Z","timestamp":1750204800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:00:00Z","timestamp":1750204800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["Grant 623B2063","Grant 62125603","Grant 62321005","Grant 62336004"],"award-info":[{"award-number":["Grant 623B2063","Grant 62125603","Grant 62321005","Grant 62336004"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key Research and Development Program of China","award":["Grant 2022ZD0160102"],"award-info":[{"award-number":["Grant 2022ZD0160102"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,10]]},"DOI":"10.1007\/s11263-025-02502-7","type":"journal-article","created":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T08:22:19Z","timestamp":1750234939000},"page":"6639-6667","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Vision Generalist Model: A Survey"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9007-1210","authenticated-orcid":false,"given":"Ziyi","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongming","family":"Rao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuofeng","family":"Sun","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinrun","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Wei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xumin","family":"Yu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zuyan","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanbo","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongmin","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jie","family":"Zhou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6121-5529","authenticated-orcid":false,"given":"Jiwen","family":"Lu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,6,18]]},"reference":[{"key":"2502_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S., & Avila, R. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2502_CR2","unstructured":"Agarap, A. (2018). Deep learning using rectified linear units (relu). arXiv preprint arXiv:1803.08375."},{"key":"2502_CR3","unstructured":"Alayrac, J.-B., Donahue, J., Luc, P., Miech, A., Barr, I., Hasson, Y., Lenc, K., Mensch, A., Millican, K., Reynolds, M., & Ring, R. (2022). Flamingo: a visual language model for few-shot learning. NeurIPS."},{"key":"2502_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., & Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 6077\u20136086.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"2502_CR5","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., & Klein, D. (2016). Neural module networks. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 39\u201348.","DOI":"10.1109\/CVPR.2016.12"},{"key":"2502_CR6","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C. L., & Parikh, D. (2015). Vqa: Visual question answering. In Proceedings of the IEEE international conference on computer vision, pp. 2425\u20132433.","DOI":"10.1109\/ICCV.2015.279"},{"key":"2502_CR7","unstructured":"Ataallah, K., Shen, X., Abdelrahman, E., Sleiman, E., Zhu, D., Ding, J., & Elhoseiny, M. (2024). Minigpt4-video: Advancing multimodal llms for video understanding with interleaved visual-textual tokens. arXiv preprint arXiv:2404.03413."},{"key":"2502_CR8","unstructured":"Awais, M., Naseer, M., Khan, S., Anwer, R. M., Cholakkal, H., Shah, M., Yang, M.-H., & Khan, F. S. (2023). Foundational models defining a new era in vision: A survey and outlook. arXiv preprint arXiv:2307.13721."},{"key":"2502_CR9","unstructured":"Ba, J. L. (2016). Layer normalization. arXiv preprint arXiv:1607.06450."},{"key":"2502_CR10","doi-asserted-by":"crossref","unstructured":"Bachmann, R., Mizrahi, D., Atanov, A., & Zamir, A. (2022). Multimae: Multi-modal multi-task masked autoencoders. In ECCV.","DOI":"10.1007\/978-3-031-19836-6_20"},{"key":"2502_CR11","unstructured":"Bae, S., Byun, H., Oh, C., Cho, Y.-S., & Song, K. (2022). Graph perceiver io: A general architecture for graph structured data. arXiv preprint arXiv:2209.06418."},{"key":"2502_CR12","unstructured":"Baevski, A., Hsu, W.-N., Xu, Q., Babu, A., Gu, J., & Auli, M. (2022). Data2vec: A general framework for self-supervised learning in speech, vision and language. In International Conference on Machine Learning, pp. 1298\u20131312. PMLR."},{"key":"2502_CR13","unstructured":"Bai, J., Men, R., Yang, H., Ren, X., Dang, K., Zhang, Y., Zhou, X., Wang, P., Tan, S., Yang, A., Cui, Z. (2022). Ofasys: A multi-modal multi-task learning system for building generalist models. arXiv preprint arXiv:2212.04408."},{"key":"2502_CR14","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023). Qwen-vl: A versatile vision-language model for understanding, localization, text reading, and beyond. arXiv preprint arXiv:2308.12966."},{"key":"2502_CR15","unstructured":"Bai, S., Chen, K., Liu, X., Wang, J., Ge, W., Song, S., Dang, K., Wang, P., Wang, S., Tang, J., & Zhong, H. (2025). Qwen2. 5-vl technical report. arXiv preprint arXiv:2502.13923."},{"key":"2502_CR16","doi-asserted-by":"crossref","unstructured":"Bai, Y., Geng, X., Mangalam, K., Bar, A., Yuille, A. L., Darrell, T., Malik, J., & Efros, A. A. (2024). Sequential modeling enables scalable learning for large vision models. In CVPR.","DOI":"10.1109\/CVPR52733.2024.02157"},{"key":"2502_CR17","first-page":"25005","volume":"35","author":"A Bar","year":"2022","unstructured":"Bar, A., Gandelsman, Y., Darrell, T., Globerson, A., & Efros, A. (2022). Visual prompting via image inpainting. Advances in Neural Information Processing Systems, 35, 25005\u201325017.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR18","unstructured":"Bengio, Y., L\u00e9onard, N., & Courville, A. (2013). Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:1308.3432."},{"key":"2502_CR19","doi-asserted-by":"crossref","unstructured":"Bhattacharjee, D., Zhang, T., S\u00fcsstrunk, S., & Salzmann, M. (2022). Mult: An end-to-end multitask learning transformer. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12031\u201312041.","DOI":"10.1109\/CVPR52688.2022.01172"},{"key":"2502_CR20","unstructured":"Brohan, A., Brown, N., Carbajal, J., Chebotar, Y., Chen, X., Choromanski, K., Ding, T., Driess, D., Dubey, A., Finn, C., & Florence, P. (2023). Rt-2: Vision-language-action models transfer web knowledge to robotic control. arXiv preprint arXiv:2307.15818."},{"issue":"3","key":"2502_CR21","doi-asserted-by":"publisher","first-page":"404","DOI":"10.1037\/0033-295X.97.3.404","volume":"97","author":"Patricia A Carpenter","year":"1990","unstructured":"Carpenter, Patricia A., Just, Marcel A., & Shell, P. (1990). What one intelligence test measures: a theoretical account of the processing in the raven progressive matrices test. Psychological review, 97(3), 404.","journal-title":"Psychological review"},{"key":"2502_CR22","unstructured":"Carreira, J., Koppula, S., Zoran, D., Recasens, A., Ionescu, C., Henaff, O., Shelhamer, E., Arandjelovic, R., Botvinick, M., Vinyals, O., & Simonyan, K. (2022). Hip: Hierarchical perceiver. arXiv preprint arXiv:2202.10890."},{"key":"2502_CR23","doi-asserted-by":"crossref","unstructured":"Changpinyo, S., Sharma, P., Ding, N., & Soricut, R. (2021). Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3558\u20133568.","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"2502_CR24","unstructured":"Chen, J., Zhu, D., Shen, X., Li, X., Liu, Z., Zhang, P., Krishnamoorthi, R., Chandra, V., Xiong, Y., & Elhoseiny, M. (2023a). Minigpt-v2: large language model as a unified interface for vision-language multi-task learning. arXiv preprint arXiv:2310.09478."},{"key":"2502_CR25","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., & Zhao, R. (2023b). Shikra: Unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195."},{"key":"2502_CR26","doi-asserted-by":"crossref","unstructured":"Chen, M., Peng, H., Fu, J., & Ling, H. (2021). Autoformer: Searching transformers for visual recognition. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 12270\u201312280.","DOI":"10.1109\/ICCV48922.2021.01205"},{"key":"2502_CR27","doi-asserted-by":"crossref","unstructured":"Chen, T., Li, L., Saxena, S., Hinton, G., & Fleet, D. J. (2022a). A generalist framework for panoptic segmentation of images and videos. arXiv preprint arXiv:2210.06366.","DOI":"10.1109\/ICCV51070.2023.00090"},{"key":"2502_CR28","first-page":"31333","volume":"35","author":"T Chen","year":"2022","unstructured":"Chen, T., Saxena, S., Li, L., Lin, T.-Y., Fleet, David J., & Hinton, G. E. (2022). A unified sequence interface for vision tasks. Advances in Neural Information Processing Systems, 35, 31333\u201331346.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR29","unstructured":"Chen, X., Wang, X., Changpinyo, S., Piergiovanni, A., Padlewski, P., Salz, D., Goodman, S., Grycner, A., Mustafa, B., Beyer, L., & Kolesnikov, A. (2022c). Pali: A jointly-scaled multilingual language-image model. arXiv preprint arXiv:2209.06794"},{"key":"2502_CR30","doi-asserted-by":"crossref","unstructured":"Chen, X., Djolonga, J., Padlewski, P., Mustafa, B., Changpinyo, S., Wu, J., Ruiz, C. R., Goodman, S., Wang, X., Tay, Y., & Shakeri, S. (2023c). Pali-x: On scaling up a multilingual vision and language model. arXiv preprint arXiv:2305.18565","DOI":"10.1109\/CVPR52733.2024.01368"},{"key":"2502_CR31","doi-asserted-by":"crossref","unstructured":"Chen, X., Djolonga, J., Padlewski, P., Mustafa, B., Changpinyo, S., Wu, J., Ruiz, C. R., Goodman, S., Wang, X., Tay, Y., & Shakeri, S. (2023d). Pali-x: On scaling up a multilingual vision and language model. arXiv preprint arXiv:2305.18565","DOI":"10.1109\/CVPR52733.2024.01368"},{"key":"2502_CR32","doi-asserted-by":"crossref","unstructured":"Chen, X., Liu, Y., Pu, Y., Zhang, W., Zhou, J., Qiao, Y., & Dong, C. (2024). Learning a low-level vision generalist via visual task prompt. In ACM MM.","DOI":"10.1145\/3664647.3681621"},{"key":"2502_CR33","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., & Zitnick, C. L. (2015). Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325"},{"key":"2502_CR34","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., Kholy, A. E., Ahmed, F., Gan, Z., Cheng, Y., & Liu, J. (2020a). Uniter: Universal image-text representation learning. In European conference on computer vision, pp. 104\u2013120. Springer.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"2502_CR35","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., Ahmed Kholy, E., Ahmed, F., Gan, Z., Cheng, Y., & Liu, J. (2020b). Uniter: Universal image-text representation learning. In European conference on computer vision, pp. 104\u2013120. Springer.","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"2502_CR36","unstructured":"Chen, Z., Badrinarayanan, V., Lee, C.-Y., & Rabinovich, A. (2018). Gradnorm: Gradient normalization for adaptive loss balancing in deep multitask networks. In International conference on machine learning, pp. 794\u2013803. PMLR."},{"key":"2502_CR37","first-page":"17864","volume":"34","author":"B Cheng","year":"2021","unstructured":"Cheng, B., Schwing, A., & Kirillov, A. (2021). Per-pixel classification is not all you need for semantic segmentation. Advances in Neural Information Processing Systems, 34, 17864\u201317875.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR38","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A. G., Kirillov, A., & Girdhar, R. (2022). Masked-attention mask transformer for universal image segmentation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 1290\u20131299.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"2502_CR39","unstructured":"Cho, J., Lei, J., Tan, H., & Bansal, M. (2021). Unifying vision-and-language tasks via text generation. In ICML."},{"key":"2502_CR40","unstructured":"Chu, X., Tian, Z., Zhang, B., Wang, X., & Shen, C. (2021). Conditional positional encodings for vision transformers. arXiv preprint arXiv:2102.10882"},{"key":"2502_CR41","doi-asserted-by":"crossref","unstructured":"Dai, J., He, K., & Sun, J. (2016). Instance-aware semantic segmentation via multi-task network cascades. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3150\u20133158.","DOI":"10.1109\/CVPR.2016.343"},{"key":"2502_CR42","unstructured":"Devlin, J., Chang, M.-W., Lee, K., & Toutanova, K. (2018). Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805"},{"key":"2502_CR43","doi-asserted-by":"crossref","unstructured":"Ding, J., Xue, N., Xia, G.-S., & Dai, D. (2022). Decoupling zero-shot semantic segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11583\u201311592.","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"2502_CR44","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., & Uszkoreit, J. (2020). An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929"},{"key":"2502_CR45","unstructured":"Driess, D., Xia, F., Sajjadi, M. S., Lynch, C., Chowdhery, A., Ichter, B., Wahid, A., Tompson, J., Vuong, Q., Yu, T., & Chebotar, Y. (2023). Palm-e: An embodied multimodal language model. arXiv preprint arXiv:2303.03378"},{"key":"2502_CR46","unstructured":"Du, N., Huang, Y., Dai, A. M., Tong, S., Lepikhin, D., Xu, Y., Krikun, M., Zhou, Y., Yu, A. W., Firat, O., Zoph, B. (2022). Glam: Efficient scaling of language models with mixture-of-experts. In International Conference on Machine Learning, pp. 5547\u20135569. PMLR."},{"key":"2502_CR47","doi-asserted-by":"crossref","unstructured":"Duan, J., Chen, L., Tran, S., Yang, J., Xu, Y., Zeng, B., & Chilimbi, T. (2022). Multi-modal alignment using representation codebook. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15651\u201315660.","DOI":"10.1109\/CVPR52688.2022.01520"},{"key":"2502_CR48","doi-asserted-by":"crossref","unstructured":"Duong, L., Cohn, T., Bird, S., & Cook, P. (2015). Low resource dependency parsing: Cross-lingual parameter sharing in a neural network parser. In Proceedings of the 53rd annual meeting of the Association for Computational Linguistics and the 7th international joint conference on natural language processing (volume 2: short papers), pp. 845\u2013850.","DOI":"10.3115\/v1\/P15-2139"},{"key":"2502_CR49","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., & Ommer, B. (2021). Taming transformers for high-resolution image synthesis. In CVPR.","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2502_CR50","unstructured":"Fang, X., Mao, K., Duan, H., Zhao, X., Li, Y., Lin, D., & Chen, K. (2024). Mmbench-video: A long-form multi-shot benchmark for holistic video understanding. NeurIPS."},{"issue":"120","key":"2502_CR51","first-page":"1","volume":"23","author":"W Fedus","year":"2022","unstructured":"Fedus, W., Zoph, B., & Shazeer, N. (2022). Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, 23(120), 1\u201339.","journal-title":"Journal of Machine Learning Research"},{"key":"2502_CR52","unstructured":"Fu, C., Chen, P., Shen, Y., Qin, Y., Zhang, M., Lin, X., Yang, J., Zheng, X., Li, K., Sun, X., & Liu, K. (2023). Mme: A comprehensive evaluation benchmark for multimodal large language models. arXiv preprint arXiv:2306.13394."},{"key":"2502_CR53","doi-asserted-by":"crossref","unstructured":"Fu, C., Dai, Y., Luo, Y., Li, L., Ren, S., Zhang, R., Wang, Z., Zhou, C., Shen, Y., Zhang, M., & Chen, P. (2024a). Video-mme: The first-ever comprehensive evaluation benchmark of multi-modal llms in video analysis. arXiv preprint arXiv:2405.21075.","DOI":"10.1109\/CVPR52734.2025.02245"},{"key":"2502_CR54","unstructured":"Fu, C., Zhang, Y.-F., Yin, S., Li, B., Fang, X., Zhao, S., Duan, H., Sun, X., Liu, Z., Wang, L., & Shan, C. (2024b). Mme-survey: A comprehensive survey on evaluation of multimodal llms. arXiv preprint arXiv:2411.15296"},{"key":"2502_CR55","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D. H., Yang, D., Rohrbach, A., Darrell, T., & Rohrbach, M. (2016). Multimodal compact bilinear pooling for visual question answering and visual grounding. arXiv preprint arXiv:1606.01847","DOI":"10.18653\/v1\/D16-1044"},{"key":"2502_CR56","unstructured":"Gao, P., Han, J., Zhang, R., Lin, Z., Geng, S., Zhou, A., Zhang, W., Lu, P., He, C., Yue, X., Li, H. (2023). Llama-adapter v2: Parameter-efficient visual instruction model. arXiv preprint arXiv:2304.15010"},{"key":"2502_CR57","doi-asserted-by":"crossref","unstructured":"Gao, Y., Bai, H., Jie, Z., Ma, J., Jia, K., & Liu, W. (2020). Mtl-nas: Task-agnostic neural architecture search towards general-purpose multi-task learning. In Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp. 11543\u201311552","DOI":"10.1109\/CVPR42600.2020.01156"},{"key":"2502_CR58","doi-asserted-by":"crossref","unstructured":"Geng, Z., Yang, B., Hang, T., Li, C., Gu, S., Zhang, T., Bao, J., Zhang, Z., Li, H., Hu, H., & Chen, D. (2024). Instructdiffusion: A generalist modeling interface for vision tasks. In CVPR.","DOI":"10.1109\/CVPR52733.2024.01208"},{"key":"2502_CR59","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., & Lin, T.-Y. (2022). Scaling open-vocabulary image segmentation with image-level labels. In European Conference on Computer Vision, pp. 540\u2013557. Springer.","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"2502_CR60","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Ravi, N., van der Maaten, L., Joulin, A., & Misra, I. (2022). Omnivore: A single model for many visual modalities. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16102\u201316112.","DOI":"10.1109\/CVPR52688.2022.01563"},{"key":"2502_CR61","doi-asserted-by":"crossref","unstructured":"Gormley, I. C., & Fr\u00fchwirth-Schnatter, S. (2019). Mixture of experts models. Handbook of mixture analysis, pp. 271\u2013307.","DOI":"10.1201\/9780429055911-12"},{"key":"2502_CR62","doi-asserted-by":"crossref","unstructured":"Graham, B., Engelcke, M., & Van Der Maaten, L. (2018). 3d semantic segmentation with submanifold sparse convolutional networks. In CVPR, pp. 9224\u20139232.","DOI":"10.1109\/CVPR.2018.00961"},{"key":"2502_CR63","unstructured":"Gu, X., Lin, T.-Y., Kuo, W., & Cui, Y. (2021). Zero-shot detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921, 2 (3): 4."},{"key":"2502_CR64","doi-asserted-by":"crossref","unstructured":"G\u00fcera, D., & Delp, E. J. (2018). Deepfake video detection using recurrent neural networks. In AVSS, pp. 1\u20136. IEEE.","DOI":"10.1109\/AVSS.2018.8639163"},{"key":"2502_CR65","doi-asserted-by":"crossref","unstructured":"Guo, M., Haque, A., Huang, D.-A., Yeung, S., & Fei-Fei, L. (2018). Dynamic task prioritization for multitask learning. In Proceedings of the European conference on computer vision (ECCV), pp. 270\u2013287.","DOI":"10.1007\/978-3-030-01270-0_17"},{"key":"2502_CR66","unstructured":"Gupta, T., Marten, R., Kembhavi, A., & Hoiem, D. (2022). Grit: General robust image task benchmark. arXiv preprint arXiv:2204.13653"},{"key":"2502_CR67","doi-asserted-by":"crossref","unstructured":"Han, J., Gong, K., Zhang, Y., Wang, J., Zhang, K., Lin, D., Qiao, Y., Gao, P., & Yue, X. (2024). Onellm: One framework to align all modalities with language. In CVPR.","DOI":"10.1109\/CVPR52733.2024.02510"},{"key":"2502_CR68","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016a). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 770\u2013778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2502_CR69","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2016b). Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"2502_CR70","doi-asserted-by":"crossref","unstructured":"He, K., Gkioxari, G., Doll\u00e1r, P., & Girshick, R. (2017). Mask r-cnn. In Proceedings of the IEEE international conference on computer vision, pp. 2961\u20132969.","DOI":"10.1109\/ICCV.2017.322"},{"key":"2502_CR71","unstructured":"Hinton, G., Vinyals, O., & Dean, J. (2015). Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531."},{"key":"2502_CR72","unstructured":"Jonathan Ho & Salimans, T. (2022). Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598,"},{"key":"2502_CR73","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in neural information processing systems, 33, 6840\u20136851.","journal-title":"Advances in neural information processing systems"},{"issue":"8","key":"2502_CR74","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural computation, 9(8), 1735\u20131780.","journal-title":"Neural computation"},{"key":"2502_CR75","doi-asserted-by":"crossref","unstructured":"Hu, R., & Singh, A. (2021). Unit: Multimodal multitask learning with a unified transformer. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00147"},{"key":"2502_CR76","unstructured":"Huang, J., Yong, S., Ma, X., Linghu, X., Li, P., Wang, Y., Li, Q., Zhu, S.-C., Jia, B., & Huang, S. (2023a). An embodied generalist agent in 3d world. arXiv preprint arXiv:2311.12871"},{"key":"2502_CR77","unstructured":"Huang, S., Dong, L., Wang, W., Hao, Y., Singhal, S., Ma, S., Lv, T., Cui, L., Mohammed, O. K., & Liu, Q. (2023b). Language is not all you need: Aligning perception with language models. arXiv preprint arXiv:2302.14045"},{"key":"2502_CR78","unstructured":"Jaegle, A., Borgeaud, S., Alayrac, J.-B., Doersch, C., Catalin Ionescu, Ding, D., Koppula, S., Zoran, D., Brock, A., Shelhamer, E., & H\u00e9naff, O. (2021a). Perceiver io: A general architecture for structured inputs & outputs. arXiv preprint arXiv:2107.14795"},{"key":"2502_CR79","unstructured":"Jaegle, A., Gimeno, F., Brock, A., Vinyals, O., Zisserman, A., & Carreira, J. (2021b). Perceiver: General perception with iterative attention. In ICML."},{"key":"2502_CR80","doi-asserted-by":"crossref","unstructured":"Jain, J., Li, J., Chiu, M. T., Hassani, A., Orlov, N., & Shi, H. (2023). Oneformer: One transformer to rule universal image segmentation. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"2502_CR81","unstructured":"Jiang, Y., Gupta, A., Zhang, Z., Wang, G., Dou, Y., Chen, Y., Li Fei-Fei, Anandkumar, A., Zhu, Y., & Fan, L. (2022). Vima: General robot manipulation with multimodal prompts. arXiv preprint arXiv:2210.03094, 2 (3): 6."},{"key":"2502_CR82","unstructured":"Jin, Y., Xu, K., Chen, L., Liao, C., Tan, J., Chen, B., Lei, C., Liu, A., Song, C., & Lei, X., 2023. Unified language-vision pretraining with dynamic discrete visual tokenization. arXiv preprint arXiv:2309.04669"},{"key":"2502_CR83","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3128\u20133137.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"2502_CR84","doi-asserted-by":"crossref","unstructured":"Kendall, A., Gal, Y., & Cipolla, R. (2018). Multi-task learning using uncertainty to weigh losses for scene geometry and semantics. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 7482\u20137491.","DOI":"10.1109\/CVPR.2018.00781"},{"key":"2502_CR85","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Alexander C Berg, Lo, W.-Y., & Doll\u00e1r, P. (2023a). Segment anything. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2502_CR86","doi-asserted-by":"crossref","unstructured":"Kirillov, A., Mintun, E., Ravi, N., Mao, H., Rolland, C., Gustafson, L., Xiao, T., Whitehead, S., Berg, A. C., Lo, W.-Y., & Doll\u00e1r, P. 2023b. Segment anything. arXiv preprint arXiv:2304.02643","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"2502_CR87","first-page":"26295","volume":"35","author":"A Kolesnikov","year":"2022","unstructured":"Kolesnikov, A., Andr\u00e9 Pinto, S., Beyer, L., Zhai, X., Harmsen, J., & Houlsby, N. (2022). Uvim: A unified modeling approach for vision with learned guiding codes. Advances in Neural Information Processing Systems, 35, 26295\u201326308.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR88","unstructured":"Kotelnikov, A., Baranchuk, D., Rubachev, I., & Babenko, A. (2023). Tabddpm: Modelling tabular data with diffusion models. In International Conference on Machine Learning, pp. 17564\u201317579. PMLR."},{"key":"2502_CR89","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D. A., & Bernstein, M. S. (2017). Visual genome: Connecting language and vision using crowdsourced dense image annotations. International journal of computer vision, 123, 32\u201373.","journal-title":"International journal of computer vision"},{"key":"2502_CR90","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25."},{"issue":"12","key":"2502_CR91","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni, G., Premraj, V., Ordonez, V., Dhar, S., Li, S., Choi, Y., Berg, A. C., & Berg, T. L. (2013). Babytalk: Understanding and generating simple image descriptions. IEEE transactions on pattern analysis and machine intelligence, 35(12), 2891\u20132903.","journal-title":"IEEE transactions on pattern analysis and machine intelligence"},{"key":"2502_CR92","unstructured":"Lewis, M., Bhosale, S., Dettmers, T., Goyal, N., & Zettlemoyer, L. (2021). Base layers: Simplifying training of large, sparse models. In International Conference on Machine Learning, pp. 6265\u20136274. PMLR."},{"key":"2502_CR93","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., & Shan, Y. (2023a). Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"2502_CR94","doi-asserted-by":"crossref","unstructured":"Li, C., Gan, Z., Yang, Z., Yang, J., Li, L., Wang, L., & Gao, J. (2023b). Multimodal foundation models: From specialists to general-purpose assistants. arXiv preprint arXiv:2309.10020","DOI":"10.1561\/9781638283379"},{"key":"2502_CR95","doi-asserted-by":"crossref","unstructured":"Li, F., Zhang, H., Xu, H., Liu, S., Zhang, L., Ni, L. M., & Shum, H.-Y. (2023c). Mask dino: Towards a unified transformer-based framework for object detection and segmentation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 3041\u20133050.","DOI":"10.1109\/CVPR52729.2023.00297"},{"key":"2502_CR96","doi-asserted-by":"crossref","unstructured":"Li, H., Zhu, J., Jiang, X., Zhu, X., Li, H., Yuan, C., Wang, X., Qiao, Y., Wang, X., Wang, W., Dai, J. (2023d). Uni-perceiver v2: A generalist model for large-scale vision and vision-language tasks. In CVPR.","DOI":"10.1109\/CVPR52729.2023.00264"},{"key":"2502_CR97","unstructured":"Li, J., Li, D., Savarese, S., & Hoi, S. (2023e). Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In ICML."},{"key":"2502_CR98","doi-asserted-by":"crossref","unstructured":"Li, L., Gan, Z., Lin, K., Lin, C.-C., Liu, Z., Liu, C., & Wang, L. (2023f). Lavender: Unifying video-language understanding as masked language modeling. In CVPR.","DOI":"10.1109\/CVPR52729.2023.02214"},{"key":"2502_CR99","unstructured":"Li, L. H., Yatskar, M., Yin, D., Hsieh, C.-J., & Chang, K.-W. (2019). Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557"},{"key":"2502_CR100","doi-asserted-by":"crossref","unstructured":"Li, L. H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., Wang, L., Yuan, L., Zhang, L., Hwang, J.-N., & Chang, K.W. (2022). Grounded language-image pre-training. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975.","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"2502_CR101","doi-asserted-by":"crossref","unstructured":"Li, X., Yuan, H., Li, W., Ding, H., Wu, S., Zhang, W., Li, Y., Chen, K., & Loy, C. C. (2024). Omg-seg: Is one model good enough for all segmentation? In CVPR.","DOI":"10.1109\/CVPR52733.2024.02640"},{"key":"2502_CR102","doi-asserted-by":"crossref","unstructured":"Liang, F., Wu, B., Dai, X., Li, K., Zhao, Y., Zhang, H., Zhang, P., Vajda, P., & Marculescu, D. (2023). Open-vocabulary semantic segmentation with mask-adapted clip. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7061\u20137070.","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"2502_CR103","doi-asserted-by":"crossref","unstructured":"Liang, J., Meyerson, E., & Miikkulainen, R. (2018). Evolutionary architecture search for deep multitask networks. In Proceedings of the genetic and evolutionary computation conference, pp. 466\u2013473.","DOI":"10.1145\/3205455.3205489"},{"key":"2502_CR104","doi-asserted-by":"crossref","unstructured":"Lin, B., Ye, Y., Zhu, B., Cui, J., Ning, M., Jin, P., & Yuan, L. (2023). Video-llava: Learning united visual representation by alignment before projection. arXiv preprint arXiv:2311.10122","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"2502_CR105","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2502_CR106","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y. J. (2024a). Improved baselines with visual instruction tuning. In CVPR.","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2502_CR107","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y. J. (2024b). Visual instruction tuning. NeurIPS, 36."},{"key":"2502_CR108","doi-asserted-by":"crossref","unstructured":"Liu, S., Johns, E., & Davison, A. J. (2019). End-to-end multi-task learning with attention. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 1871\u20131880.","DOI":"10.1109\/CVPR.2019.00197"},{"key":"2502_CR109","doi-asserted-by":"crossref","unstructured":"Liu, Y., Duan, H., Zhang, Y., Li, B., Zhang, S., Zhao, W., Yuan, Y., Wang, J., He, C., Liu, Z., & Chen, K. (2024c). Mmbench: Is your multi-modal model an all-around player? In European conference on computer vision, pp. 216\u2013233. Springer.","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"2502_CR110","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., Lin, S., & Guo, B. (2021). Swin transformer: Hierarchical vision transformer using shifted windows. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"2502_CR111","doi-asserted-by":"crossref","unstructured":"Liu, Z., Ning, J., Cao, Y., Wei, Y., Zhang, Z., Lin, S., & Hu, H. (2022). Video swin transformer. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"2502_CR112","unstructured":"Liu, Z., He, Y., Wang, W., Wang, W., Wang, Y., Chen, S., Zhang, Q., Lai, Z., Yang, Y., Li, Q., & Yu, J. (2023). Interngpt: Solving vision-centric tasks by interacting with chatgpt beyond language. arXiv preprint arXiv:2305.05662."},{"key":"2502_CR113","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C., & Zhu, J. (2022a). Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. NeurIPS."},{"key":"2502_CR114","doi-asserted-by":"crossref","unstructured":"Lu, J., Goswami, V., Rohrbach, M., Parikh, D., & Lee, S. (2020). 12-in-1: Multi-task vision and language representation learning. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10437\u201310446.","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"2502_CR115","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., & Kembhavi, A. (2022b). Unified-io: A unified model for vision, language, and multi-modal tasks. arXiv preprint arXiv:2206.08916"},{"key":"2502_CR116","unstructured":"Lu, J., Clark, C., Zellers, R., Mottaghi, R., & Kembhavi, A. (2022c). Unified-io: A unified model for vision, language, and multi-modal tasks. arXiv preprint arXiv:2206.08916."},{"key":"2502_CR117","doi-asserted-by":"crossref","unstructured":"Lu, J., Clark, C., Lee, S., Zhang, Z., Khosla, S., Marten, R., Hoiem, D., & Kembhavi, A. (2024). Unified-io 2: Scaling autoregressive multimodal models with vision language audio and action. In CVPR.","DOI":"10.1109\/CVPR52733.2024.02497"},{"key":"2502_CR118","unstructured":"Lu, Y., Xu, C., Wei, X., Xie, X., Tomizuka, M., Keutzer, K., & Zhang, S. (2022d). Open-vocabulary 3d detection via image-level class and debiased cross-modal contrastive learning. arXiv preprint arXiv:2207.01987."},{"key":"2502_CR119","doi-asserted-by":"crossref","unstructured":"Maaz, M., Rasheed, H., Khan, S., & Khan, F. S. (2023). Video-chatgpt: Towards detailed video understanding via large vision and language models. arXiv preprint arXiv:2306.05424.","DOI":"10.18653\/v1\/2024.acl-long.679"},{"key":"2502_CR120","unstructured":"Mangalam, K., Akshulakov, R., & Malik, J. (2023). Egoschema: A diagnostic benchmark for very long-form video language understanding. NeurIPS."},{"key":"2502_CR121","doi-asserted-by":"crossref","unstructured":"Misra, I., Shrivastava, A., Gupta, A., & Hebert, M. (2016). Cross-stitch networks for multi-task learning. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3994\u20134003.","DOI":"10.1109\/CVPR.2016.433"},{"key":"2502_CR122","unstructured":"Mu, Y., Zhang, Q., Hu, M., Wang, W., Ding, M., Jin, J., Wang, B., Dai, J., Qiao, Y., & Luo, P. (2024). Embodiedgpt: Vision-language pre-training via embodied chain of thought. Advances in Neural Information Processing Systems, 36."},{"key":"2502_CR123","doi-asserted-by":"crossref","unstructured":"Nguyen, D.-K., & Okatani, T. (2019). Multi-task learning of hierarchical vision-language representation. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10492\u201310501.","DOI":"10.1109\/CVPR.2019.01074"},{"key":"2502_CR124","unstructured":"Nichol, A., Dhariwal, P., Ramesh, A., Shyam, P., Mishkin, P., McGrew, B., Sutskever, I., & Chen, M. (2021). Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741"},{"key":"2502_CR125","unstructured":"Nichol, A. Q., & Dhariwal, P. (2021). Improved denoising diffusion probabilistic models. In ICML."},{"key":"2502_CR126","doi-asserted-by":"crossref","unstructured":"Ning, J., Li, C., Zhang, Z., Geng, Z., Dai, Q., He, K., & Hu, H. (2023). All in tokens: Unifying output space of visual tasks via soft token. arXiv preprint arXiv:2301.02229.","DOI":"10.1109\/ICCV51070.2023.01822"},{"key":"2502_CR127","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., Wu, J., Jiang, X., Almeida, D., Wainwright, C., Mishkin, P., Zhang, C., Agarwal, S., Slama, K., Ray, A., & Schulman, J. (2022). Training language models to follow instructions with human feedback. NeurIPS, 35, 27730\u201327744.","journal-title":"NeurIPS"},{"key":"2502_CR128","unstructured":"Parisotto, E., Ba, J. L., & Salakhutdinov, R. (2015). Actor-mimic: Deep multitask and transfer reinforcement learning. arXiv preprint arXiv:1511.06342."},{"key":"2502_CR129","unstructured":"Peng, Z., Wang, W., Dong, L., Hao, Y., Huang, S., Ma, S., & Wei, F. (2023). Kosmos-2: Grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824."},{"key":"2502_CR130","doi-asserted-by":"crossref","unstructured":"Peters, M., Neumann, M., Iyyer, M., Gardner, M., Clark, C., Lee, K., & Zettlemoyer, L. (2018). Deep contextualized word representations. In NAACL.","DOI":"10.18653\/v1\/N18-1202"},{"key":"2502_CR131","unstructured":"Polino, A., Pascanu, R., & Alistarh, D. (2018). Model compression via distillation and quantization. arXiv preprint arXiv:1802.05668"},{"key":"2502_CR132","doi-asserted-by":"crossref","unstructured":"Qi, Z., Dong, R., Zhang, S., Geng, H., Han, C., Ge, Z., Yi, L., & Ma, K. (2024). Shapellm: Universal 3d object understanding for embodied interaction. In European Conference on Computer Vision, pp. 214\u2013238. Springer.","DOI":"10.1007\/978-3-031-72775-7_13"},{"key":"2502_CR133","unstructured":"Radford, A., Kim, J. W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., & Krueger, G. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning, pp. 8748\u20138763. PMLR."},{"key":"2502_CR134","unstructured":"Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., Aslanides, J., Henderson, S., Ring, R., Young, S., & Rutherford, E. (2021). Scaling language models: Methods, analysis & insights from training gopher. arXiv preprint arXiv:2112.11446"},{"key":"2502_CR135","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). Exploring the limits of transfer learning with a unified text-to-text transformer. The Journal of Machine Learning Research."},{"key":"2502_CR136","unstructured":"Rahaman, N., Weiss, M., Tr\u00e4uble, F., Locatello, F., Lacoste, A., Bengio, Y., Pal, C., Li, L. E., & Sch\u00f6lkopf, B. (2022). A general purpose neural architecture for geospatial systems. arXiv preprint arXiv:2211.02348"},{"key":"2502_CR137","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In International Conference on Machine Learning, pp. 8821\u20138831. PMLR."},{"key":"2502_CR138","doi-asserted-by":"crossref","unstructured":"Rao, Y., Zhao, W., Chen, G., Tang, Y., Zhu, Z., Huang, G., Zhou, J., & Lu, J. (2022). Denseclip: Language-guided dense prediction with context-aware prompting. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18082\u201318091.","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"2502_CR139","doi-asserted-by":"crossref","unstructured":"Rao, Y., Liu, Z., Zhao, W., Zhou, J., & Lu, J. (2023). Dynamic spatial sparsification for efficient vision transformers and convolutional neural networks. IEEE Transactions on Pattern Analysis and Machine Intelligence.","DOI":"10.1109\/TPAMI.2023.3263826"},{"key":"2502_CR140","doi-asserted-by":"crossref","unstructured":"Raven, J. (2003). Raven progressive matrices. In Handbook of nonverbal assessment, pp. 223\u2013237. Springer.","DOI":"10.1007\/978-1-4615-0153-4_11"},{"key":"2502_CR141","unstructured":"Reed, S., Zolna, K., Parisotto, E., Colmenarejo, S. G., Novikov, A., Gabriel Barth-Maron, Gimenez, M., Sulsky, Y., Kay, J., Springenberg, J. T., & Eccles, T. (2022). A generalist agent. arXiv preprint arXiv:2205.06175."},{"key":"2502_CR142","doi-asserted-by":"crossref","unstructured":"Ren, Z., Huang, Z., Wei, Y., Zhao, Y., Fu, D., Feng, J., & Jin, X. (2024). Pixellm: Pixel reasoning with large multimodal model. In CVPR.","DOI":"10.1109\/CVPR52733.2024.02491"},{"key":"2502_CR143","first-page":"8583","volume":"34","author":"C Riquelme","year":"2021","unstructured":"Riquelme, C., Puigcerver, J., Mustafa, B., Neumann, M., Jenatton, R., Pinto, A. S., Keysers, D., & Houlsby, N. (2021). Scaling vision with sparse mixture of experts. Advances in Neural Information Processing Systems, 34, 8583\u20138595.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR144","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022a). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2502_CR145","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022b). High-resolution image synthesis with latent diffusion models. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2502_CR146","unstructured":"Rosenbaum, C., Klinger, T., & Riemer, M. (2017). Routing networks: Adaptive selection of non-linear functions for multi-task learning. arXiv preprint arXiv:1711.01239"},{"key":"2502_CR147","doi-asserted-by":"publisher","first-page":"4822","DOI":"10.1609\/aaai.v33i01.33014822","volume":"33","author":"S Ruder","year":"2019","unstructured":"Ruder, S., Bingel, J., Augenstein, I., & S\u00f8gaard, A. (2019). Latent multi-task architecture learning. In Proceedings of the AAAI Conference on Artificial Intelligence, 33, 4822\u20134829.","journal-title":"In Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"2502_CR148","doi-asserted-by":"crossref","unstructured":"Rumelhart, D. E., Hinton, G. E., & Williams, R. J. (1985). Learning internal representations by error propagation.","DOI":"10.21236\/ADA164453"},{"key":"2502_CR149","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., & Berg, A. C. (2015). Imagenet large scale visual recognition challenge. International journal of computer vision, 115, 211\u2013252.","journal-title":"International journal of computer vision"},{"key":"2502_CR150","unstructured":"Rusu, A. A., Colmenarejo, S. G., Gulcehre, C., Desjardins, G., Kirkpatrick, J., Pascanu, R., Mnih, V., Kavukcuoglu, K., & Hadsell, R. Policy distillation. arXiv preprint arXiv:1511.06295, 2015."},{"key":"2502_CR151","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., & Schramowski, P. (2022). Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, 35, 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR152","unstructured":"Sener, O., & Koltun, V. (2018). Multi-task learning as multi-objective optimization. NeurIPS, 31."},{"key":"2502_CR153","doi-asserted-by":"crossref","unstructured":"Sharma, P., Ding, N., Goodman, S., & Soricut, R. (2018). Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning. In Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2556\u20132565.","DOI":"10.18653\/v1\/P18-1238"},{"key":"2502_CR154","unstructured":"Sharma, S., Jha, A., Hegde, P., & Ravindran, B. (2017). Learning to multi-task by active sampling. arXiv preprint arXiv:1702.06053."},{"key":"2502_CR155","doi-asserted-by":"crossref","unstructured":"Shih, K. J., Singh, S., & Hoiem, D. (2016). Where to look: Focus regions for visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 4613\u20134621.","DOI":"10.1109\/CVPR.2016.499"},{"key":"2502_CR156","unstructured":"Shukor, M., Dancette, C., Rame, A., & Cord, M. (2023). Unival: Unified model for image, video, audio and language tasks. arXiv preprint arXiv:2307.16184."},{"key":"2502_CR157","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556."},{"key":"2502_CR158","unstructured":"So, J., Oh, C., Lim, Y., Byun, H., Shin, M., & Song, K. (2022). Geodesic multi-modal mixup for robust fine-tuning. arXiv preprint arXiv:2203.03897."},{"key":"2502_CR159","doi-asserted-by":"crossref","unstructured":"Sun, C., Myers, A., Vondrick, C., Murphy, K., & Schmid, C. (2019). Videobert: A joint model for video and language representation learning. In Proceedings of the IEEE\/CVF international conference on computer vision, pp. 7464\u20137473.","DOI":"10.1109\/ICCV.2019.00756"},{"key":"2502_CR160","unstructured":"Sutskever, I. (2014). Sequence to sequence learning with neural networks. arXiv preprint arXiv:1409.3215."},{"key":"2502_CR161","unstructured":"Team, C. (2024). Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818."},{"key":"2502_CR162","unstructured":"Touvron, H., Cord, M., Douze, M., Massa, F., Sablayrolles, A., & J\u00e9gou, H. (2021). Training data-efficient image transformers & distillation through attention. In ICML."},{"key":"2502_CR163","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F., & Rodriguez, A. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971."},{"key":"2502_CR164","unstructured":"Tsimpoukelli, M., Jacob L Menick, Cabi, S., Eslami, S., Vinyals, O., & Hill, F. (2021). Multimodal few-shot learning with frozen language models. NeruIPS."},{"key":"2502_CR165","unstructured":"Van Den Oord, A., & Vinyals, O. (2017). Neural discrete representation learning. Advances in neural information processing systems, 30."},{"key":"2502_CR166","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017a). Attention is all you need. Advances in neural information processing systems, 30."},{"key":"2502_CR167","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., & Polosukhin, I. (2017b). Attention is all you need. NeurIPS."},{"key":"2502_CR168","doi-asserted-by":"crossref","unstructured":"Wang, H., Tang, H., Jiang, L., Shi, S., Naeem, M. F., Li, H., & Schiele, B. (2024).  & Wang, L. Git: Towards generalist vision transformer through universal language interface. In ECCV.","DOI":"10.1007\/978-3-031-73397-0_4"},{"key":"2502_CR169","unstructured":"Wang, J., Yang, Z., Hu, X., Li, L., Lin, K., Gan, Z., Liu, Z., Liu, C., & Wang, L. (2022a). Git: A generative image-to-text transformer for vision and language. arXiv preprint arXiv:2205.14100"},{"key":"2502_CR170","unstructured":"Wang, J., Chen, D., Wu, Z., Luo, C., Zhou, L., Zhao, Y., Xie, Y., Liu, C., Jiang, Y.-G., & Yuan, L. (2022b). Omnivl: One foundation model for image-language and video-language tasks. NeurIPS."},{"key":"2502_CR171","unstructured":"Wang, P., Yang, A., Men, R., Lin, J., Bai, S., Li, Z., Ma, J., Zhou, C., Zhou, J., & Yang, H. (2022c). Ofa: architectures, U., tasks, and modalities through a simple sequence-to-sequence learning framework. In ICML."},{"key":"2502_CR172","unstructured":"Wang, P., Wang, S., Lin, J., Bai, S., Zhou, X., Zhou, J., Wang, X., & Zhou, C. (2023a). One-peace: Exploring one general representation model toward unlimited modalities. arXiv preprint arXiv:2305.11172"},{"key":"2502_CR173","unstructured":"Wang, P., Bai, S., Tan, S., Wang, S., Fan, Z., Bai, J., Chen, K., Liu, X., Wang, J., Ge, W., & Fan, Y. (2024b). Qwen2-vl: Enhancing vision-language model\u2019s perception of the world at any resolution. arXiv preprint arXiv:2409.12191."},{"key":"2502_CR174","unstructured":"Wang, W., He, Z., Hong, W., Cheng, Y., Zhang, X., Qi, J., Gu, X., Huang, S., Xu, B., Dong, Y., & Ding, M. (2024c). Lvbench: An extreme long video understanding benchmark. arXiv preprint arXiv:2406.08035."},{"key":"2502_CR175","unstructured":"Wang, W., Chen, Z., Chen, X., Wu, J., Zhu, X., Zeng, G., Luo, P., Lu, T., Zhou, J., Qiao, Y., Dai, J. (2024d). Visionllm: Large language model is also an open-ended decoder for vision-centric tasks. NeurIPS."},{"key":"2502_CR176","doi-asserted-by":"crossref","unstructured":"Wang, X., Wang, W., Cao, Y., Shen, C., & Huang, T. (2023b). Images speak in images: A generalist painter for in-context visual learning. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6830\u20136839.","DOI":"10.1109\/CVPR52729.2023.00660"},{"key":"2502_CR177","unstructured":"Wang, Y., Li, K., Li, Y., He, Y., Huang, B., Zhao, Z., Zhang, H., Xu, J., Liu, Y., Wang, Z., Xing, S. (2022d). Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191."},{"key":"2502_CR178","unstructured":"Wang, Z., Yu, J., Yu, A. W., Dai, Z., Tsvetkov, Y., & Cao, Y. (2021). Simvlm: Simple visual language model pretraining with weak supervision. arXiv preprint arXiv:2108.10904"},{"key":"2502_CR179","doi-asserted-by":"crossref","unstructured":"Westerlund, M. (2019). The emergence of deepfake technology: A review. Technology innovation management review, 9 (11).","DOI":"10.22215\/timreview\/1282"},{"key":"2502_CR180","doi-asserted-by":"crossref","unstructured":"Wolleb, J., Bieder, F., Sandk\u00fchler, R., & Cattin, P. C. (2022). Diffusion models for medical anomaly detection. In International Conference on Medical image computing and computer-assisted intervention, pp. 35\u201345. Springer.","DOI":"10.1007\/978-3-031-16452-1_4"},{"key":"2502_CR181","unstructured":"Wu, H., Li, D., Chen, B., & Li, J. (2024a). Longvideobench: A benchmark for long-context interleaved video-language understanding. NeurIPS."},{"key":"2502_CR182","unstructured":"Wu, J., Zhong, M., Xing, S., Lai, Z., Liu, Z., Wang, W., Chen, Z., Zhu, X., Lu, L., Lu, T., & Luo, P. (2024b). Visionllm v2: An end-to-end generalist multimodal large language model for hundreds of vision-language tasks. arXiv preprint arXiv:2406.08394."},{"key":"2502_CR183","doi-asserted-by":"crossref","unstructured":"Wu, K., Peng, H., Chen, M., Fu, J., & Chao, H. (2021). Rethinking and improving relative position encoding for vision transformer. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00988"},{"key":"2502_CR184","doi-asserted-by":"crossref","unstructured":"Wyatt, J., Leach, A., Schmon, S. M., & Willcocks, C. G. (2022). Anoddpm: Anomaly detection with denoising diffusion probabilistic models using simplex noise. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 650\u2013656.","DOI":"10.1109\/CVPRW56347.2022.00080"},{"key":"2502_CR185","doi-asserted-by":"crossref","unstructured":"Xu, J., Mello, S. D., Liu, S., Byeon, W., Breuel, T., Kautz, J., & Wang, X. (2022). Groupvit: Semantic segmentation emerges from text supervision. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18134\u201318144.","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"2502_CR186","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning, pp. 2048\u20132057. PMLR."},{"key":"2502_CR187","doi-asserted-by":"crossref","unstructured":"Xu, R., Wang, X., Wang, T., Chen, Y., Pang, J., & Lin, D. (2024). Pointllm: Empowering large language models to understand point clouds. In ECCV.","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"2502_CR188","doi-asserted-by":"crossref","unstructured":"Yan, B., Jiang, Y., Wu, J., Wang, D., Luo, P., Yuan, Z., & Lu, H. (2023). Universal instance perception as object discovery and retrieval. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"2502_CR189","unstructured":"Yan, S., Zhu, T., Wang, Z., Cao, Y., Zhang, M., Ghosh, S., Wu, Y., & Yu, J. (2022). Videococa: Video-text modeling with zero-shot transfer from contrastive captioners. arXiv preprint arXiv:2212.04979."},{"issue":"10","key":"2502_CR190","doi-asserted-by":"publisher","first-page":"3337","DOI":"10.3390\/s18103337","volume":"18","author":"Y Yan","year":"2018","unstructured":"Yan, Y., Mao, Y., & Li, B. (2018). Second: Sparsely embedded convolutional detection. Sensors, 18(10), 3337.","journal-title":"Sensors"},{"key":"2502_CR191","unstructured":"Yang, Y., & Hospedales, T.M. (2016). Trace norm regularised deep multi-task learning. arXiv preprint arXiv:1606.04038."},{"key":"2502_CR192","doi-asserted-by":"crossref","unstructured":"Yang, Z., Jiang, L., Sun, Y., Schiele, B., & Jia, J. (2022). A unified query-based paradigm for point cloud understanding. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8541\u20138551.","DOI":"10.1109\/CVPR52688.2022.00835"},{"key":"2502_CR193","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., & Smola, A. (2016). Stacked attention networks for image question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 21\u201329.","DOI":"10.1109\/CVPR.2016.10"},{"key":"2502_CR194","unstructured":"You, Z., Zhong, Y., Bao, F., Sun, J., Li, C., & Zhu, J. (2024). Diffusion models and semi-supervised learners benefit mutually with few labels. Advances in Neural Information Processing Systems, 36."},{"key":"2502_CR195","first-page":"5824","volume":"33","author":"T Yu","year":"2020","unstructured":"Yu, T., Kumar, S., Gupta, A., Levine, S., Hausman, K., & Finn, C. (2020). Gradient surgery for multi-task learning. NeurIPS, 33, 5824\u20135836.","journal-title":"Gradient surgery for multi-task learning. NeurIPS"},{"key":"2502_CR196","unstructured":"Yu, W., Yang, Z., Li, L., Wang, J., Lin, K., Liu, Z., Wang, X., & Wang, L. (2023). Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490"},{"key":"2502_CR197","unstructured":"Yuan, L., Chen, D., Chen, Y.-L., Codella, N., Dai, X., Gao, J., Hu, H., Huang, X., Li, B., & Li, C. (2021). Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432"},{"key":"2502_CR198","doi-asserted-by":"crossref","unstructured":"Yue, X., Ni, Y., Zhang, K., Zheng, T., Liu, R., Zhang, G., Stevens, S., Jiang, D., Ren, W., Sun, Y., Wei C. (2024). Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9556\u20139567.","DOI":"10.1109\/CVPR52733.2024.00913"},{"key":"2502_CR199","doi-asserted-by":"crossref","unstructured":"Zareian, A., Rosa, K. D., Hu, D. H., & Chang, S.-F. (2021). Open-vocabulary object detection using captions. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14393\u201314402.","DOI":"10.1109\/CVPR46437.2021.01416"},{"key":"2502_CR200","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., & Agrawala, M. (2023a). Adding conditional control to text-to-image diffusion models. In ICCV.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2502_CR201","unstructured":"Zhang, R., Han, J., Liu, C., Zhou, A., Lu, P., Qiao, Y., Li, H., & Gao, P. (2024a). Llama-adapter: Efficient fine-tuning of large language models with zero-initialized attention. In ICLR."},{"key":"2502_CR202","unstructured":"Zhang, Y.-F., Zhang, H., Tian, H., Fu, C., Zhang, S., Wu, J., Li, F., Wang, K., Wen, Q., Zhang, Z., & Wang, L. (2024b). Mme-realworld: Could your multimodal llm challenge high-resolution real-world scenarios that are difficult for humans? arXiv preprint arXiv:2408.13257"},{"key":"2502_CR203","unstructured":"Zhang, Y., Gong, K., Zhang, K., Li, H., Qiao, Y., Ouyang, W., & Yue, X. (2023b). Meta-transformer: A unified framework for multimodal learning. arXiv preprint arXiv:2307.10802."},{"key":"2502_CR204","unstructured":"Zhang, Y., Gong, K., Zhang, K., Li, H., Qiao, Y., Ouyang, W., & Yue, X. (2023c). Meta-transformer: A unified framework for multimodal learning. arXiv preprint arXiv:2307.10802."},{"issue":"12","key":"2502_CR205","doi-asserted-by":"publisher","first-page":"5586","DOI":"10.1109\/TKDE.2021.3070203","volume":"34","author":"Y Zhang","year":"2021","unstructured":"Zhang, Y., & Yang, Q. (2021). A survey on multi-task learning. IEEE Transactions on Knowledge and Data Engineering, 34(12), 5586\u20135609.","journal-title":"IEEE Transactions on Knowledge and Data Engineering"},{"key":"2502_CR206","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Luo, P., Loy, C. C., & Tang, X. (2014). Facial landmark detection by deep multi-task learning. In Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part VI 13, pp. 94\u2013108. Springer.","DOI":"10.1007\/978-3-319-10599-4_7"},{"key":"2502_CR207","first-page":"27196","volume":"34","author":"Z Zhang","year":"2021","unstructured":"Zhang, Z., Ma, J., Zhou, C., Men, R., Li, Z., Ding, M., Tang, J., Zhou, J., & Yang, H. (2021). Ufc-bert: Unifying multi-modal controls for conditional image synthesis. Advances in Neural Information Processing Systems, 34, 27196\u201327208.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR208","doi-asserted-by":"crossref","unstructured":"Zhao, W., Rao, Y., Liu, Z., Liu, B., Zhou, J., & Lu, J. (2023). Unleashing text-to-image diffusion models for visual perception. arXiv preprint arXiv:2303.02153.","DOI":"10.1109\/ICCV51070.2023.00527"},{"key":"2502_CR209","doi-asserted-by":"crossref","unstructured":"Zhao, X., Li, H., Shen, X., Liang, X., & Wu, Y. (2018). A modulation module for multi-task learning with applications in image retrieval. In Proceedings of the European Conference on Computer Vision (ECCV), pp. 401\u2013416.","DOI":"10.1007\/978-3-030-01246-5_25"},{"key":"2502_CR210","unstructured":"Zheng, S., & Charoenphakdee, N. (2022). Diffusion models for missing value imputation in tabular data. arXiv preprint arXiv:2210.17128."},{"key":"2502_CR211","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yang, J., Zhang, P., Li, C., Codella, N., Li, L. H., Zhou, L., Dai, X., Yuan, L., Li, Y., Gao, J. (2022). Regionclip: Region-based language-image pretraining. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16793\u201316803.","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"2502_CR212","doi-asserted-by":"crossref","unstructured":"Zhong, Y., Yu, L., Bai, Y., Li, S., Yan, X., & Li, Y. (2023). Learning procedure-aware video representation from instructional videos and their narrations. In CVPR.","DOI":"10.1109\/CVPR52729.2023.01424"},{"key":"2502_CR213","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C. C., & Dai, B. (2022a). Extract free dense labels from clip. In European Conference on Computer Vision, pp. 696\u2013712. Springer.","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"2502_CR214","doi-asserted-by":"publisher","first-page":"13041","DOI":"10.1609\/aaai.v34i07.7005","volume":"34","author":"L Zhou","year":"2020","unstructured":"Zhou, L., Palangi, H., Zhang, L., Hu, H., Corso, J., & Gao, J. (2020). Unified vision-language pre-training for image captioning and vqa. In Proceedings of the AAAI conference on artificial intelligence, 34, 13041\u201313049.","journal-title":"In Proceedings of the AAAI conference on artificial intelligence"},{"key":"2502_CR215","doi-asserted-by":"crossref","unstructured":"Zhou, X., Girdhar, R., Joulin, A., Kr\u00e4henb\u00fchl, P., & Misra, I. (2022b). Detecting twenty-thousand classes using image-level supervision. In European Conference on Computer Vision, pp. 350\u2013368. Springer.","DOI":"10.1007\/978-3-031-20077-9_21"},{"key":"2502_CR216","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., & Elhoseiny, M. (2023). Minigpt-4: Enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592"},{"key":"2502_CR217","first-page":"2664","volume":"35","author":"J Zhu","year":"2022","unstructured":"Zhu, J., Zhu, X., Wang, W., Wang, X., Li, H., Wang, X., & Dai, J. (2022). Uni-perceiver-moe: Learning sparse generalist models with conditional moes. Advances in Neural Information Processing Systems, 35, 2664\u20132678.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2502_CR218","doi-asserted-by":"crossref","unstructured":"Zhu, X., Zhu, J., Li, H., Wu, X., Li, H., Wang, X., & Dai, J. (2022b). Uni-perceiver: Pre-training unified architecture for generic perception for zero-shot and few-shot tasks. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16804\u201316815.","DOI":"10.1109\/CVPR52688.2022.01630"},{"key":"2502_CR219","doi-asserted-by":"crossref","unstructured":"Zhu, X., Zhu, J., Li, H., Wu, X., Li, H., Wang, X., & Dai, J. (2022c). Uni-perceiver: Pre-training unified architecture for generic perception for zero-shot and few-shot tasks. In CVPR.","DOI":"10.1109\/CVPR52688.2022.01630"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02502-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02502-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02502-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T08:54:10Z","timestamp":1760086450000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02502-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,18]]},"references-count":219,"journal-issue":{"issue":"10","published-print":{"date-parts":[[2025,10]]}},"alternative-id":["2502"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02502-7","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"type":"print","value":"0920-5691"},{"type":"electronic","value":"1573-1405"}],"subject":[],"published":{"date-parts":[[2025,6,18]]},"assertion":[{"value":"8 January 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"9 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 June 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}