{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:47:06Z","timestamp":1778082426986,"version":"3.51.4"},"publisher-location":"Cham","reference-count":61,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031727535","type":"print"},{"value":"9783031727542","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72754-2_5","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"74-91","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":28,"title":["PSALM: Pixelwise SegmentAtion with\u00a0Large Multi-modal Model"],"prefix":"10.1007","author":[{"given":"Zheng","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Yeyao","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Enming","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"5_CR1","first-page":"23716","volume":"35","author":"JB Alayrac","year":"2022","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. Adv. Neural. Inf. Process. Syst. 35, 23716\u201323736 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR2","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: Coco-stuff: thing and stuff classes in context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1209\u20131218 (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"5_CR4","unstructured":"Chen, K., Zhang, Z., Zeng, W., Zhang, R., Zhu, F., Zhao, R.: Shikra: unleashing multimodal llm\u2019s referential dialogue magic. arXiv preprint arXiv:2306.15195 (2023)"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"Chen, X., Mottaghi, R., Liu, X., Fidler, S., Urtasun, R., Yuille, A.: Detect what you can: detecting and representing objects using holistic models and body parts. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1971\u20131978 (2014)","DOI":"10.1109\/CVPR.2014.254"},{"key":"5_CR6","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1290\u20131299 (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"5_CR7","unstructured":"Chiang, W.L., et\u00a0al.: Vicuna: an open-source chatbot impressing gpt-4 with 90%* chatgpt quality (2023). See https:\/\/vicuna.lmsys.org. Accessed 14 April 2023"},{"key":"5_CR8","doi-asserted-by":"crossref","unstructured":"Ding, H., Liu, C., Wang, S., Jiang, X.: Vision-language transformer and query generation for referring segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16321\u201316330 (2021)","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"5_CR9","unstructured":"Ding, Z., Wang, J., Tu, Z.: Open-vocabulary universal image segmentation with maskclip (2023)"},{"key":"5_CR10","unstructured":"Dong, R., et\u00a0al.: Dreamllm: Synergistic multimodal comprehension and creation. arXiv preprint arXiv:2309.11499 (2023)"},{"key":"5_CR11","unstructured":"Everingham, M., Van\u00a0Gool, L., Williams, C.K.I., Winn, J., Zisserman, A.: The PASCAL Visual Object Classes Challenge 2010 (VOC2010) Results. http:\/\/www.pascal-network.org\/challenges\/VOC\/voc2010\/workshop\/index.html"},{"key":"5_CR12","unstructured":"Grauman, K., et\u00a0al.: Ego-exo4d: Understanding skilled human activity from first-and third-person perspectives. arXiv preprint arXiv:2311.18259 (2023)"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Gupta, A., Dollar, P., Girshick, R.: Lvis: A dataset for large vocabulary instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5356\u20135364 (2019)","DOI":"10.1109\/CVPR.2019.00550"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Jain, J., Li, J., Chiu, M.T., Hassani, A., Orlov, N., Shi, H.: Oneformer: One transformer to rule universal image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2989\u20132998 (2023)","DOI":"10.1109\/CVPR52729.2023.00292"},{"key":"5_CR15","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. arXiv preprint arXiv:2304.02643 (2023)"},{"key":"5_CR16","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vision"},{"key":"5_CR17","doi-asserted-by":"crossref","unstructured":"Lai, X., et al.: Lisa: reasoning segmentation via large language model. arXiv preprint arXiv:2308.00692 (2023)","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"5_CR18","unstructured":"Li, B., Zhang, P., Yang, J., Zhang, Y., Pu, F., Liu, Z.: Otterhd: a high-resolution multi-modality model. arXiv preprint arXiv:2311.04219 (2023)"},{"key":"5_CR19","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"5_CR20","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning, pp. 12888\u201312900. PMLR (2022)"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Omg-seg: Is one model good enough for all segmentation? arXiv preprint arXiv:2401.10229 (2024)","DOI":"10.1109\/CVPR52733.2024.02640"},{"key":"5_CR22","unstructured":"Li, Y., Bubeck, S., Eldan, R., Del\u00a0Giorno, A., Gunasekar, S., Lee, Y.T.: Textbooks are all you need ii: phi-1.5 technical report. arXiv preprint arXiv:2309.05463 (2023)"},{"key":"5_CR23","doi-asserted-by":"crossref","unstructured":"Li, Z., et al.: Monkey: image resolution and text label are important things for large multi-modal models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 26763\u201326773 (2024)","DOI":"10.1109\/CVPR52733.2024.02527"},{"key":"5_CR24","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"5_CR25","doi-asserted-by":"crossref","unstructured":"Liu, C., Ding, H., Jiang, X.: Gres: generalized referring expression segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23592\u201323601 (2023)","DOI":"10.1109\/CVPR52729.2023.02259"},{"key":"5_CR26","unstructured":"Liu, H., et al.: Llava-next: improved reasoning, ocr, and world knowledge (2024)"},{"key":"5_CR27","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Advances in neural information processing systems 36 (2024)"},{"key":"5_CR28","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"5_CR29","doi-asserted-by":"crossref","unstructured":"Miao, J., et al.: Large-scale video panoptic segmentation in the wild: A benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21033\u201321043 (2022)","DOI":"10.1109\/CVPR52688.2022.02036"},{"key":"5_CR30","doi-asserted-by":"crossref","unstructured":"Mottaghi, R., et al.: The role of context for object detection and semantic segmentation in the wild. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.119"},{"key":"5_CR31","doi-asserted-by":"crossref","unstructured":"Nagaraja, V.K., Morariu, V.I., Davis, L.S.: Modeling context between objects for referring expression understanding. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14. pp. 792\u2013807. Springer (2016)","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"5_CR32","unstructured":"OpenAI: Gpt-4v(ision) system card (2023)"},{"key":"5_CR33","unstructured":"Peng, Z., Wang, W., Dong, L., Hao, Y., Huang, S., Ma, S., Wei, F.: Kosmos-2: grounding multimodal large language models to the world. arXiv preprint arXiv:2306.14824 (2023)"},{"key":"5_CR34","doi-asserted-by":"crossref","unstructured":"Pi, R., Yao, L., Gao, J., Zhang, J., Zhang, T.: Perceptiongpt: effectively fusing visual perception into llm. arXiv preprint arXiv:2311.06612 (2023)","DOI":"10.1109\/CVPR52733.2024.02561"},{"key":"5_CR35","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"5_CR36","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van\u00a0Gool, L.: The 2017 davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675 (2017)"},{"key":"5_CR37","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International conference on machine learning. pp. 8748\u20138763. PMLR (2021)"},{"key":"5_CR38","doi-asserted-by":"crossref","unstructured":"Ramanathan, V., et\u00a0al.: Paco: Parts and attributes of common objects. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7141\u20137151 (2023)","DOI":"10.1109\/CVPR52729.2023.00690"},{"key":"5_CR39","doi-asserted-by":"crossref","unstructured":"Rasheed, H., Maaz, M., Shaji, S., Shaker, A., Khan, S., Cholakkal, H., Anwer, R.M., Xing, E., Yang, M.H., Khan, F.S.: Glamm: Pixel grounding large multimodal model. arXiv preprint arXiv:2311.03356 (2023)","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"5_CR40","doi-asserted-by":"crossref","unstructured":"Ren, Z., Huang, Z., Wei, Y., Zhao, Y., Fu, D., Feng, J., Jin, X.: Pixellm: Pixel reasoning with large multimodal model. arXiv preprint arXiv:2312.02228 (2023)","DOI":"10.1109\/CVPR52733.2024.02491"},{"key":"5_CR41","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"5_CR42","doi-asserted-by":"crossref","unstructured":"Sudre, C.H., Li, W., Vercauteren, T., Ourselin, S., Jorge\u00a0Cardoso, M.: Generalised dice overlap as a deep learning loss function for highly unbalanced segmentations. In: Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support: Third International Workshop, DLMIA 2017, and 7th International Workshop, ML-CDS 2017, Held in Conjunction with MICCAI 2017, Qu\u00e9bec City, QC, Canada, September 14, Proceedings 3, pp. 240\u2013248. Springer (2017)","DOI":"10.1007\/978-3-319-67558-9_28"},{"key":"5_CR43","unstructured":"Sun, Q., et al: Generative pretraining in multimodality. arXiv preprint arXiv:2307.05222 (2023)"},{"key":"5_CR44","unstructured":"Team, G., et\u00a0al.: Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805 (2023)"},{"key":"5_CR45","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"5_CR46","unstructured":"Wang, W., et\u00a0al.: Cogvlm: visual expert for pretrained language models. arXiv preprint arXiv:2311.03079 (2023)"},{"key":"5_CR47","doi-asserted-by":"crossref","unstructured":"Wu, J., Jiang, Y., Liu, Q., Yuan, Z., Bai, X., Bai, S.: General object foundation model for images and videos at scale. arXiv preprint arXiv:2312.09158 (2023)","DOI":"10.1109\/CVPR52733.2024.00363"},{"key":"5_CR48","doi-asserted-by":"crossref","unstructured":"Xia, Z., Han, D., Han, Y., Pan, X., Song, S., Huang, G.: Gsva: Generalized segmentation via multimodal large language models. arXiv preprint arXiv:2312.10103 (2023)","DOI":"10.1109\/CVPR52733.2024.00370"},{"key":"5_CR49","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De\u00a0Mello, S.: Open-vocabulary panoptic segmentation with text-to-image diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2955\u20132966 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"5_CR50","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: u-llava: unifying multi-modal tasks via large language model. arXiv preprint arXiv:2311.05348 (2023)","DOI":"10.3233\/FAIA240541"},{"key":"5_CR51","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: Side adapter network for open-vocabulary semantic segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2945\u20132954 (2023)","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"5_CR52","doi-asserted-by":"crossref","unstructured":"Xu, M., Zhang, Z., Wei, F., Lin, Y., Cao, Y., Hu, H., Bai, X.: A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. In: European Conference on Computer Vision, pp. 736\u2013753. Springer (2022)","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"5_CR53","doi-asserted-by":"crossref","unstructured":"Yan, B., et al.: Universal instance perception as object discovery and retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15325\u201315336 (2023)","DOI":"10.1109\/CVPR52729.2023.01471"},{"key":"5_CR54","unstructured":"Yang, S., Qu, T., Lai, X., Tian, Z., Peng, B., Liu, S., Jia, J.: An improved baseline for reasoning segmentation with large language model. arXiv preprint arXiv:2312.17240 (2023)"},{"key":"5_CR55","unstructured":"You, H., et al.: Ferret: refer and ground anything anywhere at any granularity. arXiv preprint arXiv:2310.07704 (2023)"},{"key":"5_CR56","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: Mattnet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"5_CR57","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11-14, 2016, Proceedings, Part II 14. pp. 69\u201385. Springer (2016)","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"5_CR58","unstructured":"Zhang, A., Zhao, L., Xie, C.W., Zheng, Y., Ji, W., Chua, T.S.: Next-chat: an lmm for chat, detection and segmentation. arXiv preprint arXiv:2311.04498 (2023)"},{"key":"5_CR59","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1007\/s11263-018-1140-0","volume":"127","author":"B Zhou","year":"2019","unstructured":"Zhou, B., Zhao, H., Puig, X., Xiao, T., Fidler, S., Barriuso, A., Torralba, A.: Semantic understanding of scenes through the ade20k dataset. Int. J. Comput. Vision 127, 302\u2013321 (2019)","journal-title":"Int. J. Comput. Vision"},{"key":"5_CR60","doi-asserted-by":"crossref","unstructured":"Zou, X., Dou, Z.Y., Yang, J., Gan, Z., Li, L., Li, C., Dai, X., Behl, H., Wang, J., Yuan, L., et\u00a0al.: Generalized decoding for pixel, image, and language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023)","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"5_CR61","unstructured":"Zou, X., et al.: Segment everything everywhere all at once. Advances in Neural Information Processing Systems 36 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72754-2_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:00:45Z","timestamp":1730300445000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72754-2_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031727535","9783031727542"],"references-count":61,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72754-2_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}