{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T23:10:04Z","timestamp":1751411404787,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":49,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819666010","type":"print"},{"value":"9789819665990","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-6599-0_4","type":"book-chapter","created":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T22:32:27Z","timestamp":1751409147000},"page":"45-59","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Open-Vocabulary Self-interactive Semantic Segmentation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4744-5798","authenticated-orcid":false,"given":"Xinshuang","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-0485-1206","authenticated-orcid":false,"given":"Yue","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,7,2]]},"reference":[{"key":"4_CR1","unstructured":"Bucher, M., Vu, T., Cord, M., P\u00e9rez, P.: Zero-shot semantic segmentation. In: NeurIPS, pp. 466\u2013477 (2019)"},{"key":"4_CR2","doi-asserted-by":"crossref","unstructured":"Chen, P., Xiao, Q., Xu, J., Dong, X., Sun, L.: Facial attribute editing using semantic segmentation. In: 2019 International Conference on High Performance Big Data and Intelligent Systems (HPBD &IS), pp. 97\u2013103. IEEE (2019)","DOI":"10.1109\/HPBDIS.2019.8735455"},{"key":"4_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Z., Xu, Q., Cong, R., Huang, Q.: Global context-aware progressive aggregation network for salient object detection. In: AAAI, pp. 10599\u201310606. AAAI Press (2020)","DOI":"10.1609\/aaai.v34i07.6633"},{"key":"4_CR4","unstructured":"Dosovitskiy, A., et al.: An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. In: ICLR. OpenReview.net (2021)"},{"issue":"1","key":"4_CR5","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/s11263-014-0733-5","volume":"111","author":"M Everingham","year":"2015","unstructured":"Everingham, M., Eslami, S., Gool, L.V., Williams, C., Winn, J.M., Zisserman, A.: The pascal visual object classes challenge: a retrospective. Int. J. Comput. Vis. 111(1), 98\u2013136 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"4_CR6","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., Gu, X., Cui, Y., Lin, T.: Scaling open-vocabulary image segmentation with image-level labels. In: ECCV. Lecture Notes in Computer Science, vol. 13696, pp. 540\u2013557. Springer (2022)","DOI":"10.1007\/978-3-031-20059-5_31"},{"key":"4_CR7","doi-asserted-by":"publisher","unstructured":"Hofmarcher, M., Unterthiner, T., Arjona-Medina, J., Klambauer, G., Hochreiter, S., Nessler, B.: Visual scene understanding for autonomous driving using semantic segmentation. In: Samek, W., Montavon, G., Vedaldi, A., Hansen, L.K., M\u00fcller, K.-R. (eds.) Explainable AI: Interpreting, Explaining and Visualizing Deep Learning. LNCS (LNAI), vol. 11700, pp. 285\u2013296. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-28954-6_15","DOI":"10.1007\/978-3-030-28954-6_15"},{"key":"4_CR8","doi-asserted-by":"crossref","unstructured":"Huang, Y., et al.: Interformer real-time interactive image segmentation. In: ICCV, pp. 22244\u201322254. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.02038"},{"key":"4_CR9","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything. In: ICCV, pp. 3992\u20134003. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"4_CR10","unstructured":"Li, B., Weinberger, K.Q., Belongie, S.J., Koltun, V., Ranftl, R.: Language-driven semantic segmentation. In: ICLR. OpenReview.net (2022)"},{"key":"4_CR11","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted CLIP. In: CVPR, pp. 7061\u20137070. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"4_CR12","doi-asserted-by":"publisher","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"4_CR13","doi-asserted-by":"crossref","unstructured":"Liu, N., Zhang, N., Wan, K., Shao, L., Han, J.: Visual saliency transformer. In: ICCV, pp. 4702\u20134712. IEEE (2021)","DOI":"10.1109\/ICCV48922.2021.00468"},{"key":"4_CR14","doi-asserted-by":"crossref","unstructured":"Liu, Q., Xu, Z., Bertasius, G., Niethammer, M.: Simpleclick: interactive image segmentation with simple vision transformers. In: ICCV, pp. 22233\u201322243. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.02037"},{"key":"4_CR15","doi-asserted-by":"crossref","unstructured":"Liu, W., Zhang, C., Lin, G., Liu, F.: CRNet: cross-reference networks for few-shot segmentation. In: CVPR, pp. 4164\u20134172. Computer Vision Foundation \/ IEEE (2020)","DOI":"10.1109\/CVPR42600.2020.00422"},{"key":"4_CR16","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhang, Y., Tong, X.: V-hands: touchscreen-based hand tracking for remote whiteboard interaction. arXiv:2409.13347 (2024)","DOI":"10.1145\/3654777.3676412"},{"key":"4_CR17","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhao, Y.: Differentiable largest connected component layer for image matting. In: Artificial Neural Networks and Machine Learning \u2013 ICANN 2024, pp. 419\u2013431. Springer Nature Switzerland, Cham (2024)","DOI":"10.1007\/978-3-031-72332-2_27"},{"key":"4_CR18","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhao, Y.: Image matting based on deep equilibrium models. In: Artificial Neural Networks and Machine Learning \u2013 ICANN 2024, pp. 379\u2013391. Springer Nature Switzerland, Cham (2024)","DOI":"10.1007\/978-3-031-72335-3_26"},{"key":"4_CR19","doi-asserted-by":"crossref","unstructured":"Liu, X., Zhao, Y.: Towards generalizable and interpretable AI-modified image detectors. In: Artificial Neural Networks and Machine Learning \u2013 ICANN 2024, pp. 246\u2013257. Springer Nature Switzerland, Cham (2024)","DOI":"10.1007\/978-3-031-72341-4_17"},{"key":"4_CR20","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2015, Boston, MA, USA, June 7\u201312, 2015, pp. 3431\u20133440. IEEE Computer Society (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"4_CR21","doi-asserted-by":"crossref","unstructured":"Ma, M., Xia, C., Li, J.: Pyramidal feature shrinking for salient object detection. In: AAAI, pp. 2311\u20132318. AAAI Press (2021)","DOI":"10.1609\/aaai.v35i3.16331"},{"key":"4_CR22","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: NeurIPS, pp. 3111\u20133119 (2013)"},{"key":"4_CR23","doi-asserted-by":"crossref","unstructured":"Ojha, U., Li, Y., Lee, Y.J.: Towards universal fake image detectors that generalize across generative models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24480\u201324489. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.02345"},{"key":"4_CR24","doi-asserted-by":"crossref","unstructured":"Pang, Y., Zhao, X., Zhang, L., Lu, H.: Multi-scale interactive network for salient object detection. In: CVPR, pp. 9410\u20139419. Computer Vision Foundation \/ IEEE (2020)","DOI":"10.1109\/CVPR42600.2020.00943"},{"key":"4_CR25","doi-asserted-by":"crossref","unstructured":"Peng, B., et al.: Hierarchical dense correlation distillation for few-shot segmentation. In: CVPR, pp. 23641\u201323651. IEEE (2023)","DOI":"10.1109\/CVPR52729.2023.02264"},{"key":"4_CR26","doi-asserted-by":"publisher","first-page":"107404","DOI":"10.1016\/j.patcog.2020.107404","volume":"106","author":"X Qin","year":"2020","unstructured":"Qin, X., Zhang, Z., Huang, C., Dehghan, M., Za\u00efane, O.R., J\u00e4gersand, M.: U$$ ^{\\text{2 }}$$-net: going deeper with nested u-structure for salient object detection. Pattern Recogn. 106, 107404 (2020)","journal-title":"Pattern Recogn."},{"key":"4_CR27","doi-asserted-by":"crossref","unstructured":"Qiu, F., Yang, Y., Li, H., Fu, M., Wang, S.: Semantic motion segmentation for urban dynamic scene understanding. In: CASE, pp. 497\u2013502. IEEE (2016)","DOI":"10.1109\/COASE.2016.7743446"},{"key":"4_CR28","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML. Proceedings of Machine Learning Research, vol.\u00a0139, pp. 8748\u20138763. PMLR (2021)"},{"key":"4_CR29","doi-asserted-by":"crossref","unstructured":"Shin, C.J., Heo, Y.S.: GAN inversion with semantic segmentation map for image editing. In: ICTC, pp. 927\u2013931. IEEE (2022)","DOI":"10.1109\/ICTC55196.2022.9952548"},{"issue":"2","key":"4_CR30","doi-asserted-by":"publisher","first-page":"3870","DOI":"10.1109\/LRA.2021.3066956","volume":"6","author":"Y Sun","year":"2021","unstructured":"Sun, Y., Pan, B., Fu, Y.: Lightweight deep neural network for real-time instrument semantic segmentation in robot assisted minimally invasive surgery. IEEE Robot. Autom. Lett. 6(2), 3870\u20133877 (2021)","journal-title":"IEEE Robot. Autom. Lett."},{"issue":"2","key":"4_CR31","first-page":"69","volume":"10","author":"H Thily","year":"2010","unstructured":"Thily, H.: Using a digital presenter as a mixed solution to teaching and training. Teach. Engl. Technol. 10(2), 69\u201378 (2010)","journal-title":"Teach. Engl. Technol."},{"key":"4_CR32","doi-asserted-by":"crossref","unstructured":"Wang, K., Liew, J.H., Zou, Y., Zhou, D., Feng, J.: Panet: few-shot image semantic segmentation with prototype alignment. In: ICCV, pp. 9196\u20139205. IEEE (2019)","DOI":"10.1109\/ICCV.2019.00929"},{"key":"4_CR33","doi-asserted-by":"crossref","unstructured":"Wang, L., et al.: Learning to detect salient objects with image-level supervision. In: CVPR, pp. 3796\u20133805. IEEE Computer Society (2017)","DOI":"10.1109\/CVPR.2017.404"},{"issue":"2","key":"4_CR34","doi-asserted-by":"publisher","first-page":"923","DOI":"10.1109\/TIP.2017.2768621","volume":"27","author":"Z Wang","year":"2018","unstructured":"Wang, Z., Wei, L., Wang, L., Gao, Y., Chen, W., Shen, D.: Hierarchical vertex regression-based segmentation of head and neck CT images for radiotherapy planning. IEEE Trans. Image Process. 27(2), 923\u2013937 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"4_CR35","doi-asserted-by":"crossref","unstructured":"Wei, J., Wang, S., Huang, Q.: F$${^3}$$net: fusion, feedback and focus for salient object detection. In: AAAI, pp. 12321\u201312328. AAAI Press (2020)","DOI":"10.1609\/aaai.v34i07.6916"},{"key":"4_CR36","doi-asserted-by":"crossref","unstructured":"Wei, J., Wang, S., Wu, Z., Su, C., Huang, Q., Tian, Q.: Label decoupling framework for salient object detection. In: CVPR, pp. 13022\u201313031. Computer Vision Foundation \/ IEEE (2020)","DOI":"10.1109\/CVPR42600.2020.01304"},{"key":"4_CR37","doi-asserted-by":"publisher","first-page":"6226","DOI":"10.1109\/TIP.2021.3093380","volume":"30","author":"Z Wu","year":"2021","unstructured":"Wu, Z., Su, L., Huang, Q.: Decomposition and completion network for salient object detection. IEEE Trans. Image Process. 30, 6226\u20136239 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"4_CR38","doi-asserted-by":"crossref","unstructured":"Xian, Y., Choudhury, S., He, Y., Schiele, B., Akata, Z.: Semantic projection network for zero- and few-label semantic segmentation. In: CVPR, pp. 8256\u20138265. Computer Vision Foundation \/ IEEE (2019)","DOI":"10.1109\/CVPR.2019.00845"},{"key":"4_CR39","doi-asserted-by":"crossref","unstructured":"Xu, B., Liang, H., Liang, R., Chen, P.: Locate globally, segment locally: a progressive architecture with knowledge review network for salient object detection. In: AAAI, pp. 3004\u20133012. AAAI Press (2021)","DOI":"10.1609\/aaai.v35i4.16408"},{"issue":"12","key":"4_CR40","doi-asserted-by":"publisher","first-page":"15546","DOI":"10.1109\/TPAMI.2023.3311618","volume":"45","author":"M Xu","year":"2023","unstructured":"Xu, M., Zhang, Z., Wei, F., Hu, H., Bai, X.: SAN: side adapter network for open-vocabulary semantic segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 45(12), 15546\u201315561 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"4_CR41","doi-asserted-by":"crossref","unstructured":"Xu, M., et al.: A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model. In: ECCV. Lecture Notes in Computer Science, vol. 13689, pp. 736\u2013753. Springer (2022)","DOI":"10.1007\/978-3-031-19818-2_42"},{"key":"4_CR42","doi-asserted-by":"crossref","unstructured":"Xu, N., Price, B.L., Cohen, S., Huang, T.S.: Deep image matting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 311\u2013320. IEEE Computer Society (2017)","DOI":"10.1109\/CVPR.2017.41"},{"key":"4_CR43","doi-asserted-by":"crossref","unstructured":"Xu, N., Price, B.L., Cohen, S., Yang, J., Huang, T.S.: Deep interactive object selection. In: CVPR, pp. 373\u2013381. IEEE Computer Society (2016)","DOI":"10.1109\/CVPR.2016.47"},{"key":"4_CR44","doi-asserted-by":"crossref","unstructured":"Xu, Q., Zhao, W., Lin, G., Long, C.: Self-calibrated cross attention network for few-shot segmentation. In: ICCV, pp. 655\u2013665. IEEE (2023)","DOI":"10.1109\/ICCV51070.2023.00067"},{"key":"4_CR45","doi-asserted-by":"publisher","first-page":"4667","DOI":"10.1109\/TMM.2023.3325731","volume":"26","author":"YK Yun","year":"2024","unstructured":"Yun, Y.K., Lin, W.: Towards a complete and detail-preserved salient object detection. IEEE Trans. Multim. 26, 4667\u20134680 (2024)","journal-title":"IEEE Trans. Multim."},{"key":"4_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, M., Liu, T., Piao, Y., Yao, S., Lu, H.: Auto-MSFNet: search multi-scale fusion network for salient object detection. In: ACM Multimedia, pp. 667\u2013676. ACM (2021)","DOI":"10.1145\/3474085.3475231"},{"key":"4_CR47","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1007\/978-3-030-58536-5_3","volume-title":"Computer Vision \u2013 ECCV 2020","author":"X Zhao","year":"2020","unstructured":"Zhao, X., Pang, Y., Zhang, L., Lu, H., Zhang, L.: Suppress and balance: a simple gated network for salient object detection. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 35\u201351. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_3"},{"key":"4_CR48","doi-asserted-by":"crossref","unstructured":"Zhou, C., Loy, C.C., Dai, B.: Extract free dense labels from CLIP. In: ECCV. Lecture Notes in Computer Science, vol. 13688, pp. 696\u2013712. Springer (2022)","DOI":"10.1007\/978-3-031-19815-1_40"},{"key":"4_CR49","unstructured":"Zhu, H., Sun, X., Li, Y., Ma, K., Zhou, S.K., Zheng, Y.: DFTR: depth-supervised fusion transformer for salient object detection. arXiv preprint arXiv:2203.06429 (2022)"}],"container-title":["Lecture Notes in Computer Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-6599-0_4","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T22:32:36Z","timestamp":1751409156000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-6599-0_4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819666010","9789819665990"],"references-count":49,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-6599-0_4","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"2 July 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Auckland","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Zealand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"31","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iconip2024.org","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}