{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:46:06Z","timestamp":1742913966972,"version":"3.40.3"},"publisher-location":"Cham","reference-count":46,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031728471"},{"type":"electronic","value":"9783031728488"}],"license":[{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,29]],"date-time":"2024-11-29T00:00:00Z","timestamp":1732838400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72848-8_20","type":"book-chapter","created":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T13:35:14Z","timestamp":1732800914000},"page":"341-356","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["From Pixels to\u00a0Objects: A Hierarchical Approach for\u00a0Part and\u00a0Object Segmentation Using Local and\u00a0Global Aggregation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-8853-1751","authenticated-orcid":false,"given":"Yunfei","family":"Xie","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1243-8045","authenticated-orcid":false,"given":"Cihang","family":"Xie","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5207-9249","authenticated-orcid":false,"given":"Alan","family":"Yuille","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4710-9463","authenticated-orcid":false,"given":"Jieru","family":"Mei","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,29]]},"reference":[{"key":"20_CR1","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: DeepLab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. TPAMI (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: DeepLab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE Trans. Pattern Anal. Mach. Intell. (2017)","DOI":"10.1109\/TPAMI.2017.2699184"},{"key":"20_CR3","unstructured":"Chen, L.C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation. arXiv:1706.05587 (2017)"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Chen, L.C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation (2018)","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Chen, X., Mottaghi, R., Liu, X., Fidler, S., Urtasun, R., Yuille, A.: Detect what you can: detecting and representing objects using holistic models and body parts (2014)","DOI":"10.1109\/CVPR.2014.254"},{"key":"20_CR6","unstructured":"Cheng, B., Schwing, A.G., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation (2021)"},{"key":"20_CR7","unstructured":"Dosovitskiy, A., et al.: An image is worth $$16 \\times 16$$ words: transformers for image recognition at scale (2021)"},{"key":"20_CR8","unstructured":"Du, X., Zoph, B., Hung, W.C., Lin, T.Y.: Simple training strategies and model scaling for object detection (2021)"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Everingham, M., Gool, L.V., Williams, C.K.I., Winn, J.M., Zisserman, A.: The pascal visual object classes (VOC) challenge. Int. J. Comput. Vis. 88, 303\u2013338 (2010). https:\/\/api.semanticscholar.org\/CorpusID:4246903","DOI":"10.1007\/s11263-009-0275-4"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., et al.: Simple copy-paste is a strong data augmentation method for instance segmentation (2021)","DOI":"10.1109\/CVPR46437.2021.00294"},{"key":"20_CR11","unstructured":"Hatamizadeh, A., Yin, H., Heinrich, G., Kautz, J., Molchanov, P.: Global context vision transformers (2023)"},{"key":"20_CR12","doi-asserted-by":"crossref","unstructured":"He, J., Chen, J., Lin, M.X., Yu, Q., Yuille, A.: Compositor: bottom-up clustering and compositing for robust part and object segmentation (2023)","DOI":"10.1109\/CVPR52729.2023.01083"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"He, J., et al.: Partimagenet: a large, high-quality dataset of parts (2022)","DOI":"10.1007\/978-3-031-20074-8_8"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition (2015)","DOI":"10.1109\/CVPR.2016.90"},{"key":"20_CR15","unstructured":"Howard, A.G., et al.: Mobilenets: efficient convolutional neural networks for mobile vision applications. CoRR abs\/1704.04861 (2017)"},{"key":"20_CR16","unstructured":"Huang, H., Zhou, X., Cao, J., He, R., Tan, T.: Vision transformer with super token sampling. In: CVPR (2023)"},{"key":"20_CR17","doi-asserted-by":"crossref","unstructured":"Jampani, V., Sun, D., Liu, M., Yang, M., Kautz, J.: Superpixel sampling networks. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01234-2_22"},{"key":"20_CR18","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems (2012)"},{"issue":"6266","key":"20_CR19","doi-asserted-by":"publisher","first-page":"1332","DOI":"10.1126\/science.aab3050","volume":"350","author":"BM Lake","year":"2015","unstructured":"Lake, B.M., Salakhutdinov, R., Tenenbaum, J.B.: Human-level concept learning through probabilistic program induction. Science 350(6266), 1332\u20131338 (2015)","journal-title":"Science"},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"20_CR21","unstructured":"Locatello, F., et al.: Object-centric learning with slot attention. In: Advances in Neural Information Processing Systems, vol. 33, pp. 11525\u201311538 (2020)"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"20_CR23","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization (2019)"},{"key":"20_CR24","unstructured":"Ma, X., et al.: Image as set of points. In: ICLR (2023)"},{"key":"20_CR25","unstructured":"Mei, J., Chen, L., Yuille, A.L., Xie, C.: SPFormer: enhancing vision transformer with superpixel representation. CoRR abs\/2401.02931 (2024)"},{"key":"20_CR26","unstructured":"Mei, J., et al.: Atomnas: fine-grained end-to-end neural architecture search. In: ICLR (2020)"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Michieli, U., Borsato, E., Rossi, L., Zanuttigh, P.: GMNet: graph matching network for large scale part semantic segmentation in the wild (2020)","DOI":"10.1007\/978-3-030-58598-3_24"},{"key":"20_CR28","doi-asserted-by":"crossref","unstructured":"Peng, J., He, J., Kaushik, P., Xiao, Z., Mu, J., Yuille, A.: Learning part segmentation from synthetic animals. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 90\u2013101 (2024)","DOI":"10.1109\/WACVW60836.2024.00015"},{"key":"20_CR29","unstructured":"Peng, J., et al.: DSPart: a large-scale diffusion-generated synthetic dataset with annotations from 3D parts (2024)"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A.G., Zhu, M., Zhmoginov, A., Chen, L.: Mobilenetv2: inverted residuals and linear bottlenecks. In: CVPR, pp. 4510\u20134520 (2018)","DOI":"10.1109\/CVPR.2018.00474"},{"key":"20_CR31","doi-asserted-by":"crossref","unstructured":"Singh, R., Gupta, P., Shenoy, P., Sarvadevabhatla, R.: Float: factorized learning of object attributes for improved multi-object multi-part scene parsing (2022)","DOI":"10.1109\/CVPR52688.2022.00150"},{"issue":"2","key":"20_CR32","doi-asserted-by":"publisher","first-page":"509","DOI":"10.1007\/s10115-016-1015-z","volume":"52","author":"P Tang","year":"2017","unstructured":"Tang, P., Zhang, J., Wang, X., Feng, B., Roli, F., Liu, W.: Learning extremely shared middle-level image representation for scene classification. Knowl. Inf. Syst. 52(2), 509\u2013530 (2017)","journal-title":"Knowl. Inf. Syst."},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers (2021)","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Wang, P., Shen, X., Lin, Z., Cohen, S., Price, B., Yuille, A.: Joint object and part segmentation using deep learned potentials (2015)","DOI":"10.1109\/ICCV.2015.184"},{"key":"20_CR35","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: SegFormer: simple and efficient design for semantic segmentation with transformers (2021)"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.: GroupViT: semantic segmentation emerges from text supervision (2022)","DOI":"10.1109\/CVPR52688.2022.01760"},{"key":"20_CR37","unstructured":"Yang, C., Xu, J., Mello, S.D., Crowley, E.J., Wang, X.: GPViT: a high resolution non-hierarchical vision transformer with group propagation (2023)"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Yang, F., Sun, Q., Jin, H., Zhou, Z.: Superpixel segmentation with fully convolutional networks. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01398"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Yu, Q., et al.: CMT-deeplab: clustering mask transformers for panoptic segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00259"},{"key":"20_CR40","unstructured":"Yu, Q., et al.: kmax-deeplab: k-means mask transformer (2023)"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Yu, Q., et al.: k-means Mask Transformer. In: ECCV (2022)","DOI":"10.1007\/978-3-031-19818-2_17"},{"key":"20_CR42","unstructured":"Zhang, B., et al.: SegViT: semantic segmentation with plain vision transformers (2022)"},{"key":"20_CR43","unstructured":"Zhang, T., Yu, Q., Yuille, A., He, J.: Dictionary-based framework for interpretable and consistent object parsing (2024)"},{"key":"20_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Pang, B., Lu, C.: Semantic segmentation by early region proxy. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00132"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Zhao, Y., Li, J., Zhang, Y., Tian, Y.: Multi-class part parsing with joint boundary-semantic awareness. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9177\u20139186 (2019)","DOI":"10.1109\/ICCV.2019.00927"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Zhu, A.Z., et al.: Superpixel transformers for efficient semantic segmentation. IROS (2023)","DOI":"10.1109\/IROS55552.2023.10341519"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72848-8_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T14:10:39Z","timestamp":1732803039000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72848-8_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,29]]},"ISBN":["9783031728471","9783031728488"],"references-count":46,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72848-8_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,29]]},"assertion":[{"value":"29 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}