{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T21:25:13Z","timestamp":1742937913330,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":39,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819755967"},{"type":"electronic","value":"9789819755974"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-5597-4_17","type":"book-chapter","created":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T09:10:06Z","timestamp":1722503406000},"page":"193-206","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["SMVT: Spectrum-Driven Multi-scale Vision Transformer for Referring Image Segmentation"],"prefix":"10.1007","author":[{"given":"Tianxiao","family":"Li","sequence":"first","affiliation":[]},{"given":"Junhong","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Yiheng","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Kesi","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Qiqiang","family":"Xia","sequence":"additional","affiliation":[]},{"given":"Muhammad","family":"Asim","sequence":"additional","affiliation":[]},{"given":"Wenyin","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,8,2]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., Darrell, T.: Segmentation from natural language expressions. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, 11\u201314 October 2016, Proceedings, Part I 14, pp. 108-124. Springer (2016)","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Ding, H., Liu, C., Wang, S., et al.: Vision-language transformer and query generation for referring segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 16321\u201316330 (2021)","DOI":"10.1109\/ICCV48922.2021.01601"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Yang, Z., Wang, J., Tang, Y., et al. Lavt: language-aware vision transformer for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18155\u201318165 (2022)","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Li, R., Li, K., Kuo, Y.C., et al.: Referring image segmentation via recurrent refinement networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5745\u20135753 (2018)","DOI":"10.1109\/CVPR.2018.00602"},{"key":"17_CR5","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"17_CR6","unstructured":"Patro, B.N., Namboodiri, V.P., Agneeswaran, V.S.: SpectFormer: frequency and attention is what you need in a vision transformer. arXiv preprint arXiv:2304.06446 (2023)"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., et al.: Modeling context in referring expressions. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, 11\u201314 October 2016, Proceedings, Part II 14, pp. 69\u201385. Springer (2016)","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Nagaraja, V.K., Morariu, V.I., Davis, L.S.: Modeling context between objects for referring expression understanding. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, 11\u201314 October 2016, Proceedings, Part IV 14, pp. 792\u2013807. Springer (2016)","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., et al.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"17_CR10","unstructured":"Devlin, J., Chang, M.W., Lee, K., et al.: Bert: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"17_CR11","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Wang, Z., Lu, Y., et al. Cris: clip-driven referring image segmentation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 11686\u201311695 (2022)","DOI":"10.1109\/CVPR52688.2022.01139"},{"key":"17_CR14","unstructured":"Radford, A., Kim, J.W., et al.: Learning transferable visual models from natural language supervision. International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Ding, H., Zhang, S., Wu, Q., et al.: Bilateral knowledge interaction network for referring image segmentation. IEEE Trans. Multim. 26, 2966\u20132977 (2023)","DOI":"10.1109\/TMM.2023.3305869"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Lee-Thorp, J., Ainslie, J., Eckstein, I., et al.: Fnet: mixing tokens with Fourier transforms. arXiv preprint arXiv:2105.03824 (2021)","DOI":"10.18653\/v1\/2022.naacl-main.319"},{"key":"17_CR17","first-page":"29319","volume":"35","author":"T Nguyen","year":"2022","unstructured":"Nguyen, T., Pham, M., et al.: Fourierformer: transformer meets generalized Fourier integral theorem. Adv. Neural. Inf. Process. Syst. 35, 29319\u201329335 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR18","doi-asserted-by":"crossref","unstructured":"Feng, G., Hu, Z., Zhang, L., et al.: Encoder fusion network with co-attention embedding for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15506\u201315515 (2021)","DOI":"10.1109\/CVPR46437.2021.01525"},{"key":"17_CR19","unstructured":"Ba, J.L., Kiros, J.R., et al.: Layer normalization. arXiv preprint arXiv:1607.06450 (2016)"},{"key":"17_CR20","first-page":"24261","volume":"34","author":"IO Tolstikhin","year":"2021","unstructured":"Tolstikhin, I.O., Houlsby, N., Kolesnikov, A., et al.: MLP-mixer: an all-MLP architecture for vision. Adv. Neural. Inf. Process. Syst. 34, 24261\u201324272 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Li, X., Sun, X., Meng, Y., et al.: Dice loss for data-imbalanced NLP tasks. arXiv preprint arXiv:1911.02855 (2019)","DOI":"10.18653\/v1\/2020.acl-main.45"},{"key":"17_CR22","unstructured":"Kervadec, H., Bouchtiba, J., et al.: Boundary loss for highly unbalanced segmentation. International Conference on Medical Imaging with Deep Learning, pp. 285\u2013296. PMLR (2019)"},{"key":"17_CR23","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., et al.: Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, 6\u201312 September 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"17_CR24","unstructured":"Paszke, A., Gross, S., Massa, F., et al.: Pytorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., et al.: Imagenet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition. pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"17_CR26","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Yu, L., Lin, Z., Shen, X., et al.: Mattnet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Ye, L., Rochan, M., Liu, Z., et al.: Cross-modal self-attention network for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10502\u201310511 (2019)","DOI":"10.1109\/CVPR.2019.01075"},{"key":"17_CR29","unstructured":"Chen, Y.W., Tsai, Y.H., Wang, T., et al.: Referring expression object segmentation with caption-aware consistency. arXiv preprint arXiv:1910.04748 (2019)"},{"key":"17_CR30","doi-asserted-by":"crossref","unstructured":"Chen, D.J., Jia, S., et al.: See-through-text grouping for referring image segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7454\u20137463 (2019)","DOI":"10.1109\/ICCV.2019.00755"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Hu, Z., Feng, G., Sun, J., et al.: Bi-directional relationship inferring network for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4424\u20134433 (2020)","DOI":"10.1109\/CVPR42600.2020.00448"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Hui, T., et al.: Linguistic structure guided context modeling for referring image segmentation. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, 23\u201328 August 2020, Proceedings, Part X 16, pp. 59\u201375. Springe (2020)","DOI":"10.1007\/978-3-030-58607-2_4"},{"issue":"9","key":"17_CR33","first-page":"4761","volume":"44","author":"S Liu","year":"2021","unstructured":"Liu, S., Hui, T., et al.: Cross-modal progressive comprehension for referring segmentation. IEEE Trans. Pattern Anal. Mach. Intell. 44(9), 4761\u20134775 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Luo, G., Zhou, Y., Sun, X., et al.: Multi-task collaborative network for joint referring expression comprehension and segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10034\u201310043 (2020)","DOI":"10.1109\/CVPR42600.2020.01005"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Luo, G., Zhou, Y., Ji, R., et al.: Cascade grouped attention network for referring expression segmentation. In: Proceedings of the 28th ACM International Conference on Multimedia, pp. 1274\u20131282 (2020)","DOI":"10.1145\/3394171.3414006"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Liu, C., Jiang, X., Ding, H.: Instance-specific feature propagation for referring segmentation. IEEE Trans. Multim. 25, 3657\u20133667 (2022)","DOI":"10.1109\/TMM.2022.3163578"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Jing, Y., Kong, T., Wang, W., et al.: Locate then segment: a strong pipeline for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9858\u20139867 (2021)","DOI":"10.1109\/CVPR46437.2021.00973"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Kim, N., Kim, D., Lan, C., et al.: Restr: convolution-free referring image segmentation using transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18145\u201318154 (2022)","DOI":"10.1109\/CVPR52688.2022.01761"},{"key":"17_CR39","doi-asserted-by":"crossref","unstructured":"Wu, J., Li, X., Li, X., et al.: Towards robust referring image segmentation. IEEE Trans. Image Process. 33, 1782\u20131794 (2024)","DOI":"10.1109\/TIP.2024.3371348"}],"container-title":["Lecture Notes in Computer Science","Advanced Intelligent Computing Technology and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-5597-4_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,1]],"date-time":"2024-08-01T09:13:24Z","timestamp":1722503604000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-5597-4_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819755967","9789819755974"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-5597-4_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"2 August 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICIC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Intelligent Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tianjin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 August 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icic2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.ic-icc.cn\/2024\/index.htm","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}