{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T18:03:53Z","timestamp":1743098633119,"version":"3.40.3"},"publisher-location":"Cham","reference-count":36,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031781094"},{"type":"electronic","value":"9783031781100"}],"license":[{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,2]],"date-time":"2024-12-02T00:00:00Z","timestamp":1733097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-78110-0_3","type":"book-chapter","created":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T21:53:54Z","timestamp":1733090034000},"page":"36-50","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Multi-frequency Fine-Grained Matching for\u00a0Audio-Visual Segmentation"],"prefix":"10.1007","author":[{"given":"Yinhao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Tianyang","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Xiao-Jun","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Shaochuan","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Josef","family":"Kittler","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,2]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Chen, H., Xie, W., Afouras, T., Nagrani, A., Vedaldi, A., Zisserman, A.: Localizing visual sounds the hard way. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16867\u201316876 (2021)","DOI":"10.1109\/CVPR46437.2021.01659"},{"issue":"4","key":"3_CR2","doi-asserted-by":"publisher","first-page":"834","DOI":"10.1109\/TPAMI.2017.2699184","volume":"40","author":"LC Chen","year":"2017","unstructured":"Chen, L.C., Papandreou, G., Kokkinos, I., Murphy, K., Yuille, A.L.: Deeplab: semantic image segmentation with deep convolutional nets, atrous convolution, and fully connected CRFs. IEEE Trans. Pattern Anal. Mach. Intell. 40(4), 834\u2013848 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Chen, T., et al.: Bootstrapping audio-visual segmentation by strengthening audio cues. arXiv preprint arXiv:2402.02327 (2024)","DOI":"10.1109\/TCSVT.2024.3486344"},{"key":"3_CR4","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. arXiv (2021)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"3_CR5","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"3_CR6","doi-asserted-by":"crossref","unstructured":"Gao, R., Grauman, K.: 2.5 D visual sound. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 324\u2013333 (2019)","DOI":"10.1109\/CVPR.2019.00041"},{"key":"3_CR7","doi-asserted-by":"crossref","unstructured":"Gao, S., Chen, Z., Chen, G., Wang, W., Lu, T.: Avsegformer: audio-visual segmentation with transformer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 12155\u201312163 (2024)","DOI":"10.1609\/aaai.v38i11.29104"},{"key":"3_CR8","doi-asserted-by":"crossref","unstructured":"Gemmeke, J.F., et al.: Audio set: an ontology and human-labeled dataset for audio events. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 776\u2013780. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952261"},{"key":"3_CR9","doi-asserted-by":"crossref","unstructured":"Hao, D., Mao, Y., He, B., Han, X., Dai, Y., Zhong, Y.: Improving audio-visual segmentation with bidirectional generation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 2067\u20132075 (2024)","DOI":"10.1609\/aaai.v38i3.27978"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"3_CR11","doi-asserted-by":"crossref","unstructured":"Hershey, S., et\u00a0al.: CNN architectures for large-scale audio classification. In: 2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), pp. 131\u2013135. IEEE (2017)","DOI":"10.1109\/ICASSP.2017.7952132"},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: Discovering sounding objects by audio queries for audio visual segmentation. arXiv preprint arXiv:2309.09501 (2023)","DOI":"10.24963\/ijcai.2023\/97"},{"key":"3_CR13","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"3_CR14","unstructured":"Li, F., et al.: Semantic-SAM: segment and recognize anything at any granularity. arXiv preprint arXiv:2307.04767 (2023)"},{"key":"3_CR15","doi-asserted-by":"crossref","unstructured":"Li, K., Yang, Z., Chen, L., Yang, Y., Xiao, J.: CATR: combinatorial-dependence audio-queried transformer for audio-visual video segmentation. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 1485\u20131494 (2023)","DOI":"10.1145\/3581783.3611724"},{"key":"3_CR16","doi-asserted-by":"crossref","unstructured":"Lin, Y.B., Sung, Y.L., Lei, J., Bansal, M., Bertasius, G.: Vision transformers are parameter-efficient audio-visual learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2299\u20132309 (2023)","DOI":"10.1109\/CVPR52729.2023.00228"},{"key":"3_CR17","doi-asserted-by":"crossref","unstructured":"Liu, C., et al.: BAVS: bootstrapping audio-visual segmentation by integrating foundation knowledge. arXiv preprint arXiv:2308.10175 (2023)","DOI":"10.1109\/TMM.2024.3405622"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Liu, C., et al.: Audio-visual segmentation by exploring cross-modal mutual semantics. In: Proceedings of the 31st ACM International Conference on Multimedia, pp. 7590\u20137598 (2023)","DOI":"10.1145\/3581783.3612373"},{"key":"3_CR19","unstructured":"Liu, L., Chang, J., Yu, B.X., Lin, L., Tian, Q., Chen, C.W.: Prompt-matched semantic segmentation. arXiv preprint arXiv:2208.10159 (2022)"},{"key":"3_CR20","doi-asserted-by":"publisher","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 1\u201335 (2023). https:\/\/doi.org\/10.1145\/3560815","DOI":"10.1145\/3560815"},{"key":"3_CR21","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3431\u20133440 (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"3_CR23","unstructured":"Mao, Y., Zhang, J., Xiang, M., Lv, Y., Zhong, Y., Dai, Y.: Contrastive conditional latent diffusion for audio-visual segmentation. arXiv preprint arXiv:2307.16579 (2023)"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Mao, Y., Zhang, J., Xiang, M., Zhong, Y., Dai, Y.: Multimodal variational auto-encoder based audio-visual segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 954\u2013965 (2023)","DOI":"10.1109\/ICCV51070.2023.00094"},{"key":"3_CR25","unstructured":"Morgado, P., Nvasconcelos, N., Langlois, T., Wang, O.: Self-supervised generation of spatial audio for 360 video. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Owens, A., Efros, A.A.: Audio-visual scene analysis with self-supervised multisensory features. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 631\u2013648 (2018)","DOI":"10.1007\/978-3-030-01231-1_39"},{"key":"3_CR27","doi-asserted-by":"crossref","unstructured":"Qian, R., Hu, D., Dinkel, H., Wu, M., Xu, N., Lin, W.: Multiple sound sources localization from coarse to fine. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XX 16, pp. 292\u2013308. Springer, Cham (2020)","DOI":"10.1007\/978-3-030-58565-5_18"},{"key":"3_CR28","doi-asserted-by":"crossref","unstructured":"Strudel, R., Garcia, R., Laptev, I., Schmid, C.: Segmenter: transformer for semantic segmentation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7262\u20137272 (2021)","DOI":"10.1109\/ICCV48922.2021.00717"},{"issue":"3","key":"3_CR29","doi-asserted-by":"publisher","first-page":"415","DOI":"10.1007\/s41095-022-0274-8","volume":"8","author":"W Wang","year":"2022","unstructured":"Wang, W., et al.: PVT V2: improved baselines with pyramid vision transformer. Comput. Visual Media 8(3), 415\u2013424 (2022)","journal-title":"Comput. Visual Media"},{"key":"3_CR30","doi-asserted-by":"crossref","unstructured":"Wang, Y., Liu, W., Li, G., Ding, J., Hu, D., Li, X.: Prompting segmentation with sound is generalizable audio-visual source localizer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a038, pp. 5669\u20135677 (2024)","DOI":"10.1609\/aaai.v38i6.28378"},{"key":"3_CR31","unstructured":"Xie, E., Wang, W., Yu, Z., Anandkumar, A., Alvarez, J.M., Luo, P.: Segformer: simple and efficient design for semantic segmentation with transformers. In: Neural Information Processing Systems (NeurIPS) (2021)"},{"key":"3_CR32","doi-asserted-by":"crossref","unstructured":"Yang, Q., et al.: Cooperation does matter: exploring multi-order bilateral relations for audio-visual segmentation. arXiv preprint arXiv:2312.06462 (2023)","DOI":"10.1109\/CVPR52733.2024.02562"},{"key":"3_CR33","doi-asserted-by":"crossref","unstructured":"Zhao, H., Shi, J., Qi, X., Wang, X., Jia, J.: Pyramid scene parsing network. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2881\u20132890 (2017)","DOI":"10.1109\/CVPR.2017.660"},{"key":"3_CR34","doi-asserted-by":"crossref","unstructured":"Zheng, S., et\u00a0al.: Rethinking semantic segmentation from a sequence-to-sequence perspective with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6881\u20136890 (2021)","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Zhou, J., et al.: Audio\u2013visual segmentation. In: Computer Vision\u2013ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings, Part XXXVII, pp. 386\u2013403. Springer, Cham (2022)","DOI":"10.1007\/978-3-031-19836-6_22"},{"key":"3_CR36","doi-asserted-by":"crossref","unstructured":"Zhu, J., Lai, S., Chen, X., Wang, D., Lu, H.: Visual prompt multi-modal tracking. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9516\u20139526 (2023)","DOI":"10.1109\/CVPR52729.2023.00918"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-78110-0_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T23:31:12Z","timestamp":1733095872000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-78110-0_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,2]]},"ISBN":["9783031781094","9783031781100"],"references-count":36,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-78110-0_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,12,2]]},"assertion":[{"value":"2 December 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Kolkata","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"India","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 December 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"27","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icpr2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icpr2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}