{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T04:48:42Z","timestamp":1769143722617,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":38,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819557158","type":"print"},{"value":"9789819557165","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5716-5_20","type":"book-chapter","created":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T13:07:21Z","timestamp":1769087241000},"page":"318-332","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Improve Visual Grounding with\u00a0Dynamic Gating and\u00a0Dual Stream Attention"],"prefix":"10.1007","author":[{"given":"Qianqian","family":"Lin","sequence":"first","affiliation":[]},{"given":"Xiaoxu","family":"Song","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Shan","sequence":"additional","affiliation":[]},{"given":"Jiaying","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Junyi","family":"Jiang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,23]]},"reference":[{"key":"20_CR1","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Deng, C., Wu, Q., Wu, Q., Hu, F., Lyu, F., Tan, M.: Visual grounding via accumulated attention. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7746\u20137755 (2018)","DOI":"10.1109\/CVPR.2018.00808"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: TRANSVG: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"20_CR4","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., et\u00a0al.: An image is worth 16x16 words: transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)"},{"key":"20_CR5","doi-asserted-by":"crossref","unstructured":"Du, Y., Fu, Z., Liu, Q., Wang, Y.: Visual grounding with transformers. In: IEEE International Conference on Multimedia and Expo, ICME 2022, Taipei, Taiwan, July 18-22, 2022, pp.\u00a01\u20136. IEEE (2022)","DOI":"10.1109\/ICME52920.2022.9859880"},{"key":"20_CR6","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"2","key":"20_CR7","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1109\/TPAMI.2019.2911066","volume":"44","author":"R Hong","year":"2019","unstructured":"Hong, R., Liu, D., Mo, X., He, X., Zhang, H.: Learning to compose and reason with language tree structures for visual grounding. IEEE Trans. Patt. Anal. Mach. Intell. 44(2), 684\u2013696 (2019)","journal-title":"IEEE Trans. Patt. Anal. Mach. Intell."},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., Andreas, J., Darrell, T., Saenko, K.: Modeling relationships in referential expressions with compositional modular networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1115\u20131124 (2017)","DOI":"10.1109\/CVPR.2017.470"},{"issue":"10s","key":"20_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3505244","volume":"54","author":"S Khan","year":"2022","unstructured":"Khan, S., Naseer, M., Hayat, M., Zamir, S.W., Khan, F.S., Shah, M.: Transformers in vision: a survey. ACM Comput. Surv. (CSUR) 54(10s), 1\u201341 (2022)","journal-title":"ACM Comput. Surv. (CSUR)"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Li, L.H., Zhang, P., Zhang, H., Yang, J., Li, C., Zhong, Y., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Liao, Y., Liu, S., Li, G., Wang, F., Chen, Y., Qian, C.: A real-time cross-modality correlation filtering method for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10880\u201310889 (2020)","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"20_CR12","doi-asserted-by":"publisher","unstructured":"Lin, T., Maire, M., Belongie, S.J., Hays, J., Perona, P., Ramanan, D., et\u00a0al.: Microsoft COCO: common objects in context. In: Fleet, D.J., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol.\u00a08693, pp. 740\u2013755. Springer (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Liu, T., Liu, X., Huang, S., Chen, H., Yin, Q., Qin, L., et\u00a0al.: Dara: Domain-and relation-aware adapters make parameter-efficient tuning for visual grounding. In: 2024 IEEE International Conference on Multimedia and Expo (ICME), pp.\u00a01\u20136. IEEE (2024)","DOI":"10.1109\/ICME57554.2024.10688132"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Liu, Z., Lin, Y., Cao, Y., Hu, H., Wei, Y., Zhang, Z., et\u00a0al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"20_CR16","doi-asserted-by":"publisher","first-page":"4426","DOI":"10.1109\/TMM.2020.3042066","volume":"23","author":"Y Qiao","year":"2020","unstructured":"Qiao, Y., Deng, C., Wu, Q.: Referring expression comprehension: a survey of methods and datasets. IEEE Trans. Multimedia 23, 4426\u20134440 (2020)","journal-title":"IEEE Trans. Multimedia"},{"key":"20_CR17","unstructured":"Redmon, J., Farhadi, A.: Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767 (2018)"},{"key":"20_CR18","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28 (2015)"},{"issue":"2","key":"20_CR19","doi-asserted-by":"publisher","first-page":"1181","DOI":"10.1109\/TPAMI.2023.3328185","volume":"46","author":"F Shi","year":"2023","unstructured":"Shi, F., Gao, R., Huang, W., Wang, L.: Dynamic mdetr: a dynamic multimodal transformer decoder for visual grounding. IEEE Trans. Patt. Anal. Mach. Intell. 46(2), 1181\u20131198 (2023)","journal-title":"IEEE Trans. Patt. Anal. Mach. Intell."},{"key":"20_CR20","doi-asserted-by":"crossref","unstructured":"Su, W., Miao, P., Dou, H., Fu, Y., Li, X.: Referring expression comprehension using language adaptive inference. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a037, pp. 2357\u20132365 (2023)","DOI":"10.1609\/aaai.v37i2.25331"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Su, W., Miao, P., Dou, H., Wang, G., Qiao, L., Li, Z., et\u00a0al.: Language adaptive weight generation for multi-task visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10857\u201310866 (2023)","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"20_CR22","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., et\u00a0al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Wang, J., Hao, S., Shan, J., Song, X.: Visual language\u2013let the product say what you want. In: Proceedings of the AAAI Conference on Artificial Intelligence. vol.\u00a038, pp. 23841\u201323843 (2024)","DOI":"10.1609\/aaai.v38i21.30583"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Wang, P., Wu, Q., Cao, J., Shen, C., Gao, L., Hengel, A.v.d.: Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1960\u20131968 (2019)","DOI":"10.1109\/CVPR.2019.00206"},{"key":"20_CR25","unstructured":"Xiao, L., Yang, X., Lan, X., Wang, Y., Xu, C.: Towards visual grounding: a survey. arXiv preprint arXiv:2412.20206 (2024)"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Yang, S., Li, G., Yu, Y.: Dynamic graph attention for referring expression comprehension. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4644\u20134653 (2019)","DOI":"10.1109\/ICCV.2019.00474"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Yang, Z., Wang, J., Tang, Y., Chen, K., Zhao, H., Torr, P.H.: Lavt: language-aware vision transformer for referring image segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18155\u201318165 (2022)","DOI":"10.1109\/CVPR52688.2022.01762"},{"key":"20_CR28","doi-asserted-by":"publisher","unstructured":"Yang, Z., Chen, T., Wang, L., Luo, J.: Improving one-stage visual grounding by recursive sub-query construction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J. (eds.) ECCV 2020. Lecture Notes in Computer Science, vol. 12359, pp. 387\u2013404. Springer (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_23","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., Huang, W., Yu, D., Luo, J.: A fast and accurate one-stage approach to visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4683\u20134693 (2019)","DOI":"10.1109\/ICCV.2019.00478"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Yu, L., Lin, Z., Shen, X., Yang, J., Lu, X., Bansal, M., et\u00a0al.: Mattnet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"20_CR31","doi-asserted-by":"publisher","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol.\u00a09906, pp. 69\u201385. Springer (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"20_CR32","first-page":"36067","volume":"35","author":"H Zhang","year":"2022","unstructured":"Zhang, H., Zhang, P., Hu, X., Chen, Y.C., Li, L., Dai, X., et al.: Glipv2: unifying localization and vision-language understanding. Adv. Neural. Inf. Process. Syst. 35, 36067\u201336080 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Zheng, M., Zhang, J., Chen, Q., Peng, Y., Liu, Y.: Resvg: enhancing relation and semantic understanding in multiple instances for visual grounding. In: Proceedings of the 32nd ACM International Conference on Multimedia, pp. 1187\u20131196 (2024)","DOI":"10.1145\/3664647.3681660"},{"issue":"1","key":"20_CR34","doi-asserted-by":"publisher","first-page":"134","DOI":"10.1109\/TNNLS.2021.3090426","volume":"34","author":"Y Zhou","year":"2021","unstructured":"Zhou, Y., Ji, R., Luo, G., Sun, X., Su, J., Ding, X., et al.: A real-time global inference network for one-stage referring expression comprehension. IEEE Trans. Neural Netw. Learn. Syst. 34(1), 134\u2013143 (2021)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Ren, T., Zhu, C., Sun, X., Liu, J., Ding, X., et\u00a0al.: Trar: routing the attention spans in transformer for visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2074\u20132084 (2021)","DOI":"10.1109\/ICCV48922.2021.00208"},{"key":"20_CR36","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Wu, Q., Shen, C., Reid, I., Van Den\u00a0Hengel, A.: Parallel attention: a unified framework for visual object discovery through dialogs and queries. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4252\u20134261 (2018)","DOI":"10.1109\/CVPR.2018.00447"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Zou, X., Dou, Z.Y., Yang, J., Gan, Z., Li, L., Li, C., et\u00a0al.: Generalized decoding for pixel, image, and language. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15116\u201315127 (2023)","DOI":"10.1109\/CVPR52729.2023.01451"}],"container-title":["Lecture Notes in Computer Science","Web and Big Data"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5716-5_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T13:07:35Z","timestamp":1769087255000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5716-5_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819557158","9789819557165"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5716-5_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"23 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"APWeb-WAIM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asia-Pacific Web (APWeb) and Web-Age Information Management (WAIM) Joint International Conference on Web and Big Data","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shenyang","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"apwebwaim2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/apweb2025.sau.edu.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}