{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,19]],"date-time":"2025-12-19T09:58:06Z","timestamp":1766138286539,"version":"3.37.3"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2023,4,17]],"date-time":"2023-04-17T00:00:00Z","timestamp":1681689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,4,17]],"date-time":"2023-04-17T00:00:00Z","timestamp":1681689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"University Synergy Innovation Program of Anhui Province","award":["GXXT-2022-043"],"award-info":[{"award-number":["GXXT-2022-043"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"crossref","award":["62105002","61902104"],"award-info":[{"award-number":["62105002","61902104"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"crossref"}]},{"name":"Anhui Provincial Key Research and Development Program","award":["2022a05020042"],"award-info":[{"award-number":["2022a05020042"]}]},{"name":"Anhui International Joint Research Center for Ancient Architecture Intellisencing and Multi-Dimensional Modeling","award":["GJZZX2021KF01"],"award-info":[{"award-number":["GJZZX2021KF01"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2023,8]]},"DOI":"10.1007\/s00530-023-01097-8","type":"journal-article","created":{"date-parts":[[2023,4,17]],"date-time":"2023-04-17T05:02:31Z","timestamp":1681707751000},"page":"2073-2083","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Hierarchical cross-modal contextual attention network for visual grounding"],"prefix":"10.1007","volume":"29","author":[{"given":"Xin","family":"Xu","sequence":"first","affiliation":[]},{"given":"Gang","family":"Lv","sequence":"additional","affiliation":[]},{"given":"Yining","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Yuxia","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Fudong","family":"Nian","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2023,4,17]]},"reference":[{"key":"1097_CR1","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., et\u00a0al.: End-to-end object detection with transformers. In: European conference on computer vision, Springer, pp 213\u2013229 (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"1097_CR2","doi-asserted-by":"crossref","unstructured":"Chen, L., Ma, W., Xiao, J., et\u00a0al.: Ref-nms: Breaking proposal bottlenecks in two-stage referring expression grounding. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 1036\u20131044 (2021)","DOI":"10.1609\/aaai.v35i2.16188"},{"key":"1097_CR3","unstructured":"Chen, X., Ma, L., Chen, J., et\u00a0al.: Real-time referring expression comprehension by single-stage grounding network. arXiv preprint arXiv:1812.03426 (2018)"},{"key":"1097_CR4","doi-asserted-by":"crossref","unstructured":"Cui, R., Qian, T., Peng, P., et\u00a0al.: Video moment retrieval from text queries via single frame annotation. 
arXiv preprint arXiv:2204.09409 (2022)","DOI":"10.1145\/3477495.3532078"},{"key":"1097_CR5","doi-asserted-by":"crossref","unstructured":"Deng, C., Wu, Q., Wu, Q., et\u00a0al.: Visual grounding via accumulated attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7746\u20137755 (2018)","DOI":"10.1109\/CVPR.2018.00808"},{"key":"1097_CR6","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., et\u00a0al.: Transvg: End-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"1097_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., et\u00a0al.: Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"1097_CR8","doi-asserted-by":"crossref","unstructured":"Du, Y., Fu, Z., Liu, Q., et\u00a0al.: Visual grounding with transformers. In: 2022 IEEE International Conference on Multimedia and Expo (ICME), IEEE, pp 1\u20136 (2022)","DOI":"10.1109\/ICME52920.2022.9859880"},{"key":"1097_CR9","doi-asserted-by":"crossref","unstructured":"Gabeur, V., Sun, C., Alahari, K., et\u00a0al.: Multi-modal transformer for video retrieval. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part IV 16, Springer, pp 214\u2013229 (2020)","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"1097_CR10","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast r-cnn. In: Proceedings of the IEEE international conference on computer vision, pp 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"1097_CR11","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1109\/TPAMI.2022.3152247","volume":"45","author":"K Han","year":"2022","unstructured":"Han, K., Wang, Y., Chen, H., et al.: A survey on vision transformer. IEEE Transact. Patt. Anal. Mach. Intell. 45, 87\u2013110 (2022)","journal-title":"IEEE Transact. Patt. Anal. Mach. Intell."},{"key":"1097_CR12","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., et\u00a0al.: Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1097_CR13","doi-asserted-by":"publisher","first-page":"684","DOI":"10.1109\/TPAMI.2019.2911066","volume":"44","author":"R Hong","year":"2019","unstructured":"Hong, R., Liu, D., Mo, X., et al.: Learning to compose and reason with language tree structures for visual grounding. IEEE Transact. Patt. Anal. Mach. Intell. 44, 684\u201396 (2019)","journal-title":"IEEE Transact. Pattern Anal. Mach. Intellig."},{"key":"1097_CR14","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., Andreas, J., et\u00a0al.: Modeling relationships in referential expressions with compositional modular networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1115\u20131124 (2017)","DOI":"10.1109\/CVPR.2017.470"},{"key":"1097_CR15","doi-asserted-by":"crossref","unstructured":"Huang, B., Lian, D., Luo, W., et\u00a0al.: Look before you leap: Learning landmark features for one-stage visual grounding. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 16,888\u201316,897 (2021)","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"1097_CR16","doi-asserted-by":"crossref","unstructured":"Jiao, Y., Jie, Z., Chen, J., et\u00a0al.: Suspected object matters: Rethinking model\u2019s prediction for one-stage visual grounding. arXiv preprint arXiv:2203.05186 (2022)","DOI":"10.1145\/3581783.3611721"},{"key":"1097_CR17","doi-asserted-by":"crossref","unstructured":"Kovaleva, O., Romanov, A., Rogers, A., et\u00a0al.: Revealing the dark secrets of bert. arXiv preprint arXiv:1908.08593 (2019)","DOI":"10.18653\/v1\/D19-1445"},{"key":"1097_CR18","doi-asserted-by":"crossref","unstructured":"Kovvuri, R., Nevatia, R.: Pirc net: Using proposal indexing, relationships and context for phrase grounding. In: Asian Conference on Computer Vision, Springer, pp 451\u2013467 (2018)","DOI":"10.1007\/978-3-030-20870-7_28"},{"key":"1097_CR19","doi-asserted-by":"crossref","unstructured":"Liao, Y., Liu, S., Li, G., et\u00a0al.: A real-time cross-modality correlation filtering method for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 10,880\u201310,889 (2020)","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"1097_CR20","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Maire, M., Belongie, S., et\u00a0al.: Microsoft coco: Common objects in context. In: European conference on computer vision, Springer, pp 740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1097_CR21","doi-asserted-by":"crossref","unstructured":"Liu, D., Zhang, H., Wu, F., et\u00a0al.: Learning to assemble neural module tree networks for visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4673\u20134682 (2019a)","DOI":"10.1109\/ICCV.2019.00477"},{"key":"1097_CR22","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, Z., Shao, J., et\u00a0al.: Improving referring expression grounding with cross-modal attention-guided erasing. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 1950\u20131959 (2019b)","DOI":"10.1109\/CVPR.2019.00205"},{"key":"1097_CR23","doi-asserted-by":"crossref","unstructured":"Liu, Y., Li, S., Wu, Y., et\u00a0al.: Umt: Unified multi-modal transformers for joint video moment retrieval and highlight detection. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 3042\u20133051 (2022)","DOI":"10.1109\/CVPR52688.2022.00305"},{"key":"1097_CR24","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"1097_CR25","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., et\u00a0al.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"1097_CR26","doi-asserted-by":"crossref","unstructured":"Nagaraja, V.K., Morariu, V.I., Davis, L.S.: Modeling context between objects for referring expression understanding. In: Computer Vision\u2013ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, Springer, pp 792\u2013807 (2016)","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"1097_CR27","unstructured":"Parmar, N., Vaswani, A., Uszkoreit, J., et\u00a0al.: Image transformer. 
In: International conference on machine learning, PMLR, pp 4055\u20134064 (2018)"},{"key":"1097_CR28","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., et\u00a0al.: Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE international conference on computer vision, pp 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"1097_CR29","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Kordas, P., Kiapour, M.H., et\u00a0al.: Conditional image-text embedding networks. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 249\u2013264 (2018)","DOI":"10.1007\/978-3-030-01258-8_16"},{"key":"1097_CR30","doi-asserted-by":"crossref","unstructured":"Qian, S., Wang, J., Hu, J., et\u00a0al.: Hierarchical multi-modal contextual attention network for fake news detection. In: Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp 153\u2013162 (2021)","DOI":"10.1145\/3404835.3462871"},{"key":"1097_CR31","doi-asserted-by":"publisher","first-page":"4426","DOI":"10.1109\/TMM.2020.3042066","volume":"23","author":"Y Qiao","year":"2020","unstructured":"Qiao, Y., Deng, C., Wu, Q.: Referring expression comprehension: a survey of methods and datasets. IEEE Transact. Multimedia 23, 4426\u20134440 (2020)","journal-title":"IEEE Transact. Multimedia"},{"key":"1097_CR32","unstructured":"Redmon, J., Farhadi, A.: Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767 (2018)"},{"key":"1097_CR33","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., et\u00a0al.: Generalized intersection over union: A metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"1097_CR34","doi-asserted-by":"crossref","unstructured":"Sadhu, A., Chen, K., Nevatia, R.: Zero-shot grounding of objects from natural language queries. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4694\u20134703 (2019)","DOI":"10.1109\/ICCV.2019.00479"},{"key":"1097_CR35","unstructured":"Song, Y., Wang, J., Liang, Z., et\u00a0al.: Utilizing bert intermediate layers for aspect based sentiment analysis and natural language inference. arXiv preprint arXiv:2002.04815 (2020)"},{"issue":"2","key":"1097_CR36","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., et al.: Learning two-branch neural networks for image-text matching tasks. IEEE Transact. Pattern Anal. Mach. Intell. 41(2), 394\u2013407 (2018)","journal-title":"IEEE Transact. Pattern Anal. Mach. Intell."},{"key":"1097_CR37","doi-asserted-by":"crossref","unstructured":"Wang, P., Wu, Q., Cao, J., et\u00a0al.: Neighbourhood watch: Referring expression comprehension via language-guided graph attention networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 1960\u20131968 (2019)","DOI":"10.1109\/CVPR.2019.00206"},{"key":"1097_CR38","unstructured":"Wu, P., He, X., Tang, M., et\u00a0al.: Hanet: Hierarchical alignment networks for video-text retrieval. 
In: Proceedings of the 29th ACM international conference on Multimedia, pp 3518\u20133527 (2021)"},{"key":"1097_CR39","unstructured":"Yang, C., Wang, G., Li, D., et\u00a0al.: Ppgn: Phrase-guided proposal generation network for referring expression comprehension. arXiv preprint arXiv:2012.10890 (2020a)"},{"key":"1097_CR40","doi-asserted-by":"crossref","unstructured":"Yang, S., Li, G., Yu, Y.: Dynamic graph attention for referring expression comprehension. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4644\u20134653 (2019a)","DOI":"10.1109\/ICCV.2019.00474"},{"key":"1097_CR41","doi-asserted-by":"crossref","unstructured":"Yang, S., Li, G., Yu, Y.: Propagating over phrase relations for one-stage visual grounding. In: European Conference on Computer Vision, Springer, pp 589\u2013605 (2020b)","DOI":"10.1007\/978-3-030-58529-7_35"},{"key":"1097_CR42","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., et\u00a0al.: A fast and accurate one-stage approach to visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 4683\u20134693 (2019b)","DOI":"10.1109\/ICCV.2019.00478"},{"key":"1097_CR43","doi-asserted-by":"crossref","unstructured":"Yang, Z., Chen, T., Wang, L., et\u00a0al.: Improving one-stage visual grounding by recursive sub-query construction. In: European Conference on Computer Vision, Springer, pp 387\u2013404 (2020c)","DOI":"10.1007\/978-3-030-58568-6_23"},{"key":"1097_CR44","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., et al.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Transact. Assoc. Computat. Linguistics 2, 67\u201378 (2014)","journal-title":"Transact. Assoc. Computat. Linguistics"},{"key":"1097_CR45","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., et\u00a0al.: Modeling context in referring expressions. In: European Conference on Computer Vision, Springer, pp 69\u201385 (2016)","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"1097_CR46","doi-asserted-by":"crossref","unstructured":"Yu, L., Lin, Z., Shen, X., et\u00a0al.: Mattnet: Modular attention network for referring expression comprehension. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 1307\u20131315 (2018a)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"1097_CR47","doi-asserted-by":"crossref","unstructured":"Yu, Z., Yu, J., Xiang, C., et\u00a0al.: Rethinking diversified and discriminative proposal generation for visual grounding. arXiv preprint arXiv:1805.03508 (2018b)","DOI":"10.24963\/ijcai.2018\/155"},{"key":"1097_CR48","doi-asserted-by":"crossref","unstructured":"Zhang, H., Niu, Y., Chang, S.F.: Grounding referring expressions in images by variational context. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4158\u20134166 (2018)","DOI":"10.1109\/CVPR.2018.00437"},{"key":"1097_CR49","doi-asserted-by":"crossref","unstructured":"Zhuang, B., Wu, Q., Shen, C., et\u00a0al.: Parallel attention: A unified framework for visual object discovery through dialogs and queries. 
In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4252\u20134261 (2018)","DOI":"10.1109\/CVPR.2018.00447"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01097-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-023-01097-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-023-01097-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,10]],"date-time":"2023-12-10T19:22:25Z","timestamp":1702236145000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-023-01097-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,4,17]]},"references-count":49,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2023,8]]}},"alternative-id":["1097"],"URL":"https:\/\/doi.org\/10.1007\/s00530-023-01097-8","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2023,4,17]]},"assertion":[{"value":"11 January 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 April 2023","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 April 2023","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}
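The object above is a Crossref REST API work record. As a minimal sketch (assuming network access and nothing beyond the Python standard library), the same record can be fetched from the public api.crossref.org works endpoint and the fields shown above read back; the field names used below ("message", "title", "author", "DOI", "reference-count") all appear in the record itself.

import json
import urllib.request

# DOI of the work deposited above.
DOI = "10.1007/s00530-023-01097-8"

# Crossref serves work records at /works/{doi}; the response has the
# same shape as the record above: a top-level "status"/"message" pair.
url = f"https://api.crossref.org/works/{DOI}"
with urllib.request.urlopen(url) as resp:
    record = json.load(resp)

msg = record["message"]
# "title" is a list of strings; this record has a single title.
print(msg["title"][0])
# Each author entry carries "given" and "family" name parts.
print(", ".join(f'{a["given"]} {a["family"]}' for a in msg["author"]))
# "reference" is the array of 49 entries keyed 1097_CR1..1097_CR49 above.
print(msg["DOI"], "-", msg["reference-count"], "references")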