{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T17:15:08Z","timestamp":1743095708578,"version":"3.40.3"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031705489"},{"type":"electronic","value":"9783031705496"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-70549-6_15","type":"book-chapter","created":{"date-parts":[[2024,9,8]],"date-time":"2024-09-08T09:02:15Z","timestamp":1725786135000},"page":"245-262","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Knowledge Mining of\u00a0Scene Text for\u00a0Referring Expression Comprehension"],"prefix":"10.1007","author":[{"given":"Chenyang","family":"Gao","sequence":"first","affiliation":[]},{"given":"Biao","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Wenwen","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Yuliang","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Xiang","family":"Bai","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,9]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Bajaj, M., Wang, L., Sigal, L.: G3raphGround: graph-based language grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4281\u20134290 (2019)","DOI":"10.1109\/ICCV.2019.00438"},{"key":"15_CR2","doi-asserted-by":"crossref","unstructured":"Biten, A.F., Litman, R., Xie, Y., Appalaraju, S., Manmatha, R.: LaTr: layout-aware transformer for scene-text VQA. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16548\u201316558 (2022)","DOI":"10.1109\/CVPR52688.2022.01605"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Biten, A.F., et al.: Scene text visual question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4291\u20134301 (2019)","DOI":"10.1109\/ICCV.2019.00439"},{"key":"15_CR4","doi-asserted-by":"publisher","first-page":"7208","DOI":"10.1109\/TMM.2022.3219642","volume":"25","author":"Y Bu","year":"2022","unstructured":"Bu, Y., et al.: Scene-text oriented referring expression comprehension. IEEE Trans. Multimedia 25, 7208\u20137221 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"15_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-030-58452-8_13","volume-title":"Computer Vision \u2013 ECCV 2020","author":"N Carion","year":"2020","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12346, pp. 213\u2013229. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Chen, L., Ma, W., Xiao, J., Zhang, H., Chang, S.F.: Ref-NMS: breaking proposal bottlenecks in two-stage referring expression grounding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 1036\u20131044 (2021)","DOI":"10.1609\/aaai.v35i2.16188"},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: TransVG: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"15_CR8","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Fang, S., Xie, H., Wang, Y., Mao, Z., Zhang, Y.: Read like humans: autonomous, bidirectional and iterative language modeling for scene text recognition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7098\u20137107 (2021)","DOI":"10.1109\/CVPR46437.2021.00702"},{"key":"15_CR10","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-031-41682-8_25","volume-title":"Document Analysis and Recognition - ICDAR 2023","author":"C Gao","year":"2023","unstructured":"Gao, C., et al.: Textrec: A dataset for referring expression comprehension with reading comprehension. In: Fink, G.A., Jain, R., Kise, K., Zanibbi, R. (eds.) ICDAR 2023. LNCS, vol. 14189, pp. 402\u2013420. Springer, Cham (2023). https:\/\/doi.org\/10.1007\/978-3-031-41682-8_25"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Gao, D., Li, K., Wang, R., Shan, S., Chen, X.: Multi-modal graph neural network for joint reasoning on vision and scene text. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12746\u201312756 (2020)","DOI":"10.1109\/CVPR42600.2020.01276"},{"key":"15_CR12","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"417","DOI":"10.1007\/978-3-030-58520-4_25","volume-title":"Computer Vision \u2013 ECCV 2020","author":"D Gurari","year":"2020","unstructured":"Gurari, D., Zhao, Y., Zhang, M., Bhattacharya, N.: Captioning images taken by people who are blind. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 417\u2013434. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_25"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"Hu, R., Singh, A., Darrell, T., Rohrbach, M.: Iterative answer prediction with pointer-augmented multimodal transformers for textVQA. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9992\u201310002 (2020)","DOI":"10.1109\/CVPR42600.2020.01001"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4555\u20134564 (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"Huang, B., Lian, D., Luo, W., Gao, S.: Look before you leap: learning landmark features for one-stage visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16888\u201316897 (2021)","DOI":"10.1109\/CVPR46437.2021.01661"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Jin, L., et al.: RefCLIP: a universal teacher for weakly supervised referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2681\u20132690 (2023)","DOI":"10.1109\/CVPR52729.2023.00263"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: MDETR-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"15_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"715","DOI":"10.1007\/978-3-030-58545-7_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y Kant","year":"2020","unstructured":"Kant, Y., et al.: Spatially aware multimodal transformers for TextVQA. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12354, pp. 715\u2013732. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58545-7_41"},{"key":"15_CR19","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"706","DOI":"10.1007\/978-3-030-58621-8_41","volume-title":"Computer Vision \u2013 ECCV 2020","author":"M Liao","year":"2020","unstructured":"Liao, M., Pang, G., Huang, J., Hassner, T., Bai, X.: Mask TextSpotter v3: segmentation proposal network for robust scene text spotting. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 706\u2013722. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_41"},{"issue":"8","key":"15_CR20","doi-asserted-by":"publisher","first-page":"3676","DOI":"10.1109\/TIP.2018.2825107","volume":"27","author":"M Liao","year":"2018","unstructured":"Liao, M., Shi, B., Bai, X.: Textboxes++: a single-shot oriented scene text detector. IEEE Trans. Image Process. 27(8), 3676\u20133690 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"15_CR21","doi-asserted-by":"crossref","unstructured":"Liao, M., Shi, B., Bai, X., Wang, X., Liu, W.: Textboxes: a fast text detector with a single deep neural network. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a031 (2017)","DOI":"10.1609\/aaai.v31i1.11196"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Liao, M., Wan, Z., Yao, C., Chen, K., Bai, X.: Real-time scene text detection with differentiable binarization. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11474\u201311481 (2020)","DOI":"10.1609\/aaai.v34i07.6812"},{"issue":"1","key":"15_CR23","doi-asserted-by":"publisher","first-page":"919","DOI":"10.1109\/TPAMI.2022.3155612","volume":"45","author":"M Liao","year":"2022","unstructured":"Liao, M., Zou, Z., Wan, Z., Yao, C., Bai, X.: Real-time scene text detection with differentiable binarization and adaptive scale fusion. IEEE Trans. Pattern Anal. Mach. Intell. 45(1), 919\u2013931 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Liao, Y., et al.: A real-time cross-modality correlation filtering method for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10880\u201310889 (2020)","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"15_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"71","DOI":"10.1007\/978-3-030-01264-9_5","volume-title":"Computer Vision \u2013 ECCV 2018","author":"P Lyu","year":"2018","unstructured":"Lyu, P., Liao, M., Yao, C., Wu, W., Bai, X.: Mask TextSpotter: an end-to-end trainable neural network for spotting text with arbitrary shapes. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) Computer Vision \u2013 ECCV 2018. LNCS, vol. 11218, pp. 71\u201388. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01264-9_5"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Mafla, A., de\u00a0Rezende, R.S., G\u2019omez, L., Larlus, D., Karatzas, D.: StacMR: scene-text aware cross-modal retrieval. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 2219\u20132229 (2021)","DOI":"10.1109\/WACV48630.2021.00227"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A.L., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11\u201320 (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Mu, Z., Tang, S., Tan, J., Yu, Q., Zhuang, Y.: Disentangled motif-aware graph learning for phrase grounding. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 13587\u201313594 (2021)","DOI":"10.1609\/aaai.v35i15.17602"},{"key":"15_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"792","DOI":"10.1007\/978-3-319-46493-0_48","volume-title":"Computer Vision \u2013 ECCV 2016","author":"VK Nagaraja","year":"2016","unstructured":"Nagaraja, V.K., Morariu, V.I., Davis, L.S.: Modeling context between objects for referring expression understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9908, pp. 792\u2013807. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46493-0_48"},{"key":"15_CR30","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"258","DOI":"10.1007\/978-3-030-01258-8_16","volume-title":"Computer Vision \u2013 ECCV 2018","author":"BA Plummer","year":"2018","unstructured":"Plummer, B.A., Kordas, P., Kiapour, M.H., Zheng, S., Piramuthu, R., Lazebnik, S.: Conditional image-text embedding networks. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11216, pp. 258\u2013274. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01258-8_16"},{"key":"15_CR31","unstructured":"Redmon, J., Farhadi, A.: YOLOv3: an incremental improvement. arXiv preprint arXiv:1804.02767 (2018)"},{"key":"15_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"817","DOI":"10.1007\/978-3-319-46448-0_49","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Rohrbach","year":"2016","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., Schiele, B.: Grounding of textual phrases in images by reconstruction. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 817\u2013834. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_49"},{"issue":"11","key":"15_CR33","doi-asserted-by":"publisher","first-page":"2298","DOI":"10.1109\/TPAMI.2016.2646371","volume":"39","author":"B Shi","year":"2016","unstructured":"Shi, B., Bai, X., Yao, C.: An end-to-end trainable neural network for image-based sequence recognition and its application to scene text recognition. IEEE Trans. Pattern Anal. Mach. Intell. 39(11), 2298\u20132304 (2016)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"9","key":"15_CR34","doi-asserted-by":"publisher","first-page":"2035","DOI":"10.1109\/TPAMI.2018.2848939","volume":"41","author":"B Shi","year":"2018","unstructured":"Shi, B., Yang, M., Wang, X., Lyu, P., Yao, C., Bai, X.: ASTER: an attentional scene text recognizer with flexible rectification. IEEE Trans. Pattern Anal. Mach. Intell. 41(9), 2035\u20132048 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"742","DOI":"10.1007\/978-3-030-58536-5_44","volume-title":"Computer Vision \u2013 ECCV 2020","author":"O Sidorov","year":"2020","unstructured":"Sidorov, O., Hu, R., Rohrbach, M., Singh, A.: TextCaps: a dataset for image captioning with reading comprehension. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 742\u2013758. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_44"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: Towards VQA models that can read. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8317\u20138326 (2019)","DOI":"10.1109\/CVPR.2019.00851"},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Su, W., et al.: Language adaptive weight generation for multi-task visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10857\u201310866 (2023)","DOI":"10.1109\/CVPR52729.2023.01045"},{"key":"15_CR38","doi-asserted-by":"crossref","unstructured":"Wang, H., Bai, X., Yang, M., Zhu, S., Wang, J., Liu, W.: Scene text retrieval via joint text detection and similarity learning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4556\u20134565 (2021)","DOI":"10.1109\/CVPR46437.2021.00453"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Wang, J., Tang, J., Luo, J.: Multimodal attention with image text spatial relationship for OCR-based image captioning. In: Proceedings of the ACM International Conference on Multimedia, pp. 4337\u20134345 (2020)","DOI":"10.1145\/3394171.3413753"},{"key":"15_CR40","doi-asserted-by":"crossref","unstructured":"Wang, J., Tang, J., Yang, M., Bai, X., Luo, J.: Improving OCR-based image captioning by incorporating geometrical relationship. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 1306\u20131315 (2021)","DOI":"10.1109\/CVPR46437.2021.00136"},{"issue":"2","key":"15_CR41","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., Lazebnik, S.: Learning two-branch neural networks for image-text matching tasks. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 394\u2013407 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR42","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5005\u20135013 (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"15_CR43","doi-asserted-by":"crossref","unstructured":"Wang, Z., Bao, R., Wu, Q., Liu, S.: Confidence-aware non-repetitive multimodal transformers for textcaps. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 2835\u20132843 (2021)","DOI":"10.1609\/aaai.v35i4.16389"},{"key":"15_CR44","doi-asserted-by":"crossref","unstructured":"Yang, M., et al.: Reading and writing: Discriminative and generative modeling for self-supervised text recognition. In: Proceedings of the ACM International Conference on Multimedia, pp. 4214\u20134223 (2022)","DOI":"10.1145\/3503161.3547784"},{"key":"15_CR45","doi-asserted-by":"crossref","unstructured":"Yang, S., Li, G., Yu, Y.: Dynamic graph attention for referring expression comprehension. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4644\u20134653 (2019)","DOI":"10.1109\/ICCV.2019.00474"},{"key":"15_CR46","doi-asserted-by":"crossref","unstructured":"Yang, S., Li, G., Yu, Y.: Graph-structured referring expression reasoning in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp. 9952\u20139961 (2020)","DOI":"10.1109\/CVPR42600.2020.00997"},{"key":"15_CR47","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"387","DOI":"10.1007\/978-3-030-58568-6_23","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Yang","year":"2020","unstructured":"Yang, Z., Chen, T., Wang, L., Luo, J.: Improving one-stage visual grounding by recursive sub-query construction. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12359, pp. 387\u2013404. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58568-6_23"},{"key":"15_CR48","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., Huang, W., Yu, D., Luo, J.: A fast and accurate one-stage approach to visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4683\u20134693 (2019)","DOI":"10.1109\/ICCV.2019.00478"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Tap: Text-aware pre-training for text-VQA and text-caption. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8751\u20138761 (2021)","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"15_CR50","doi-asserted-by":"crossref","unstructured":"Ye, J., Lin, X., He, L., Li, D., Chen, Q.: One-stage visual grounding via semantic-aware feature filter. In: Proceedings of the ACM International Conference on Multimedia, pp. 1702\u20131711 (2021)","DOI":"10.1145\/3474085.3475313"},{"key":"15_CR51","doi-asserted-by":"crossref","unstructured":"Ye, J., et al.: Shifting more attention to visual backbone: query-modulated refinement networks for end-to-end visual grounding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15502\u201315512 (2022)","DOI":"10.1109\/CVPR52688.2022.01506"},{"key":"15_CR52","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: MAttNet: modular attention network for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1307\u20131315 (2018)","DOI":"10.1109\/CVPR.2018.00142"},{"key":"15_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1007\/978-3-319-46475-6_5","volume-title":"Computer Vision \u2013 ECCV 2016","author":"L Yu","year":"2016","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 69\u201385. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5"},{"key":"15_CR54","doi-asserted-by":"crossref","unstructured":"Yu, W., Liu, Y., Hua, W., Jiang, D., Ren, B., Bai, X.: Turning a clip model into a scene text detector. arXiv preprint arXiv:2302.14338 (2023)","DOI":"10.1109\/CVPR52729.2023.00674"},{"key":"15_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, H., Niu, Y., Chang, S.F.: Grounding referring expressions in images by variational context. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4158\u20134166 (2018)","DOI":"10.1109\/CVPR.2018.00437"},{"key":"15_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Yuan, L., Guo, Y., He, Z., Huang, I.A., Lee, H.: Discriminative bimodal networks for visual localization and detection with natural language queries. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 557\u2013566 (2017)","DOI":"10.1109\/CVPR.2017.122"},{"key":"15_CR57","doi-asserted-by":"crossref","unstructured":"Zhu, Q., Gao, C., Wang, P., Wu, Q.: Simple is not easy: a simple strong baseline for TextVQA and TextCaps. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a035, pp. 3608\u20133615 (2021)","DOI":"10.1609\/aaai.v35i4.16476"}],"container-title":["Lecture Notes in Computer Science","Document Analysis and Recognition - ICDAR 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-70549-6_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,8]],"date-time":"2024-09-08T09:06:42Z","timestamp":1725786402000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-70549-6_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031705489","9783031705496"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-70549-6_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"9 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICDAR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Document Analysis and Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Athens","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Greece","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30 August 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 September 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icdar2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icdar2024.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}