{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T12:40:13Z","timestamp":1758976813163,"version":"3.44.0"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"7","license":[{"start":{"date-parts":[[2025,7,11]],"date-time":"2025-07-11T00:00:00Z","timestamp":1752192000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"},{"start":{"date-parts":[[2025,7,11]],"date-time":"2025-07-11T00:00:00Z","timestamp":1752192000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0"}],"funder":[{"DOI":"10.13039\/501100005057","name":"National Tsing Hua University","doi-asserted-by":"crossref","id":[{"id":"10.13039\/501100005057","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Data Sci Anal"],"published-print":{"date-parts":[[2025,11]]},"DOI":"10.1007\/s41060-025-00835-7","type":"journal-article","created":{"date-parts":[[2025,7,11]],"date-time":"2025-07-11T14:29:24Z","timestamp":1752244164000},"page":"6489-6505","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["GViG: low-resource generative visual grounding using prompt-based language modeling for visual question answering"],"prefix":"10.1007","volume":"20","author":[{"given":"Yi-Ting","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying-Jia","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung-Yu","family":"Kao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,11]]},"reference":[{"key":"835_CR1","unstructured":"Ustalov, D., Pavlichenko, N., Likhobaba, D., Smirnova, A.: Wsdm cup 2023 challenge on visual question answering (2023)"},{"key":"835_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: Vqa: visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2425\u20132433 (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"835_CR3","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., Mottaghi, R.: Ok-vqa: a visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/cvf Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204 (2019)","DOI":"10.1109\/CVPR.2019.00331"},{"key":"835_CR4","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L.-J., Shamma, D.A., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123, 32\u201373 (2017)","journal-title":"Int. J. Comput. Vis."},{"key":"835_CR5","doi-asserted-by":"crossref","unstructured":"Kafle, K., Kanan, C.: An analysis of visual question answering algorithms. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1965\u20131973 (2017)","DOI":"10.1109\/ICCV.2017.217"},{"key":"835_CR6","doi-asserted-by":"crossref","unstructured":"Huang, S., Hui, T., Liu, S., Li, G., Wei, Y., Han, J., Liu, L., Li, B.: Referring image segmentation via cross-modal progressive comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10488\u201310497 (2020)","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"835_CR7","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., Wang, L., Cervantes, C.M., Caicedo, J.C., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2641\u20132649 (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"835_CR8","doi-asserted-by":"crossref","unstructured":"Liu, R., Liu, C., Bai, Y., Yuille, A.L.: Clevr-ref+: diagnosing visual reasoning with referring expressions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4185\u20134194 (2019)","DOI":"10.1109\/CVPR.2019.00431"},{"key":"835_CR9","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: Transvg: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"835_CR10","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., Huang, W., Yu, D., Luo, J.: A fast and accurate one-stage approach to visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4683\u20134693 (2019)","DOI":"10.1109\/ICCV.2019.00478"},{"issue":"3","key":"835_CR11","doi-asserted-by":"publisher","first-page":"691","DOI":"10.6688\/JISE.202305_39(3).0015","volume":"39","author":"C-Y Wang","year":"2023","unstructured":"Wang, C.-Y., Yeh, I.-H., Liao, H.-Y.M.: You only learn one representation: unified network for multiple tasks. J. Inf. Sci. Eng. 39(3), 691\u2013709 (2023). https:\/\/doi.org\/10.6688\/JISE.202305_39(3).0015","journal-title":"J. Inf. Sci. Eng."},{"key":"835_CR12","first-page":"32","volume":"28","author":"S Ren","year":"2015","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster r-cnn: towards real-time object detection with region proposal networks. Adv. Neural Inf. Process. Syst. 28, 32 (2015)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"835_CR13","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A.C., Berg, T.L.: Modeling context in referring expressions. In: Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part II 14, pp. 69\u201385. Springer (2016)","DOI":"10.1007\/978-3-319-46475-6_5"},{"issue":"2","key":"835_CR14","doi-asserted-by":"publisher","first-page":"394","DOI":"10.1109\/TPAMI.2018.2797921","volume":"41","author":"L Wang","year":"2018","unstructured":"Wang, L., Li, Y., Huang, J., Lazebnik, S.: Learning two-branch neural networks for image-text matching tasks. IEEE Trans. Pattern Anal. Mach. Intell. 41(2), 394\u2013407 (2018)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"835_CR15","doi-asserted-by":"crossref","unstructured":"Nagaraja, V.K., Morariu, V.I., Davis, L.S.: Modeling context between objects for referring expression understanding. In: Computer Vision-ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11\u201314, 2016, Proceedings, Part IV 14, pp. 792\u2013807. Springer (2016)","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"835_CR16","doi-asserted-by":"crossref","unstructured":"Liao, Y., Liu, S., Li, G., Wang, F., Chen, Y., Qian, C., Li, B.: A real-time cross-modality correlation filtering method for referring expression comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10880\u201310889 (2020)","DOI":"10.1109\/CVPR42600.2020.01089"},{"key":"835_CR17","unstructured":"Chen, X., Ma, L., Chen, J., Jie, Z., Liu, W., Luo, J.: Real-time referring expression comprehension by single-stage grounding network (2018). arXiv preprint arXiv:1812.03426"},{"key":"835_CR18","unstructured":"Zhang, H., Wong, K.: VQA. GitHub (2023)"},{"key":"835_CR19","unstructured":"Komleva, E.: WSDM2023 VQA. GitHub (2023)"},{"key":"835_CR20","unstructured":"Gao, S., Chen, Z., Chen, G., Wang, W., Lu, T.: Champion solution for the wsdm2023 toloka vqa challenge (2023). arXiv preprint arXiv:2301.09045"},{"key":"835_CR21","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et al.: Improving language understanding by generative pre-training (2018)"},{"key":"835_CR22","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"835_CR23","doi-asserted-by":"crossref","unstructured":"Kamath, A., Singh, M., LeCun, Y., Synnaeve, G., Misra, I., Carion, N.: Mdetr-modulated detection for end-to-end multi-modal understanding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1780\u20131790 (2021)","DOI":"10.1109\/ICCV48922.2021.00180"},{"key":"835_CR24","doi-asserted-by":"publisher","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I, pp. 213\u2013229. Springer, Berlin (2020). https:\/\/doi.org\/10.1007\/978-3-030-58452-8_13","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"835_CR25","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: a robustly optimized Bert pretraining approach (2019). arXiv preprint arXiv:1907.11692"},{"key":"835_CR26","doi-asserted-by":"publisher","first-page":"598","DOI":"10.1007\/978-3-031-19833-5_35","volume-title":"Computer Vision-ECCV 2022","author":"C Zhu","year":"2022","unstructured":"Zhu, C., Zhou, Y., Shen, Y., Luo, G., Pan, X., Lin, M., Chen, C., Cao, L., Sun, X., Ji, R.: Seqtr: a simple yet universal network for visual grounding. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) Computer Vision-ECCV 2022, pp. 598\u2013615. Springer, Cham (2022)"},{"issue":"2","key":"835_CR27","doi-asserted-by":"publisher","first-page":"1523","DOI":"10.1109\/TNNLS.2022.3183827","volume":"35","author":"H Zhao","year":"2024","unstructured":"Zhao, H., Zhou, J.T., Ong, Y.-S.: Word2pix: word to pixel cross-attention transformer in visual grounding. IEEE Trans. Neural Netw. Learn. Syst. 35(2), 1523\u20131533 (2024). https:\/\/doi.org\/10.1109\/TNNLS.2022.3183827","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"835_CR28","unstructured":"Chen, L., Sharma, A., Shirore, C., Zhang, C., Buddharaju, B.R.: How to backpropagate through Hungarian in your Detr? (2022). arXiv preprint arXiv:2211.14448"},{"key":"835_CR29","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: a language modeling framework for object detection. In: International Conference on Learning Representations (2021)"},{"key":"835_CR30","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J.D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"835_CR31","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. In: Joint Conference of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, ACL-IJCNLP 2021, pp. 3816\u20133830. Association for Computational Linguistics (ACL) (2021)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"issue":"9","key":"835_CR32","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"835_CR33","doi-asserted-by":"crossref","unstructured":"Schick, T., Sch\u00fctze, H.: Exploiting cloze-questions for few-shot text classification and natural language inference. In: Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume, pp. 255\u2013269 (2021)","DOI":"10.18653\/v1\/2021.eacl-main.20"},{"key":"835_CR34","doi-asserted-by":"crossref","unstructured":"Ju, C., Han, T., Zheng, K., Zhang, Y., Xie, W.: Prompting visual-language models for efficient video understanding. In: European Conference on Computer Vision, pp. 105\u2013124. Springer (2022)","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"835_CR35","doi-asserted-by":"crossref","unstructured":"Jin, W., Cheng, Y., Shen, Y., Chen, W., Ren, X.: A good prompt is worth millions of parameters: low-resource prompt-based learning for vision-language models. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2763\u20132775 (2022)","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"835_CR36","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"835_CR37","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: Simvlm: simple visual language model pretraining with weak supervision. In: International Conference on Learning Representations (2021)"},{"key":"835_CR38","unstructured":"Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T., Dehghani, M., Minderer, M., Heigold, G., Gelly, S., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"835_CR39","first-page":"3965","volume":"34","author":"Z Dai","year":"2021","unstructured":"Dai, Z., Liu, H., Le, Q.V., Tan, M.: Coatnet: marrying convolution and attention for all data sizes. Adv. Neural. Inf. Process. Syst. 34, 3965\u20133977 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"835_CR40","unstructured":"Wang, P., Yang, A., Men, R., Lin, J., Bai, S., Li, Z., Ma, J., Zhou, C., Zhou, J., Yang, H.: Ofa: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"835_CR41","doi-asserted-by":"crossref","unstructured":"Li, C., Xu, H., Tian, J., Wang, W., Yan, M., Bi, B., Ye, J., Chen, H., Xu, G., Cao, Z., Zhang, J., Huang, S., Huang, F., Zhou, J., Si, L.: mPLUG: effective and efficient vision-language learning by cross-modal skip-connections. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 7241\u20137259. Association for Computational Linguistics (2022). https:\/\/aclanthology.org\/2022.emnlp-main.488","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"835_CR42","doi-asserted-by":"crossref","unstructured":"Liu, J., Ding, H., Cai, Z., Zhang, Y., Satzoda, R.K., Mahadevan, V., Manmatha, R.: Polyformer: referring image segmentation as sequential polygon generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18653\u201318663 (2023)","DOI":"10.1109\/CVPR52729.2023.01789"},{"key":"835_CR43","doi-asserted-by":"crossref","unstructured":"Carion, N., Massa, F., Synnaeve, G., Usunier, N., Kirillov, A., Zagoruyko, S.: End-to-end object detection with transformers. In: Computer Vision\u2013ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part I 16, pp. 213\u2013229. Springer (2020)","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"835_CR44","first-page":"64","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, \u0141, Polosukhin, I.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30, 64 (2017)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"835_CR45","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., Li, L., Yu, L., El\u00a0Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.: Uniter: universal image-text representation learning. In: Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXX, pp. 104\u2013120. Springer (2020)","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"835_CR46","first-page":"35","volume":"32","author":"J Lu","year":"2019","unstructured":"Lu, J., Batra, D., Parikh, D., Lee, S.: Vilbert: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Adv. Neural Inf. Process. Syst. 32, 35 (2019)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"835_CR47","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"835_CR48","unstructured":"Wang, Z., Yu, J., Yu, A.W., Dai, Z., Tsvetkov, Y., Cao, Y.: Simvlm: simple visual language model pretraining with weak supervision. In: International Conference on Learning Representations (2021)"},{"key":"835_CR49","unstructured":"Kenton, J.D.M.-W.C., Toutanova, L.K.: Bert: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"835_CR50","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1715\u20131725 (2016)","DOI":"10.18653\/v1\/P16-1162"},{"key":"835_CR51","unstructured":"Chen, T., Saxena, S., Li, L., Lin, T.-Y., Fleet, D.J., Hinton, G.E.: A unified sequence interface for vision tasks. Adv. Neural. Inf. Process. Syst. 35, 31333\u201331346 (2022)"},{"key":"835_CR52","unstructured":"Radford, A., Wu, J., Child, R., Luan, D., Amodei, D., Sutskever, I., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)"}],"container-title":["International Journal of Data Science and Analytics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41060-025-00835-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s41060-025-00835-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s41060-025-00835-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,27]],"date-time":"2025-09-27T11:58:53Z","timestamp":1758974333000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s41060-025-00835-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,11]]},"references-count":52,"journal-issue":{"issue":"7","published-print":{"date-parts":[[2025,11]]}},"alternative-id":["835"],"URL":"https:\/\/doi.org\/10.1007\/s41060-025-00835-7","relation":{},"ISSN":["2364-415X","2364-4168"],"issn-type":[{"type":"print","value":"2364-415X"},{"type":"electronic","value":"2364-4168"}],"subject":[],"published":{"date-parts":[[2025,7,11]]},"assertion":[{"value":"30 November 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"11 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}