{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T12:11:14Z","timestamp":1743077474457,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":23,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819722655"},{"type":"electronic","value":"9789819722662"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2266-2_7","type":"book-chapter","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T09:02:31Z","timestamp":1713949351000},"page":"83-94","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["GViG: Generative Visual Grounding Using Prompt-Based Language Modeling for\u00a0Visual Question Answering"],"prefix":"10.1007","author":[{"given":"Yi-Ting","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ying-Jia","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chia-Jen","family":"Yeh","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chun-Yi","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung-Yu","family":"Kao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,4,25]]},"reference":[{"key":"7_CR1","unstructured":"Chen, T., Saxena, S., Li, L., Fleet, D.J., Hinton, G.: Pix2seq: A language modeling framework for object detection. In: International Conference on Learning Representations (2021)"},{"key":"7_CR2","first-page":"3965","volume":"34","author":"Z Dai","year":"2021","unstructured":"Dai, Z., Liu, H., Le, Q.V., Tan, M.: CoAtNet: marrying convolution and attention for all data sizes. Adv. Neural. Inf. Process. Syst. 34, 3965\u20133977 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Deng, J., Yang, Z., Chen, T., Zhou, W., Li, H.: TransVG: end-to-end visual grounding with transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 1769\u20131779 (2021)","DOI":"10.1109\/ICCV48922.2021.00179"},{"key":"7_CR4","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth 16$$\\times $$16 words: transformers for image recognition at scale. In: International Conference on Learning Representations (2021)"},{"key":"7_CR5","unstructured":"Gao, S., Chen, Z., Chen, G., Wang, W., Lu, T.: Champion solution for the WSDM2023 toloka VQA challenge. arXiv preprint arXiv:2301.09045 (2023)"},{"key":"7_CR6","doi-asserted-by":"crossref","unstructured":"Gao, T., Fisch, A., Chen, D.: Making pre-trained language models better few-shot learners. In: Joint Conference of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing, ACL-IJCNLP 2021, pp. 3816\u20133830. Association for Computational Linguistics (ACL) (2021)","DOI":"10.18653\/v1\/2021.acl-long.295"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"7_CR8","doi-asserted-by":"crossref","unstructured":"Huang, S., et al.: Referring image segmentation via cross-modal progressive comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10488\u201310497 (2020)","DOI":"10.1109\/CVPR42600.2020.01050"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Jin, W., Cheng, Y., Shen, Y., Chen, W., Ren, X.: A good prompt is worth millions of parameters: low-resource prompt-based learning for vision-language models. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 2763\u20132775 (2022)","DOI":"10.18653\/v1\/2022.acl-long.197"},{"key":"7_CR10","unstructured":"Kenton, J.D.M.W.C., Toutanova, L.K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"7_CR11","unstructured":"Komleva, E.: WSDM2023 VQA. https:\/\/github.com\/EvgeniaKomleva\/WSDM2023_VQA (2023)"},{"issue":"1","key":"7_CR12","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"key":"7_CR13","doi-asserted-by":"crossref","unstructured":"Li, C., et al.: mPLUG: effective and efficient vision-language learning by cross-modal skip-connections. In: Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pp. 7241\u20137259. Association for Computational Linguistics (Dec 2022). https:\/\/aclanthology.org\/2022.emnlp-main.488","DOI":"10.18653\/v1\/2022.emnlp-main.488"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Liu, J., et al.: PolyFormer: referring image segmentation as sequential polygon generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18653\u201318663 (2023)","DOI":"10.1109\/CVPR52729.2023.01789"},{"issue":"9","key":"7_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Fu, J., Jiang, Z., Hayashi, H., Neubig, G.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"7_CR16","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Sennrich, R., Haddow, B., Birch, A.: Neural machine translation of rare words with subword units. In: Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 1715\u20131725 (2016)","DOI":"10.18653\/v1\/P16-1162"},{"key":"7_CR18","first-page":"200","volume":"34","author":"M Tsimpoukelli","year":"2021","unstructured":"Tsimpoukelli, M., Menick, J.L., Cabi, S., Eslami, S., Vinyals, O., Hill, F.: Multimodal few-shot learning with frozen language models. Adv. Neural. Inf. Process. Syst. 34, 200\u2013212 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"7_CR19","unstructured":"Ustalov, D., Pavlichenko, N., Likhobaba, D., Smirnova, A.: WSDM cup 2023 challenge on visual question answering (2023)"},{"key":"7_CR20","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in neural information processing systems, vol. 30 (2017)"},{"key":"7_CR21","unstructured":"Wang, P., et al.: OFA: unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, pp. 23318\u201323340. PMLR (2022)"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Yang, Z., Gong, B., Wang, L., Huang, W., Yu, D., Luo, J.: A fast and accurate one-stage approach to visual grounding. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4683\u20134693 (2019)","DOI":"10.1109\/ICCV.2019.00478"},{"key":"7_CR23","unstructured":"Zhang, H., Wong, K.: VQA. https:\/\/github.com\/Hyu-Zhang\/VQA (2023)"}],"container-title":["Lecture Notes in Computer Science","Advances in Knowledge Discovery and Data Mining"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2266-2_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T23:20:37Z","timestamp":1714000837000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2266-2_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819722655","9789819722662"],"references-count":23,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2266-2_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"25 April 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PAKDD","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Pacific-Asia Conference on Knowledge Discovery and Data Mining","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taipei","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Taiwan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 May 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 May 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"pakdd2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/pakdd2024.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}