{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,19]],"date-time":"2026-03-19T23:40:05Z","timestamp":1773963605057,"version":"3.50.1"},"publisher-location":"Singapore","reference-count":39,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819720910","type":"print"},{"value":"9789819720927","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-97-2092-7_16","type":"book-chapter","created":{"date-parts":[[2024,3,29]],"date-time":"2024-03-29T06:02:18Z","timestamp":1711692138000},"page":"316-331","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Object Category-Based Visual Dialog for\u00a0Effective Question Generation"],"prefix":"10.1007","author":[{"given":"Feifei","family":"Xu","sequence":"first","affiliation":[]},{"given":"Yingchen","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Zheng","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Guangzhen","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,3,30]]},"reference":[{"key":"16_CR1","unstructured":"Abbasnejad, E., Wu, Q., Abbasnejad, I., Shi, J.Q., van\u00a0den Hengel, A.: An active information seeking model for goal-oriented vision-and-language tasks. ArXiv abs\/1812.06398 (2018)"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Abbasnejad, E., Wu, Q., Shi, J.Q., van\u00a0den Hengel, A.: What\u2019s to know? Uncertainty as a guide to asking goal-oriented questions. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4150\u20134159 (2018)","DOI":"10.1109\/CVPR.2019.00428"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Agarwal, S., Bui, T., Lee, J.Y., Konstas, I., Rieser, V.: History for visual dialog: do we really need it? In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 8182\u20138197 (2020)","DOI":"10.18653\/v1\/2020.acl-main.728"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Vision-and-language navigation: interpreting visually-grounded navigation instructions in real environments. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp. 3674\u20133683 (2018)","DOI":"10.1109\/CVPR.2018.00387"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"Bani, G., et al.: Adding object detection skills to visual dialogue agents. In: ECCV Workshops (2018)","DOI":"10.1007\/978-3-030-11018-5_17"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Chattopadhyay, P., et al.: Evaluating visual conversational agents via cooperative human-AI games. In: Proceedings of the AAAI Conference on Human Computation and Crowdsourcing, vol.\u00a05, pp. 2\u201310 (2017)","DOI":"10.1609\/hcomp.v5i1.13312"},{"key":"16_CR7","doi-asserted-by":"crossref","unstructured":"Chen, C., Tan, Z., Cheng, Q., Jiang, X., Liu, Q., Zhu, Y., Gu, X.: UTC: a unified transformer with inter-task contrastive learning for visual dialog. In: Proceedings of the IEEE\/CVF Conference on computer vision and pattern recognition, pp. 18103\u201318112 (2022)","DOI":"10.1109\/CVPR52688.2022.01757"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Chen, S., Guhur, P.L., Tapaswi, M., Schmid, C., Laptev, I.: Think global, act local: dual-scale graph transformer for vision-and-language navigation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16537\u201316547 (2022)","DOI":"10.1109\/CVPR52688.2022.01604"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"16_CR10","doi-asserted-by":"crossref","unstructured":"Das, A., Kottur, S., Moura, J.M.F., Lee, S., Batra, D.: Learning cooperative visual dialog agents with deep reinforcement learning. In: 2017 IEEE International Conference on Computer Vision (ICCV), pp. 2970\u20132979 (2017)","DOI":"10.1109\/ICCV.2017.321"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"De\u00a0Vries, H., Strub, F., Chandar, S., Pietquin, O., Larochelle, H., Courville, A.: Guesswhat?! visual object discovery through multi-modal dialogue. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5503\u20135512 (2017)","DOI":"10.1109\/CVPR.2017.475"},{"key":"16_CR12","doi-asserted-by":"publisher","first-page":"2451","DOI":"10.1162\/089976600300015015","volume":"12","author":"FA Gers","year":"2000","unstructured":"Gers, F.A., Schmidhuber, J., Cummins, F.: Learning to forget: continual prediction with LSTM. Neural Comput. 12, 2451\u20132471 (2000)","journal-title":"Neural Comput."},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Guo, D., Wang, H., Zhang, H., Zha, Z.J., Wang, M.: Iterative context-aware graph inference for visual dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10055\u201310064 (2020)","DOI":"10.1109\/CVPR42600.2020.01007"},{"key":"16_CR14","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. CoRR abs\/1412.6980 (2014)"},{"key":"16_CR15","unstructured":"Lee, S.W., Heo, Y.J., Zhang, B.T.: Answerer in questioner\u2019s mind: information theoretic approach to goal-oriented visual dialog. In: Neural Information Processing Systems (2018)"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Oshima, R., Shinagawa, S., Tsunashima, H., Feng, Q., Morishima, S.: Pointing out human answer mistakes in a goal-oriented visual dialogue. arXiv preprint arXiv:2309.10375 (2023)","DOI":"10.1109\/ICCVW60793.2023.00503"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Pang, W., Wang, X.: Visual dialogue state tracking for question generation. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a034, pp. 11831\u201311838 (2020)","DOI":"10.1609\/aaai.v34i07.6856"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Pashevich, A., Schmid, C., Sun, C.: Episodic transformer for vision-and-language navigation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15942\u201315952 (2021)","DOI":"10.1109\/ICCV48922.2021.01564"},{"key":"16_CR19","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"16_CR20","unstructured":"Sang-Woo, L., Tong, G., Sohee, Y., Jaejun, Y., Jung-Woo, H.: Large-scale answerer in questioner\u2019s mind for visual dialog question generation. In: Proceedings of International Conference on Learning Representations. ICLR (2019)"},{"key":"16_CR21","unstructured":"Serban, I., Sordoni, A., Bengio, Y., Courville, A.C., Pineau, J.: Hierarchical neural network generative models for movie dialogues. ArXiv abs\/1507.04808 (2015)"},{"key":"16_CR22","unstructured":"Shekhar, R., Baumg\u00e4rtner, T., Venkatesh, A., Bruni, E., Bernardi, R., Fern\u00e1ndez, R.: Ask no more: deciding when to guess in referential visual dialogue. In: Proceedings of the 27th International Conference on Computational Linguistics, pp. 1218\u20131233 (2019)"},{"key":"16_CR23","doi-asserted-by":"crossref","unstructured":"Shekhar, R., et al.: Beyond task success: a closer look at jointly learning to see, ask, and guessWhat. In: North American Chapter of the Association for Computational Linguistics (2018)","DOI":"10.18653\/v1\/N19-1265"},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Shekhar, R., et al.: Beyond task success: a closer look at jointly learning to see, ask, and guessWhat. In: Proceedings of NAACL-HLT, pp. 2578\u20132587 (2019)","DOI":"10.18653\/v1\/N19-1265"},{"key":"16_CR25","doi-asserted-by":"crossref","unstructured":"Shukla, P., Elmadjian, C., Sharan, R., Kulkarni, V., Turk, M., Wang, W.Y.: What should i ask? Using conversationally informative rewards for goal-oriented visual dialog. In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 6442\u20136451 (2020)","DOI":"10.18653\/v1\/P19-1646"},{"key":"16_CR26","doi-asserted-by":"crossref","unstructured":"Sicilia, A., Alikhani, M.: Learning to generate equitable text in dialogue from biased training data. arXiv preprint arXiv:2307.04303 (2023)","DOI":"10.18653\/v1\/2023.acl-long.163"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Strub, F., de\u00a0Vries, H., Mary, J., Piot, B., Courville, A., Pietquin, O.: End-to-end optimization of goal-driven and visually grounded dialogue systems. In: International Joint Conference on Artificial Intelligence (2017)","DOI":"10.24963\/ijcai.2017\/385"},{"key":"16_CR28","doi-asserted-by":"crossref","unstructured":"Testoni, A., Bernardi, R.: Looking for confirmations: an effective and human-like visual dialogue strategy. In: Conference on Empirical Methods in Natural Language Processing (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.736"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Testoni, A., Bernardi, R.: Garbage in, flowers out: noisy training data help generative models at test time. IJCoL. Italian J. Comput. Linguist. 8, 8\u20131 (2022)","DOI":"10.4000\/ijcol.974"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Tu, T., Ping, Q., Thattai, G., Tur, G., Natarajan, P.: Learning better visual dialog agents with pretrained visual-linguistic representation. In: 2021 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5618\u20135627 (2021)","DOI":"10.1109\/CVPR46437.2021.00557"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, J., Sun, Y.: End-to-end transformer based model for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a036, pp. 2585\u20132594 (2022)","DOI":"10.1609\/aaai.v36i3.20160"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Y., Joty, S., Lyu, M., King, I., Xiong, C., Hoi, S.C.: VD-BERT: A unified vision and dialog transformer with BERT. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 3325\u20133338 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.269"},{"key":"16_CR33","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057. PMLR (2015)"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Xu, Z., Feng, F., Wang, X., Yang, Y., Jiang, H., Ouyang, Z.: Answer-driven visual state estimator for goal-oriented visual dialogue. In: Proceedings of the 28th ACM International Conference on Multimedia (2020)","DOI":"10.1145\/3394171.3413668"},{"key":"16_CR35","unstructured":"Yanan, S., Yanxin, T., Fangxiang, F., Chunping, Z., Xiaojie, W.: Category-based strategy-driven question generator for visual dialogue. In: Proceedings of the 20th Chinese National Conference on Computational Linguistics, pp. 1000\u20131011 (2022)"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"Yuan, Z., et al.: X-trans2cap: cross-modal knowledge transfer using transformer for 3D dense captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8563\u20138573 (2022)","DOI":"10.1109\/CVPR52688.2022.00837"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Zhang, J., Wu, Q., Shen, C., Zhang, J., Lu, J., van\u00a0den Hengel, A.: Asking the difficult questions: goal-oriented visual question generation via intermediate rewards. In: European Conference on Computer Vision (2017)","DOI":"10.1007\/978-3-030-01228-1_12"},{"key":"16_CR38","unstructured":"Zhao, R., Tresp, V.: Improving goal-oriented visual dialog agents via advanced recurrent nets with tempered policy gradient. In: LaCATODA@ IJCAI, pp.\u00a01\u20137 (2018)"},{"key":"16_CR39","doi-asserted-by":"crossref","unstructured":"Zheng, D., Xu, Z., Meng, F., Wang, X., Wang, J., Zhou, J.: Enhancing visual dialog questioner with entity-based strategy learning and augmented guesser. In: Findings of the Association for Computational Linguistics: EMNLP 2021, pp. 1839\u20131851 (2021)","DOI":"10.18653\/v1\/2021.findings-emnlp.158"}],"container-title":["Lecture Notes in Computer Science","Computational Visual Media"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-2092-7_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,29]],"date-time":"2024-03-29T06:07:08Z","timestamp":1711692428000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-2092-7_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9789819720910","9789819720927"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-2092-7_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"30 March 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CVM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Computational Visual Media","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Wellington","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"New Zealand","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"10 April 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"12 April 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cvm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CVM submission system","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"212","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"34","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"16% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}