{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T11:04:16Z","timestamp":1742987056377,"version":"3.40.3"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031533075"},{"type":"electronic","value":"9783031533082"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-53308-2_30","type":"book-chapter","created":{"date-parts":[[2024,1,27]],"date-time":"2024-01-27T21:37:36Z","timestamp":1706391456000},"page":"409-422","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Coarse and\u00a0Fine Grained Masking Approach for\u00a0Video-Grounded Dialogue"],"prefix":"10.1007","author":[{"given":"Feifei","family":"Xu","sequence":"first","affiliation":[]},{"given":"Wang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Jiahao","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Ziheng","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Guangzhen","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,28]]},"reference":[{"key":"30_CR1","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: CVPR, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"30_CR2","doi-asserted-by":"crossref","unstructured":"Lei, J., Yu, L., Bansal, M., Berg, T.L.: Tvqa: localized, compositional video question answering. arXiv preprint arXiv:1809.01696 (2018)","DOI":"10.18653\/v1\/D18-1167"},{"key":"30_CR3","doi-asserted-by":"crossref","unstructured":"Alamri, H., et al.: Audio visual scene-aware dialog. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7558\u20137567 (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"30_CR4","doi-asserted-by":"crossref","unstructured":"Hori, C., et al.: End-to-end audio visual scene-aware dialog using multimodal attention-based video features. In: ICASSP 2019\u20132019 IEEE ICASSP, pp. 2352\u20132356. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682583"},{"key":"30_CR5","unstructured":"Nguyen, D.T., Sharma, S., Schulz, H., Asri, L.E., et al.: From film to video: Multi-turn question answering with multi-modal context. arXiv preprint arXiv:1812.07023, 2018"},{"key":"30_CR6","unstructured":"Sanabria, R., Palaskar, S., Metze, F.: Cmu sinbad\u2019s submission for the dstc7 avsd challenge. In: DSTC7 at AAAI2019 workshop, vol. 6 (2019)"},{"key":"30_CR7","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N.F., Hoi, S.C., et al.: Multimodal transformer networks for end-to-end video-grounded dialogue systems. arXiv preprint arXiv:1907.01166, 2019","DOI":"10.18653\/v1\/P19-1564"},{"key":"30_CR8","doi-asserted-by":"crossref","unstructured":"JKim, J., Yoon, S., Kim, D., Yoo, C.D.: Structured co-reference graph attention for video-grounded dialogue. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 1789\u20131797 (2021)","DOI":"10.1609\/aaai.v35i2.16273"},{"key":"30_CR9","doi-asserted-by":"crossref","unstructured":"Le, H., Chen, N., Hoi, S., et al.: Vgnmn: Video-grounded neural module networks for video-grounded dialogue systems. In: Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pp. 3377\u20133393 (2022)","DOI":"10.18653\/v1\/2022.naacl-main.247"},{"issue":"8","key":"30_CR10","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Jeffrey, W., Child, R., Luan, D., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"30_CR11","doi-asserted-by":"publisher","first-page":"2476","DOI":"10.1109\/TASLP.2021.3065823","volume":"29","author":"Z Li","year":"2021","unstructured":"Li, Z., Li, Z., Zhang, J., et al.: Bridging text and video: a universal multimodal transformer for audio-visual scene-aware dialog. IEEE\/ACM Trans. Audio, Speech, Lang. Process. 29, 2476\u20132483 (2021)","journal-title":"IEEE\/ACM Trans. Audio, Speech, Lang. Process."},{"key":"30_CR12","doi-asserted-by":"crossref","unstructured":"Alamri, H., Hori, C., Marks, T.K., Batra, D., Parikh, D.: Audio visual scene-aware dialog (avsd) track for natural language generation in dstc7. In: DSTC7 at AAAI2019 Workshop, vol. 2 (2018)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"30_CR13","doi-asserted-by":"crossref","unstructured":"Ji, Z., et al.: Survey of hallucination in natural language generation. ACM Comput. Surv. 55(12), 1\u201338 (2023)","DOI":"10.1145\/3571730"},{"key":"30_CR14","unstructured":"Biderman, S., et al.: Pythia: a suite for analyzing large language models across training and scaling. In: International Conference on Machine Learning, pp. 2397\u20132430. PMLR (2023)"},{"issue":"9","key":"30_CR15","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3560815","volume":"55","author":"P Liu","year":"2023","unstructured":"Liu, P., Yuan, W., Jinlan, F., Jiang, Z., et al.: Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing. ACM Comput. Surv. 55(9), 1\u201335 (2023)","journal-title":"ACM Comput. Surv."},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Le, H., Hoi, S.C.: Video-grounded dialogues with pretrained generation language models. arXiv preprint arXiv:2006.15319 (2020)","DOI":"10.18653\/v1\/2020.acl-main.518"},{"key":"30_CR17","unstructured":"Wu, Y., et al.: Google\u2019s neural machine translation system: bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016)"},{"key":"30_CR18","unstructured":"Wang, H., et al.: Foundation transformers. arXiv preprint arXiv:2210.06423 (2022)"},{"key":"30_CR19","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., et al.: Language models are few-shot learners. Adv. Neural. Inf. Process. Syst. 33, 1877\u20131901 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"30_CR20","unstructured":"Shazeer, N.: Glu variants improve transformer. arXiv preprint arXiv:2002.05202 (2020)"},{"issue":"240","key":"30_CR21","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., Narang, S., et al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"30_CR22","doi-asserted-by":"crossref","unstructured":"Touvron, H., Cord, M., Sablayrolles, A., Synnaeve, G., J\u00e9gou, H.: Going deeper with image transformers. In: Proceedings of the IEEE\/CVF International Conference On Computer Vision, pp. 32\u201342 (2021)","DOI":"10.1109\/ICCV48922.2021.00010"},{"key":"30_CR23","unstructured":"Lee, H., Yoon, S., Dernoncourt, F., Kim, D.S., Bui, T., Jung, K.: Dstc8-avsd: multimodal semantic transformer network with retrieval style word generator. arXiv preprint arXiv:2004.08299 (2020)"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Lee, H., Yoon, S., Dernoncourt, F., Kim, D.S., Bui, T., Jung, K.: Bist: Bi-directional spatio-temporal reasoning for video-grounded dialogues. arXiv preprint arXiv:2010.10095 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.145"},{"key":"30_CR25","doi-asserted-by":"crossref","unstructured":"Shah, A., et al.: Audio-visual scene-aware dialog and reasoning using audio-visual transformers with joint student-teacher learning. In: ICASSP 2022\u20132022 IEEE ICASSP, pages 7732\u20137736. IEEE (2022)","DOI":"10.1109\/ICASSP43922.2022.9746481"},{"key":"30_CR26","unstructured":"Chu, Y.W., Lin, K.Y., Hsu, C.C., Ku, L.W., et al.: Multi-step joint-modality attention network for scene-aware dialogue system. arXiv preprint arXiv:2001.06206 (2020)"},{"key":"30_CR27","unstructured":"Xie, H., Iacobacci, I.: Audio visual scene-aware dialog system using dynamic memory networks. In: DSTC8 at AAAI2020 workshop (2020)"},{"key":"30_CR28","unstructured":"Geng, S., Gao, P., Marks, T., Hori, C., Cherian, A.: Spatio-temporal scene graph reasoning for audio visual scene-aware dialog at dstc8. In: DSTC8 at AAAI2020 workshop (2020)"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: Proceedings of the 40th annual meeting of the ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"30_CR30","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Lavie, A.: Meteor universal: Language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"30_CR31","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: Cider: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"30_CR33","unstructured":"Wolf, T., et al.: Transformers: state-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 38\u201345 (2020)"},{"key":"30_CR34","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-53308-2_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T11:12:24Z","timestamp":1710241944000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-53308-2_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031533075","9783031533082"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-53308-2_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 February 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ConfTool Pro","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"112","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}