{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T19:12:45Z","timestamp":1743016365863,"version":"3.40.3"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031171888"},{"type":"electronic","value":"9783031171895"}],"license":[{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T00:00:00Z","timestamp":1640995200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022]]},"DOI":"10.1007\/978-3-031-17189-5_29","type":"book-chapter","created":{"date-parts":[[2022,9,23]],"date-time":"2022-09-23T23:20:50Z","timestamp":1663975250000},"page":"328-335","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Overview of\u00a0the\u00a0NLPCC 2022 Shared Task: Multi-modal Dialogue Understanding and\u00a0Generation"],"prefix":"10.1007","author":[{"given":"Yuxuan","family":"Wang","sequence":"first","affiliation":[]},{"given":"Xueliang","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Dongyan","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,9,24]]},"reference":[{"key":"29_CR1","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s11263-016-0966-6","volume":"123","author":"A Agrawal","year":"2015","unstructured":"Agrawal, A., et al.: VQA: visual question answering. Int. J. Comput. Vis. 123, 4\u201331 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"29_CR2","doi-asserted-by":"crossref","unstructured":"AlAmri, H., et al.: Audio visual scene-aware dialog. In: 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7550\u20137559 (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"29_CR3","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: A deep siamese network for scene detection in broadcast videos. In: Proceedings of the 23rd ACM International Conference on Multimedia (2015)","DOI":"10.1145\/2733373.2806316"},{"key":"29_CR4","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1109\/TMM.2008.2008924","volume":"11","author":"V Chasanis","year":"2009","unstructured":"Chasanis, V., Likas, A.C., Galatsanos, N.P.: Scene detection in videos using shot clustering and sequence alignment. IEEE Trans. Multimedia 11, 89\u2013100 (2009)","journal-title":"IEEE Trans. Multimedia"},{"key":"29_CR5","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/0004-3702(71)90002-6","volume":"2","author":"KM Colby","year":"1971","unstructured":"Colby, K.M., Weber, S., Hilf, F.D.: Artificial paranoia. Artif. Intell. 2, 1\u201325 (1971)","journal-title":"Artif. Intell."},{"key":"29_CR6","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 326\u2013335 (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"29_CR7","doi-asserted-by":"crossref","unstructured":"Galley, M., McKeown, K., Fosler-Lussier, E., Jing, H.: Discourse segmentation of multi-party conversation. In: ACL (2003)","DOI":"10.3115\/1075096.1075167"},{"key":"29_CR8","unstructured":"Gao, H., Mao, J., Zhou, J., Huang, Z., Wang, L., Xu, W.: Are you talking to a machine? Dataset and methods for multilingual image question. In: NIPS (2015)"},{"key":"29_CR9","doi-asserted-by":"crossref","unstructured":"Gao, J., Galley, M., Li, L.: Neural approaches to conversational AI. arXiv arXiv:1809.08267 (2019)","DOI":"10.1561\/9781680835533"},{"key":"29_CR10","doi-asserted-by":"crossref","unstructured":"Han, B., Wu, W.: Video scene segmentation using a novel boundary evaluation criterion and dynamic programming. In: 2011 IEEE International Conference on Multimedia and Expo, pp. 1\u20136 (2011)","DOI":"10.1109\/ICME.2011.6012001"},{"key":"29_CR11","first-page":"33","volume":"23","author":"MA Hearst","year":"1997","unstructured":"Hearst, M.A.: Text tiling: segmenting text into multi-paragraph subtopic passages. Comput. Linguist. 23, 33\u201364 (1997)","journal-title":"Comput. Linguist."},{"key":"29_CR12","doi-asserted-by":"crossref","unstructured":"Hori, C., et al.: End-to-end audio visual scene-aware dialog using multimodal attention-based video features. In: 2019 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), ICASSP 2019, pp. 2352\u20132356. IEEE (2019)","DOI":"10.1109\/ICASSP.2019.8682583"},{"key":"29_CR13","doi-asserted-by":"crossref","unstructured":"Huang, Q., Xiong, Y., Rao, A., Wang, J., Lin, D.: MovieNet: a holistic dataset for movie understanding. arXiv arXiv:2007.10937 (2020)","DOI":"10.1007\/978-3-030-58548-8_41"},{"key":"29_CR14","doi-asserted-by":"crossref","unstructured":"Khan, O.Z., Robichaud, J.P., Crook, P.A., Sarikaya, R.: Hypotheses ranking and state tracking for a multi-domain dialog system using multiple ASR alternates. In: INTERSPEECH (2015)","DOI":"10.21437\/Interspeech.2015-459"},{"key":"29_CR15","doi-asserted-by":"crossref","unstructured":"Lavie, A., Agarwal, A.: METEOR: an automatic metric for MT evaluation with high levels of correlation with human judgments. In: Proceedings of the 2nd Workshop on Statistical Machine Translation, pp. 228\u2013231 (2007)","DOI":"10.3115\/1626355.1626389"},{"key":"29_CR16","unstructured":"Le, H., Chen, N.F., Hoi, S.: Learning reasoning paths over semantic graphs for video-grounded dialogues. In: International Conference on Learning Representations (2021)"},{"key":"29_CR17","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N., Hoi, S.C.: BiST: bi-directional spatio-temporal reasoning for video-grounded dialogues. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 1846\u20131859 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.145"},{"key":"29_CR18","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N.F., Hoi, S.C.H.: Multimodal transformer networks for end-to-end video-grounded dialogue systems. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1564"},{"key":"29_CR19","doi-asserted-by":"crossref","unstructured":"Li, J., Monroe, W., Shi, T., Jean, S., Ritter, A., Jurafsky, D.: Adversarial learning for neural dialogue generation. In: EMNLP (2017)","DOI":"10.18653\/v1\/D17-1230"},{"key":"29_CR20","doi-asserted-by":"crossref","unstructured":"Li, Z., Li, Z., Zhang, J., Feng, Y., Niu, C., Zhou, J.: Bridging text and video: a universal multimodal transformer for video-audio scene-aware dialog. arXiv preprint arXiv:2002.00163 (2020)","DOI":"10.1109\/TASLP.2021.3065823"},{"key":"29_CR21","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"29_CR22","unstructured":"Malinowski, M., Fritz, M.: A multi-world approach to question answering about real-world scenes based on uncertain input. In: NIPS (2014)"},{"key":"29_CR23","unstructured":"Meng, Y., et al.: OpenViDial: a large-scale, open-domain dialogue dataset with visual contexts. arXiv arXiv:2012.15015 (2020)"},{"key":"29_CR24","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting on Association for Computational Linguistics, pp. 311\u2013318. Association for Computational Linguistics (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"29_CR25","doi-asserted-by":"publisher","first-page":"991","DOI":"10.1007\/s11760-018-1244-6","volume":"12","author":"S Protasov","year":"2018","unstructured":"Protasov, S., Khan, A., Sozykin, K., Ahmad, M.: Using deep features for video scene detection and annotation. Sig. Image Video Process. 12, 991\u2013999 (2018)","journal-title":"Sig. Image Video Process."},{"key":"29_CR26","doi-asserted-by":"crossref","unstructured":"Rao, A., et al.: A local-to-global approach to multi-modal movie scene segmentation. In: 2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 10143\u201310152 (2020)","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"29_CR27","doi-asserted-by":"crossref","unstructured":"Rasheed, Z., Shah, M.: Scene detection in hollywood movies and tv shows. In: 2003 Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition, vol. 2, p. II-343 (2003)","DOI":"10.1109\/CVPR.2003.1211489"},{"key":"29_CR28","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1142\/S1793351X17400086","volume":"11","author":"D Rotman","year":"2017","unstructured":"Rotman, D., Porat, D., Ashour, G.: Optimal sequential grouping for robust video scene detection using multiple modalities. Int. J. Semant. Comput. 11, 193\u2013208 (2017)","journal-title":"Int. J. Semant. Comput."},{"key":"29_CR29","unstructured":"Rui, Y., Huang, T.S., Mehrotra, S.: Exploring video structure beyond the shots. In: Proceedings of the IEEE International Conference on Multimedia Computing and Systems (Cat. No.98TB100241), pp. 237\u2013240 (1998)"},{"key":"29_CR30","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1631\/FITEE.1700826","volume":"19","author":"H Shum","year":"2018","unstructured":"Shum, H., He, X., Li, D.: From Eliza to Xiaoice: challenges and opportunities with social chatbots. Front. Inf. Technol. Electron. Eng. 19, 10\u201326 (2018)","journal-title":"Front. Inf. Technol. Electron. Eng."},{"key":"29_CR31","doi-asserted-by":"publisher","first-page":"1163","DOI":"10.1109\/TCSVT.2011.2138830","volume":"21","author":"P Sidiropoulos","year":"2011","unstructured":"Sidiropoulos, P., Mezaris, V., Kompatsiaris, Y., Meinedo, H., Bugalho, M.M.F., Trancoso, I.: Temporal video segmentation to scenes using high-level audiovisual features. IEEE Trans. Circ. Syst. Video Technol. 21, 1163\u20131177 (2011)","journal-title":"IEEE Trans. Circ. Syst. Video Technol."},{"key":"29_CR32","doi-asserted-by":"crossref","unstructured":"Song, Y., Mou, L., Yan, R., Yi, L., Zhu, Z., Hu, X., Zhang, M.: Dialogue session segmentation by embedding-enhanced texttiling. arXiv arXiv:1610.03955 (2016)","DOI":"10.21437\/Interspeech.2016-1234"},{"key":"29_CR33","doi-asserted-by":"crossref","unstructured":"Sordoni, A., et al.: A neural network approach to context-sensitive generation of conversational responses. In: NAACL (2015)","DOI":"10.3115\/v1\/N15-1020"},{"key":"29_CR34","doi-asserted-by":"crossref","unstructured":"Takanobu, R., et al.: A weakly supervised method for topic segmentation and labeling in goal-oriented dialogues via reinforcement learning. In: IJCAI (2018)","DOI":"10.24963\/ijcai.2018\/612"},{"key":"29_CR35","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence Zitnick, C., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"29_CR36","unstructured":"Wang, S., Meng, Y., Li, X., Sun, X., Ouyang, R., Li, J.: OpenViDial 2.0: a larger-scale, open-domain dialogue generation dataset with visual contexts. arXiv arXiv:2109.12761 (2021)"},{"key":"29_CR37","unstructured":"Wang, S., et al.: Modeling text-visual mutual dependency for multi-modal dialog generation. arXiv arXiv:2105.14445 (2021)"},{"key":"29_CR38","doi-asserted-by":"publisher","first-page":"36","DOI":"10.1145\/365153.365168","volume":"9","author":"J Weizenbaum","year":"1966","unstructured":"Weizenbaum, J.: Eliza\u2014a computer program for the study of natural language communication between man and machine. Commun. ACM 9, 36\u201345 (1966)","journal-title":"Commun. ACM"},{"key":"29_CR39","doi-asserted-by":"crossref","unstructured":"Xing, L., Carenini, G.: Improving unsupervised dialogue topic segmentation with utterance-pair coherence scoring. In: SIGDIAL (2021)","DOI":"10.18653\/v1\/2021.sigdial-1.18"},{"key":"29_CR40","doi-asserted-by":"crossref","unstructured":"Xu, Y., Zhao, H., Zhang, Z.: Topic-aware multi-turn dialogue modeling. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i16.17668"},{"key":"29_CR41","doi-asserted-by":"crossref","unstructured":"Zhao, T., Zhao, R., Esk\u00e9nazi, M.: Learning discourse-level diversity for neural dialog models using conditional variational autoencoders. In: ACL (2017)","DOI":"10.18653\/v1\/P17-1061"},{"key":"29_CR42","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1162\/coli_a_00368","volume":"46","author":"L Zhou","year":"2020","unstructured":"Zhou, L., Gao, J., Li, D., Shum, H.: The design and implementation of XiaoIce, an empathetic social chatbot. Comput. Linguist. 46, 53\u201393 (2020)","journal-title":"Comput. Linguist."},{"key":"29_CR43","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M.S., Fei-Fei, L.: Visual7W: grounded question answering in images. In: 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-17189-5_29","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,4]],"date-time":"2024-10-04T14:59:15Z","timestamp":1728053955000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-17189-5_29"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022]]},"ISBN":["9783031171888","9783031171895"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-17189-5_29","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2022]]},"assertion":[{"value":"24 September 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Guilin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2022","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 September 2022","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"25 September 2022","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2022","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2022\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Softconf","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"327","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"73","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"22% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1.5","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}