{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,11]],"date-time":"2025-10-11T08:28:38Z","timestamp":1760171318674,"version":"3.40.3"},"publisher-location":"Cham","reference-count":71,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729973"},{"type":"electronic","value":"9783031729980"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72998-0_20","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T18:01:58Z","timestamp":1727632918000},"page":"348-365","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Multi-modal Video Dialog State Tracking in\u00a0the\u00a0Wild"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9489-6340","authenticated-orcid":false,"given":"Adnen","family":"Abdessaied","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1628-1559","authenticated-orcid":false,"given":"Lei","family":"Shi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6317-7303","authenticated-orcid":false,"given":"Andreas","family":"Bulling","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"20_CR1","unstructured":"Abdessaied, A., Hochmeister, M., Bulling, A.: OLViT: multi-modal state tracking via attention-based embeddings for video-grounded dialog. In: LREC-COLING (2024)"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Abdessaied, A., Shi, L., Bulling, A.: VD-GR: boosting visual dialog with cascaded spatial-temporal multi-modal graphs. In: WACV (2024)","DOI":"10.1109\/WACV57701.2024.00570"},{"key":"20_CR3","unstructured":"Alamri, H., Bilic, A., Hu, M., Beedu, A., Essa, I.: End-to-end multimodal representation learning for video dialog. In: NeurIPS (2022)"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Alamri, H., et\u00a0al.: Audio visual scene-aware dialog. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00774"},{"key":"20_CR5","unstructured":"Alayrac, J.B., et al.: Flamingo: a visual language model for few-shot learning. In: NeurIPS (2022)"},{"key":"20_CR6","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Deep compositional question answering with neural module networks. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.12"},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Learning to compose neural networks for question answering. In: NAACL (2016)","DOI":"10.18653\/v1\/N16-1181"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Antol, S., et al.: VQA: visual Question Answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"20_CR9","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for mt evaluation with improved correlation with human judgments. In: ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization (2005)"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.502"},{"key":"20_CR11","unstructured":"Chen, Y., Wu, L., Zaki, M.: Iterative deep graph learning for graph neural networks: better and robust node embeddings. In: NeurIPS (2020)"},{"key":"20_CR12","doi-asserted-by":"publisher","first-page":"753","DOI":"10.1109\/TASLP.2023.3284511","volume":"32","author":"Z Chen","year":"2023","unstructured":"Chen, Z., Liu, H., Wang, Y.: DialogMCF: multimodal context flow for audio visual scene-aware dialog. IEEE\/ACM Trans. Audio, Speech. Lang. Process. 32, 753\u2013764 (2023)","journal-title":"IEEE\/ACM Trans. Audio, Speech. Lang. Process."},{"key":"20_CR13","unstructured":"Chu, Y.W., Lin, K.Y., Hsu, C.C., Ku, L.W.: Multi-step joint-modality attention network for scene-aware dialogue system. In: DSTC Workshop @ AAAI (2020)"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Colson, B., Marcotte, P., Savard, G.: An overview of bilevel optimization. annals of operations research (2007)","DOI":"10.1007\/s10479-007-0176-2"},{"key":"20_CR15","doi-asserted-by":"crossref","unstructured":"Das, A., et al.: Visual dialog. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.121"},{"key":"20_CR16","unstructured":"Elinas, P., Bonilla, E.V., Tiao, L.: Variational inference for graph convolutional networks in the absence of graph data and adversarial settings. In: NeurIPS (2020)"},{"key":"20_CR17","unstructured":"Franceschi, L., Niepert, M., Pontil, M., He, X.: Learning discrete structures for graph neural networks. In: ICML (2019)"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Gasteiger, J., Bojchevski, A., G\u00fcnnemann, S.: Predict then propagate: graph neural networks meet personalized pagerank. In: ICLR (2019)","DOI":"10.1145\/3394486.3403296"},{"key":"20_CR19","unstructured":"Girdhar, R., Ramanan, D.: CATER: a diagnostic dataset for compositional actions and temporal reasoning. In: ICLR (2020)"},{"key":"20_CR20","unstructured":"Guo, X., Wu, H., Cheng, Y., Rennie, S., Tesauro, G., Feris, R.: Dialog-based interactive image retrieval. NeurIPS 31 (2018)"},{"key":"20_CR21","doi-asserted-by":"crossref","unstructured":"Hori, C., et al.: End-to-end audio visual scene-aware dialog using multimodal attention-based video features. In: ICASSP (2019)","DOI":"10.1109\/ICASSP.2019.8682583"},{"key":"20_CR22","unstructured":"Huang, X., et al.: Investigation on transformer-based multi-modal fusion for audio-visual scene-aware dialog. In: DSTC10 Workshop @ AAAI (2022)"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Jiang, P., Han, Y.: Reasoning with heterogeneous graph alignment for video question answering. In: Proceedings of the AAAI Conference on Artificial Intelligence (2020)","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"20_CR24","doi-asserted-by":"crossref","unstructured":"Jin, Y., Niu, G., Xiao, X., Zhang, J., Peng, X., Yu, J.: Knowledge-constrained answer generation for open-ended video question answering. In: AAAI (2023)","DOI":"10.1609\/aaai.v37i7.25983"},{"key":"20_CR25","unstructured":"Kay, W., et\u00a0al.: the kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Kim, J., Yoon, S., Kim, D., Yoo, C.D.: Structured co-reference graph attention for video-grounded dialogue. In: AAAI (2021)","DOI":"10.1609\/aaai.v35i2.16273"},{"key":"20_CR27","unstructured":"Kim, S., et\u00a0al.: The eighth dialog system technology challenge . arXiv preprint arXiv:1911.06394 (2019)"},{"key":"20_CR28","unstructured":"Kirillov, A., et al.: Segment anything . arXiv preprint arXiv:2304.02643 (2023)"},{"key":"20_CR29","doi-asserted-by":"crossref","unstructured":"Kottur, S., Moon, S., Geramifard, A., Damavandi, B.: SIMMC 2.0: a task-oriented dialog dataset for immersive multimodal conversations. In: EMNLP (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.401"},{"key":"20_CR30","unstructured":"Le, H., Chen, N.F., Hoi, S.: Learning reasoning paths over semantic graphs for video-grounded dialogues. In: ICLR (2021)"},{"key":"20_CR31","doi-asserted-by":"crossref","unstructured":"Le, H., Chen, N.F., Hoi, S.C.H.: VGNMN: video-grounded neural module network to video-grounded language tasks. In: NAACL (2022)","DOI":"10.18653\/v1\/2022.naacl-main.247"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Le, H., Chen, N.F., Hoi, S.C.: Multimodal dialogue state tracking. In: NAACL (2022)","DOI":"10.18653\/v1\/2022.naacl-main.248"},{"key":"20_CR33","doi-asserted-by":"crossref","unstructured":"Le, H., Hoi, S.C.: Video-grounded dialogues with pretrained generation language models. In: ACL (2020)","DOI":"10.18653\/v1\/2020.acl-main.518"},{"key":"20_CR34","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N., Hoi, S.: Multimodal transformer networks for end-to-end video-grounded dialogue systems. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1564"},{"key":"20_CR35","doi-asserted-by":"crossref","unstructured":"Le, H., Sahoo, D., Chen, N., Hoi, S.C.: BiST: bi-directional spatio-temporal reasoning for video-grounded dialogues. In: EMNLP (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.145"},{"key":"20_CR36","doi-asserted-by":"crossref","unstructured":"Le, H., Sankar, C., Moon, S., Beirami, A., Geramifard, A., Kottur, S.: DVD: a diagnostic dataset for multi-step reasoning in video grounded dialogue. In: ACL (2021)","DOI":"10.18653\/v1\/2021.acl-long.439"},{"key":"20_CR37","unstructured":"Le, H., Socher, R., Hoi, S.C.: Non-autoregressive dialog state tracking. In: ICLR (2020)"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Le, T.M., Le, V., Venkatesh, S., Tran, T.: Hierarchical conditional relation networks for video question answering. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Lee, H., et al.: Learning to embed multi-modal contexts for situated conversational agents. In: NAACL-Findings (2022)","DOI":"10.18653\/v1\/2022.findings-naacl.61"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Lee, H., Lee, J., Kim, T.Y.: SUMBT: slot-utterance matching for universal and scalable belief tracking. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1546"},{"key":"20_CR41","doi-asserted-by":"crossref","unstructured":"Lewis, M., et al..: BART: denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension. In: ACL (2020)","DOI":"10.18653\/v1\/2020.acl-main.703"},{"key":"20_CR42","doi-asserted-by":"crossref","unstructured":"Li, Y., Hui, B., Yin, Z., Yang, M., Huang, F., Li, Y.: PaCE: unified multi-modal dialogue pre-training with progressive and compositional experts. In: ACL (2023)","DOI":"10.18653\/v1\/2023.acl-long.749"},{"key":"20_CR43","doi-asserted-by":"publisher","first-page":"2476","DOI":"10.1109\/TASLP.2021.3065823","volume":"29","author":"Z Li","year":"2021","unstructured":"Li, Z., Li, Z., Zhang, J., Feng, Y., Zhou, J.: Bridging text and video: a universal multimodal transformer for audio-visual scene-aware dialog. Trans. Audio Speech Lang. Process 29, 2476\u20132483 (2021)","journal-title":"Trans. Audio Speech Lang. Process"},{"key":"20_CR44","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: Text Summarization Branches Out (2004)"},{"key":"20_CR45","unstructured":"Malinowski, M., Fritz, M.: A Multi-world approach to question answering about real-world scenes based on uncertain input. In: NeurIPS (2014)"},{"key":"20_CR46","doi-asserted-by":"crossref","unstructured":"Moon, S., et al.: Situated and interactive multimodal conversations. In: COLING (2020)","DOI":"10.18653\/v1\/2020.coling-main.96"},{"key":"20_CR47","unstructured":"Mou, X., Sigouin, B., Steenstra, I., Su, H.: Multimodal dialogue state tracking by QA approach with data augmentation. In: DSTC8 Workshop @ AAAI (2020)"},{"key":"20_CR48","doi-asserted-by":"crossref","unstructured":"Mrk\u0161i\u0107, N., \u00d3\u00a0S\u00e9aghdha, D., Wen, T.H., Thomson, B., Young, S.: Neural belief tracker: data-driven dialogue state tracking. In: ACL (2017)","DOI":"10.18653\/v1\/P17-1163"},{"key":"20_CR49","doi-asserted-by":"crossref","unstructured":"Pang, W., Wang, X.: Visual dialogue state tracking for question generation. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6856"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"20_CR51","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"710","DOI":"10.1007\/978-3-031-19842-7_41","volume-title":"ECCV 2022","author":"HA Pham","year":"2022","unstructured":"Pham, H.A., Le, T.M., Le, V., Phuong, T.M., Tran, T.: Video dialog as conversation about objects living in space-time. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13699, pp. 710\u2013726. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19842-7_41"},{"key":"20_CR52","unstructured":"Radford, A., et\u00a0al.: Language models are unsupervised multitask learners. OpenAI blog (2019)"},{"key":"20_CR53","unstructured":"Rezende, D.J., Mohamed, S., Wierstra, D.: Stochastic backpropagation and approximate inference in deep generative models. In: ICML (2014)"},{"key":"20_CR54","doi-asserted-by":"crossref","unstructured":"Shah, A., et al.: Audio-visual scene-aware dialog and reasoning using audio-visual transformers with joint student-teacher learning. In: ICASSP (2022)","DOI":"10.1109\/ICASSP43922.2022.9746481"},{"key":"20_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"510","DOI":"10.1007\/978-3-319-46448-0_31","volume-title":"Computer Vision \u2013 ECCV 2016","author":"GA Sigurdsson","year":"2016","unstructured":"Sigurdsson, G.A., Varol, G., Wang, X., Farhadi, A., Laptev, I., Gupta, A.: Hollywood in homes: crowdsourcing data collection for activity understanding. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9905, pp. 510\u2013526. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46448-0_31"},{"key":"20_CR56","doi-asserted-by":"crossref","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)","DOI":"10.1109\/ICCV.2015.314"},{"key":"20_CR57","unstructured":"Sun, Q., et al.: Generative Pretraining in Multimodality . arXiv preprint arXiv:2307.05222 (2023)"},{"key":"20_CR58","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"20_CR59","doi-asserted-by":"crossref","unstructured":"Wu, C.S., Madotto, A., Hosseini-Asl, E., Xiong, C., Socher, R., Fung, P.: Transferable multi-domain state generator for task-oriented dialogue systems. In: ACL (2019)","DOI":"10.18653\/v1\/P19-1078"},{"key":"20_CR60","unstructured":"Wu, Q., Zhao, W., Li, Z., Wipf, D., Yan, J.: Nodeformer: a scalable graph structure learning transformer for node classification. In: NeurIPS (2022)"},{"key":"20_CR61","doi-asserted-by":"crossref","unstructured":"Wu, Y., Macdonald, C., Ounis, I.: Multi-modal dialog state tracking for interactive fashion recommendation. In: ACM RecSys (2022)","DOI":"10.1145\/3523227.3546774"},{"key":"20_CR62","doi-asserted-by":"crossref","unstructured":"Xiao, J., Shang, X., Yao, A., Chua, T.: Next-QA: next phase of question-answering to explaining temporal actions. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"20_CR63","doi-asserted-by":"crossref","unstructured":"Xu, D., et al.: Video question answering via gradually refined attention over appearance and motion. In: ACM MM (2017)","DOI":"10.1145\/3123266.3123427"},{"key":"20_CR64","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"20_CR65","doi-asserted-by":"crossref","unstructured":"Xu, P., Hu, Q.: An end-to-end approach for handling unknown slot values in dialogue state tracking. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1134"},{"key":"20_CR66","unstructured":"Yang, J., et al.: GraphFormers: gnn-nested transformers for representation learning on textual graph. In: NeurIPS (2021)"},{"key":"20_CR67","unstructured":"Ying, C., et al.: Do transformers really perform badly for graph representation? In: NeurIPS (2021)"},{"key":"20_CR68","doi-asserted-by":"crossref","unstructured":"Yoon, S., Yoon, E., Yoon, H.S., Kim, J., Yoo, C.: Information-theoretic text hallucination reduction for video-grounded dialogue. In: EMNLP (2022)","DOI":"10.18653\/v1\/2022.emnlp-main.280"},{"key":"20_CR69","unstructured":"Yoshino, K., et\u00a0al.: Dialog system technology challenge 7. arXiv preprint arXiv:1901.03461 (2019)"},{"key":"20_CR70","unstructured":"Yu, Y., Chen, J., Gao, T., Yu, M.: DAG-GNN: DAG structure learning with graph neural networks. In: ICML (2019)"},{"key":"20_CR71","unstructured":"Zhang, H., Liu, M., Wang, Y., Cao, D., Guan, W., Nie, L.: Uncovering hidden connections: iterative tracking and reasoning for video-grounded dialog. IEEE Trans. Pattern Anal. Mach. Intell. (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72998-0_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:29:16Z","timestamp":1732829356000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72998-0_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031729973","9783031729980"],"references-count":71,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72998-0_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}