{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:04:49Z","timestamp":1765343089465,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755633","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"4837-4845","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Seeing Through Ambiguity: Effective Video-guided Machine Translation via Chaotic Fusion and Causally Aligned Spatio-temporal Attention"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-9643-3643","authenticated-orcid":false,"given":"Jiawei","family":"Zheng","sequence":"first","affiliation":[{"name":"School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-8277-6380","authenticated-orcid":false,"given":"Feiyan","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Informatics, Xiamen University, Xiamen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8677-9080","authenticated-orcid":false,"given":"Xiaoli","family":"Wang","sequence":"additional","affiliation":[{"name":"Super Intelligent Medical Innovation Research Center, School of Informatics, Xiamen University, Xiamen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Uri M Ascher and Linda R Petzold. 1998. Computer methods for ordinary differential equations and differential-algebraic equations. SIAM.","DOI":"10.1137\/1.9781611971392"},{"key":"e_1_3_2_1_2_1","volume-title":"Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:1308.3432","author":"Bengio Yoshua","year":"2013","unstructured":"Yoshua Bengio, Nicholas L\u00e9onard, and Aaron Courville. 2013. Estimating or propagating gradients through stochastic neurons for conditional computation. arXiv preprint arXiv:1308.3432 (2013)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.112"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1175"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.608"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.390"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.329"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.295"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-srw.9"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 31st International Conference on Computational Linguistics. 8215-8231","author":"Guan Boyu","year":"2025","unstructured":"Boyu Guan, Yining Zhang, Yang Zhao, and Chengqing Zong. 2025. TriFine: A Large-Scale Dataset of Vision-Audio-Subtitle for Tri-Modal Machine Translation and Benchmark with Fine-Grained Annotated Tags. In Proceedings of the 31st International Conference on Computational Linguistics. 8215-8231."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.173"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00269"},{"key":"e_1_3_2_1_16_1","volume-title":"Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415","author":"Hendrycks Dan","year":"2016","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Gaussian error linear units (gelus). arXiv preprint arXiv:1606.08415 (2016)."},{"key":"e_1_3_2_1_17_1","volume-title":"Proceedings of the First Workshop on Advances in Language and Vision Research (ALVR).","author":"Hirasawa Tosho","year":"2020","unstructured":"Tosho Hirasawa, Zhishen Yang, Mamoru Komachi, and Naoaki Okazaki. 2020. Keyframe Segmentation and Positional Encoding for Video-guided Machine Translation Challenge 2020. In Proceedings of the First Workshop on Advances in Language and Vision Research (ALVR)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.731"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-2360"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1653"},{"key":"e_1_3_2_1_21_1","volume-title":"Categorical Reparameterization with Gumbel-Softmax. In International Conference on Learning Representations.","author":"Jang Eric","year":"2017","unstructured":"Eric Jang, Shixiang Gu, and Ben Poole. 2017. Categorical Reparameterization with Gumbel-Softmax. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.findings-acl.535"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.438"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20865-2_12"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR51284.2021.00050"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3181116"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00515"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.260"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the Thirteenth Language Resources and Evaluation Conference. 6735-6743","author":"Li Yihang","year":"2022","unstructured":"Yihang Li, Shuichiro Shimizu, Weiqi Gu, Chenhui Chu, and Sadao Kurohashi. 2022 e. VISA: An Ambiguous Subtitles Dataset for Visual Scene-aware Machine Translation. In Proceedings of the Thirteenth Language Resources and Evaluation Conference. 6735-6743."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413715"},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311-318."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.213"},{"key":"e_1_3_2_1_33_1","unstructured":"Ramon Sanabria Ozan Caglayan Shruti Palaskar Desmond Elliott Lo\u00efc Barrault Lucia Specia and Florian Metze. 2018. How2: A Large-scale Dataset for Multimodal Language Understanding. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. 15888-15898","author":"Shurtz Ammon","year":"2024","unstructured":"Ammon Shurtz, Lawry Sorenson, and Stephen D Richardson. 2024. The Effects of Pretraining in Video-Guided Machine Translation. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation. 15888-15898."},{"key":"e_1_3_2_1_35_1","volume-title":"Attention is all you need. Advances in neural information processing systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems (2017)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00468"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02684770"},{"key":"e_1_3_2_1_38_1","first-page":"6153","article-title":"Good for Misconceived Reasons: An Empirical Revisiting on the Need for Visual Context in Multimodal Machine Translation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)","author":"Wu Z","year":"2021","unstructured":"Z Wu, L Kong, W Bi, X Li, and CM Kao. 2021. Good for Misconceived Reasons: An Empirical Revisiting on the Need for Visual Context in Multimodal Machine Translation. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers). Association for Computational Linguistics., 6153-6166.","journal-title":"Association for Computational Linguistics."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-022-03331-8"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the International Conference on Machine Learning","volume":"17","author":"Yu Rose","year":"2017","unstructured":"Rose Yu, Stephan Zheng, and Yan Liu. 2017. Learning chaotic dynamics using tensor recurrent neural networks. In Proceedings of the International Conference on Machine Learning, Vol. 17."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755633","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T05:00:32Z","timestamp":1765342832000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755633"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":40,"alternative-id":["10.1145\/3746027.3755633","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755633","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}