{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,6]],"date-time":"2026-04-06T14:40:41Z","timestamp":1775486441956,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,8,24]],"date-time":"2024-08-24T00:00:00Z","timestamp":1724457600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Scientific Research Laboratory of AI Technology and Applications, University of International Business and Economics"},{"name":"the Fundamental Research Funds for the Central Universities in UIBE","award":["22QN01"],"award-info":[{"award-number":["22QN01"]}]},{"name":"the Open Projects Program of State Key Laboratory of Multimodal Artificial Intelligence Systems"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,8,25]]},"DOI":"10.1145\/3637528.3671693","type":"proceedings-article","created":{"date-parts":[[2024,8,25]],"date-time":"2024-08-25T04:54:55Z","timestamp":1724561695000},"page":"3024-3035","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Routing Evidence for Unseen Actions in Video Moment Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4874-2639","authenticated-orcid":false,"given":"Guolong","family":"Wang","sequence":"first","affiliation":[{"name":"School of Information Technology &amp; Management, University of International Business and Economics, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1243-0641","authenticated-orcid":false,"given":"Xun","family":"Wu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7090-7869","authenticated-orcid":false,"given":"Zheng","family":"Qin","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7033-4207","authenticated-orcid":false,"given":"Liangliang","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence &amp; Department of Computer Science and Engineering &amp; MoE Lab of AI, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2024,8,24]]},"reference":[{"key":"e_1_3_2_2_1_1","first-page":"14927","article-title":"Deep evidential regression","volume":"33","author":"Amini Alexander","year":"2020","unstructured":"Alexander Amini, Wilko Schwarting, Ava Soleimany, and Daniela Rus. 2020. Deep evidential regression. Advances in Neural Information Processing Systems, Vol. 33 (2020), 14927--14937.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.618"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01310"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00299"},{"key":"e_1_3_2_2_5_1","volume-title":"Action Recognition? A New Model and the Kinetics Dataset. arXiv preprint arXiv:1705.07750","author":"Carreira Joao","year":"2018","unstructured":"Joao Carreira and Andrew Zisserman. 2018. Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset. arXiv preprint arXiv:1705.07750 (2018)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01591"},{"key":"e_1_3_2_2_7_1","first-page":"34600","article-title":"On the representation collapse of sparse mixture of experts","volume":"35","author":"Chi Zewen","year":"2022","unstructured":"Zewen Chi, Li Dong, Shaohan Huang, Damai Dai, Shuming Ma, Barun Patra, Saksham Singhal, Payal Bajaj, Xia Song, Xian-Ling Mao, et al. 2022. On the representation collapse of sparse mixture of experts. Advances in Neural Information Processing Systems, Vol. 35 (2022), 34600--34613.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_8_1","volume-title":"Proceedings of the International Conference on Machine Learning. 4057--4086","author":"Clark Aidan","year":"2022","unstructured":"Aidan Clark, Diego de Las Casas, Aurelia Guy, Arthur Mensch, Michela Paganini, Jordan Hoffmann, Bogdan Damoc, Blake Hechtman, Trevor Cai, Sebastian Borgeaud, et al. 2022. Unified scaling laws for routed language models. In Proceedings of the International Conference on Machine Learning. 4057--4086."},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3481539"},{"key":"e_1_3_2_2_10_1","volume-title":"Proceedings of the International Conference on Machine Learning. 5547--5569","author":"Du Nan","year":"2022","unstructured":"Nan Du, Yanping Huang, Andrew M Dai, Simon Tong, Dmitry Lepikhin, Yuanzhong Xu, Maxim Krikun, Yanqi Zhou, Adams Wei Yu, Orhan Firat, et al. 2022. Glam: Efficient scaling of language models with mixture-of-experts. In Proceedings of the International Conference on Machine Learning. 5547--5569."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Jiachang Hao Haifeng Sun Pengfei Ren Jingyu Wang Qi Qi and Jianxin Liao. 2022. Can shuffling video benefit temporal bias problem: A novel training framework for temporal grounding. In Visualizing and Understanding Convolutional Networks. 130--147.","DOI":"10.1007\/978-3-031-20059-5_8"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1061\/40671(2003)117"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01273"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01590"},{"key":"e_1_3_2_2_16_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_18_1","first-page":"1","article-title":"A closer look at debiased temporal sentence grounding in videos: Dataset, metric, and approach","volume":"19","author":"Lan Xiaohan","year":"2023","unstructured":"Xiaohan Lan, Yitian Yuan, Xin Wang, Long Chen, Zhi Wang, Lin Ma, and Wenwu Zhu. 2023. A closer look at debiased temporal sentence grounding in videos: Dataset, metric, and approach. ACM Transactions on Multimedia Computing, Communications and Applications, Vol. 19, 6 (2023), 1--23.","journal-title":"ACM Transactions on Multimedia Computing, Communications and Applications"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00304"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403072"},{"key":"e_1_3_2_2_21_1","volume-title":"Moe-llava: Mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947","author":"Lin Bin","year":"2024","unstructured":"Bin Lin, Zhenyu Tang, Yang Ye, Jiaxi Cui, Bin Zhu, Peng Jin, Junwu Zhang, Munan Ning, and Li Yuan. 2024. Moe-llava: Mixture of experts for large vision-language models. arXiv preprint arXiv:2401.15947 (2024)."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01108"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i2.20060"},{"key":"e_1_3_2_2_25_1","volume-title":"Xin Eric Wang, and William Yang Wang","author":"Lu Yujie","year":"2022","unstructured":"Yujie Lu, Huiliang Zhang, Ping Nie, Weixi Feng, Wenda Xu, Xin Eric Wang, and William Yang Wang. 2022. Anticipating the Unseen Discrepancy for Vision and Language Navigation. arXiv preprint arXiv:2209.04725 (2022)."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02207"},{"key":"e_1_3_2_2_27_1","volume-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 12435--12445","author":"Melas-Kyriazi Luke","unstructured":"Luke Melas-Kyriazi and Arjun K. Manrai. 2021. PixMatch: Unsupervised Domain Adaptation via Pixelwise Consistency Training. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 12435--12445."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01454-y"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01186"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01082"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00150"},{"key":"e_1_3_2_2_32_1","volume-title":"Niluthpol Chowdhury Mithun, and Amit K Roy-Chowdhury","author":"Paul Sudipta","year":"2022","unstructured":"Sudipta Paul, Niluthpol Chowdhury Mithun, and Amit K Roy-Chowdhury. 2022. Text-based temporal localization of novel events. In Visualizing and Understanding Convolutional Networks. 567--587."},{"key":"e_1_3_2_2_33_1","volume-title":"Subhabrata Mukherjee, David P Woodruff, Barnabas Poczos, and Hany Hassan Awadalla.","author":"Pham Hai","year":"2023","unstructured":"Hai Pham, Young Jin Kim, Subhabrata Mukherjee, David P Woodruff, Barnabas Poczos, and Hany Hassan Awadalla. 2023. Task-Based MoE for Multitask Multilingual Machine Translation. arXiv preprint arXiv:2308.15772 (2023)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00207"},{"key":"e_1_3_2_2_35_1","volume-title":"Daniel Keysers, and Neil Houlsby.","author":"Riquelme Carlos","year":"2021","unstructured":"Carlos Riquelme, Joan Puigcerver, Basil Mustafa, Maxim Neumann, Rodolphe Jenatton, Andr\u00e9 Susano Pinto, Daniel Keysers, and Neil Houlsby. 2021. Scaling vision with sparse mixture of experts. Advances in Neural Information Processing Systems, Vol. 34 (2021)."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-long.518"},{"key":"e_1_3_2_2_37_1","volume-title":"Proceedings of the International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=B1ckMDqlg","author":"Shazeer Noam","year":"2017","unstructured":"Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean. 2017. Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. In Proceedings of the International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=B1ckMDqlg"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3120545"},{"key":"e_1_3_2_2_39_1","volume-title":"Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems","author":"Tong Zhan","year":"2022","unstructured":"Zhan Tong, Yibing Song, Jue Wang, and Limin Wang. 2022. Videomae: Masked autoencoders are data-efficient learners for self-supervised video pre-training. Advances in neural information processing systems, Vol. 35 (2022), 10078--10093."},{"key":"e_1_3_2_2_40_1","volume-title":"Learning Spatiotemporal Features with 3D Convolutional Networks. arXiv preprint arXiv:1412.0767","author":"Tran Du","year":"2015","unstructured":"Du Tran, Lubomir Bourdev, Rob Fergus, Lorenzo Torresani, and Manohar Paluri. 2015. Learning Spatiotemporal Features with 3D Convolutional Networks. arXiv preprint arXiv:1412.0767 (2015)."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2022.103147"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3548004"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20831"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2016.2606428"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475278"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413862"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019062"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3191841"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475723.3484247"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3478025","article-title":"Moment is Important: Language-Based Video Moment Retrieval via Adversarial Learning","volume":"18","author":"Zeng Yawen","year":"2022","unstructured":"Yawen Zeng, Da Cao, Shaofei Lu, Hanling Zhang, Jiao Xu, and Zheng Qin. 2022. Moment is Important: Language-Based Video Moment Retrieval via Adversarial Learning. ACM Transactions on Multimedia Computing, Communications, and Applications, Vol. 18, 2 (2022), 1--21.","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications"},{"key":"e_1_3_2_2_53_1","volume-title":"Joey Tianyi Zhou, and Rick Siow Mong Goh","author":"Zhang Hao","year":"2021","unstructured":"Hao Zhang, Aixin Sun, Wei Jing, Liangli Zhen, Joey Tianyi Zhou, and Rick Siow Mong Goh. 2021. Natural language video localization: A revisit in span-based question answering framework. IEEE transactions on pattern analysis and machine intelligence, Vol. 44, 8 (2021), 4252--4266."},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3120745"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6984"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00215"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02201"},{"key":"e_1_3_2_2_58_1","volume-title":"Proceedings of the International Conference on Learning Representations.","author":"Zhao Xinyu","year":"2024","unstructured":"Xinyu Zhao, Xuxi Chen, Yu Cheng, and Tianlong Chen. 2024. Sparse MoE with Language Guided Routing for Multilingual Machine Translation. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00418"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.794"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01631"}],"event":{"name":"KDD '24: The 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Barcelona Spain","acronym":"KDD '24","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 30th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671693","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3637528.3671693","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:06:00Z","timestamp":1750291560000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3637528.3671693"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,8,24]]},"references-count":62,"alternative-id":["10.1145\/3637528.3671693","10.1145\/3637528"],"URL":"https:\/\/doi.org\/10.1145\/3637528.3671693","relation":{},"subject":[],"published":{"date-parts":[[2024,8,24]]},"assertion":[{"value":"2024-08-24","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}