{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T16:59:47Z","timestamp":1780765187992,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681537","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:27Z","timestamp":1729925967000},"page":"5507-5516","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Probabilistic Vision-Language Representation for Weakly Supervised Temporal Action Localization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2204-6109","authenticated-orcid":false,"given":"Geuntaek","family":"Lim","sequence":"first","affiliation":[{"name":"Sejong University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5408-8365","authenticated-orcid":false,"given":"Hyunwoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Sejong University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6470-0773","authenticated-orcid":false,"given":"Joonsoo","family":"Kim","sequence":"additional","affiliation":[{"name":"Electronics and Telecommunications Research Institute, Daejeon, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9970-0132","authenticated-orcid":false,"given":"Yukyung","family":"Choi","sequence":"additional","affiliation":[{"name":"Sejong University, Seoul, Republic of Korea"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00124"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19830-4_29"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00831"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW53098.2021.00444"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_6"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3287208"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01355"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475298"},{"key":"e_1_3_2_1_11_1","volume-title":"Learning Proposal-aware Re-ranking for Weakly-supervised Temporal Action Localization","author":"Hu Yufan","year":"2023","unstructured":"Yufan Hu, Jie Fu, Mengyuan Chen, Junyu Gao, Jianfeng Dong, Bin Fan, and Hongmin Liu. 2023. Learning Proposal-aware Re-ranking for Weakly-supervised Temporal Action Localization. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00327"},{"key":"e_1_3_2_1_13_1","volume-title":"The thumos challenge on action recognition for videos 'in the wild'. Computer Vision and Image Understanding","author":"Idrees Haroon","year":"2017","unstructured":"Haroon Idrees, Amir R Zamir, Yu-Gang Jiang, Alex Gorban, Ivan Laptev, Rahul Sukthankar, and Mubarak Shah. 2017. The thumos challenge on action recognition for videos 'in the wild'. Computer Vision and Image Understanding (2017)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16256"},{"key":"e_1_3_2_1_15_1","volume-title":"VVS: Video-to-Video Retrieval with Irrelevant Frame Suppression.","author":"Jo Won","year":"2023","unstructured":"Won Jo, Geuntaek Lim, Gwangjin Lee, Hyunwoo Kim, Byungsoo Ko, and Yukyung Choi. 2023. VVS: Video-to-Video Retrieval with Irrelevant Frame Suppression. (2023)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01417"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3183789"},{"key":"e_1_3_2_1_19_1","volume-title":"Variational dropout and the local reparameterization trick. Advances in Neural Information Processing Systems","author":"Kingma Durk P","year":"2015","unstructured":"Durk P Kingma, Tim Salimans, and Max Welling. 2015. Variational dropout and the local reparameterization trick. Advances in Neural Information Processing Systems (2015)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00654"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6793"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16280"},{"key":"e_1_3_2_1_23_1","volume-title":"Weakly Supervised Temporal Action Localization With Bidirectional Semantic Consistency Constraint","author":"Li Guozhang","year":"2023","unstructured":"Guozhang Li, De Cheng, Xinpeng Ding, NannanWang, Jie Li, and Xinbo Gao. 2023. Weakly Supervised Temporal Action Localization With Bidirectional Semantic Consistency Constraint. IEEE Transactions on Neural Networks and Learning Systems (2023)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01026"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01929"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00399"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2019.2940407"},{"key":"e_1_3_2_1_28_1","volume-title":"Modal Consensus and Contextual Separation for Weakly Supervised Temporal Action Localization. In IEEE International Conference on Acoustics, Speech and Signal Processing.","author":"Liu Peng","year":"2024","unstructured":"Peng Liu, Chuanxu Wang, and Min Zhao. 2024. Modal Consensus and Contextual Separation for Weakly Supervised Temporal Action Localization. In IEEE International Conference on Acoustics, Speech and Signal Processing."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00957"},{"key":"e_1_3_2_1_30_1","volume-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing","author":"Luo Huaishao","year":"2022","unstructured":"Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2022. Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neurocomputing (2022)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_32_1","volume-title":"Proceedings of the European Conference on Computer Vision.","author":"Min Kyle","year":"2020","unstructured":"Kyle Min and Jason J Corso. 2020. Adversarial background-aware loss for weaklysupervised temporal activity localization. In Proceedings of the European Conference on Computer Vision."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00706"},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning.","author":"Novack Zachary","year":"2023","unstructured":"Zachary Novack, Julian McAuley, Zachary Chase Lipton, and Saurabh Garg. 2023. Chils: Zero-shot image classification with hierarchical label sets. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00593"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01430"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_35"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_39_1","volume-title":"International Conference on Machine Learning.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01755"},{"key":"e_1_3_2_1_41_1","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition.","author":"Ren Huan","year":"2023","unstructured":"Huan Ren, Wenfei Yang, Tianzhu Zhang, and Yongdong Zhang. 2023. Proposal-Based Multiple Instance Learning forWeakly-Supervised Temporal Action Localization. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02202"},{"key":"e_1_3_2_1_43_1","volume-title":"Burcu Karagol Ayan, Tim Salimans, et al.","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems (2022)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2967627"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_10"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00609"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00151"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00182"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00936"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01810"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3284853"},{"key":"e_1_3_2_1_52_1","volume-title":"International Conference on Learning Representations","author":"Xue Hongwei","year":"2023","unstructured":"Hongwei Xue, Yuchong Sun, Bei Liu, Jianlong Fu, Ruihua Song, Houqiang Li, and Jiebo Luo. 2023. Clip-vip: Adapting pre-trained image-text model to videolanguage representation alignment. International Conference on Learning Representations (2023)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28516"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00719"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01575"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_29"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3205594"},{"key":"e_1_3_2_1_58_1","volume-title":"Cross-Video Contextual Knowledge Exploration and Exploitation for Ambiguity Reduction in Weakly Supervised Temporal Action Localization","author":"Zhang Songchun","year":"2023","unstructured":"Songchun Zhang and Chunhui Zhao. 2023. Cross-Video Contextual Knowledge Exploration and Exploitation for Ambiguity Reduction in Weakly Supervised Temporal Action Localization. IEEE Transactions on Circuits and Systems for Video Technology (2023)."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58598-3_32"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3374870"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02203"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681537","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681537","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:57:48Z","timestamp":1750294668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681537"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":61,"alternative-id":["10.1145\/3664647.3681537","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681537","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}