{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:25:33Z","timestamp":1765308333057,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62172208"],"award-info":[{"award-number":["No. 62172208"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755080","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:50:47Z","timestamp":1761371447000},"page":"3428-3437","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Sentence-level Segmentation for Long Sign Language Videos with Captions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6390-4398","authenticated-orcid":false,"given":"Bowen","family":"Guo","sequence":"first","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3360-4321","authenticated-orcid":false,"given":"Shiwei","family":"Gan","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, NanJing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9497-6244","authenticated-orcid":false,"given":"Yafeng","family":"Yin","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Suzhou, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6943-9861","authenticated-orcid":false,"given":"Xiao","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5243-4992","authenticated-orcid":false,"given":"Zhiwei","family":"Jiang","sequence":"additional","affiliation":[{"name":"State Key Laboratory for Novel Software Technology, Nanjing University, Suzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6173-9787","authenticated-orcid":false,"given":"Shunmei","family":"Meng","sequence":"additional","affiliation":[{"name":"Nanjing University of Science and Technology, Nanjing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01599"},{"key":"e_1_3_2_1_2_1","first-page":"35","volume-title":"UK","author":"Albanie Samuel","year":"2020","unstructured":"Samuel Albanie, G\u00fcl Varol, Liliane Momeni, Triantafyllos Afouras, Joon Son Chung, Neil Fox, and Andrew Zisserman. 2020. BSL-1K: Scaling up co-articulated sign language recognition using mouthing cues. In Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23-28, 2020, Proceedings, Part XI 16. 
Springer, 35-53."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_4"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2018.06.004"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01135"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-66096-3_14"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00812"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00276"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00369"},{"key":"e_1_3_2_1_10_1","unstructured":"Xiao Fu Wei Xi Jie Yang Yutao Bai Zhao Yang Rui Jiang LI XIZHE Jiankang Gao and Jizhong Zhao. [n.d.]. Balanced Multimodal Learning: An Integrated Framework for Multi-Task Learning in Audio-Visual Fusion. ([n.d.])."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01279"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475577"},{"key":"e_1_3_2_1_13_1","unstructured":"Daiheng Gao Shilin Lu Shaw Walters Wenbo Zhou Jiaming Chu Jie Zhang Bang Zhang Mengxi Jia Jian Zhao Zhaoxin Fan et al. 2024. EraseAnything: Enabling Concept Erasure in Rectified Flow Transformers. 
arXiv preprint arXiv:2412.20413 (2024)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24797-2"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSMC.1997.625742"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2016.7532885"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00249"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01404"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00237"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01272"},{"key":"e_1_3_2_1_21_1","volume-title":"Visual Alignment Pre-training for Sign Language Translation. In European Conference on Computer Vision. Springer, 349-367","author":"Jiao Peiqi","year":"2025","unstructured":"Peiqi Jiao, Yuecong Min, and Xilin Chen. 2025. Visual Alignment Pre-training for Sign Language Translation. In European Conference on Computer Vision. Springer, 349-367."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.364"},{"key":"e_1_3_2_1_23_1","volume-title":"BERT: a review of applications in natural language processing and understanding. arXiv preprint arXiv:2103.11943","author":"Koroteev Mikhail V","year":"2021","unstructured":"Mikhail V Koroteev. 2021. BERT: a review of applications in natural language processing and understanding. arXiv preprint arXiv:2103.11943 (2021)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2017.06.004"},{"key":"e_1_3_2_1_25_1","volume-title":"A hybrid RNN-HMM approach for weakly supervised temporal action segmentation","author":"Kuehne Hilde","year":"2018","unstructured":"Hilde Kuehne, Alexander Richard, and Juergen Gall. 2018. A hybrid RNN-HMM approach for weakly supervised temporal action segmentation. IEEE transactions on pattern analysis and machine intelligence, Vol. 
42, 4 (2018), 765-779."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.113"},{"key":"e_1_3_2_1_27_1","first-page":"47","volume-title":"proceedings, part III 14","author":"Lea Colin","year":"2016","unstructured":"Colin Lea, Rene Vidal, Austin Reiter, and Gregory D Hager. 2016. Temporal convolutional networks: A unified approach to action segmentation. In Computer vision-ECCV 2016 workshops: Amsterdam, the Netherlands, October 8-10 and 15-16, 2016, proceedings, part III 14. Springer, 47-54."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00634"},{"key":"e_1_3_2_1_29_1","volume-title":"Set you straight: Auto-steering denoising trajectories to sidestep unwanted concepts. arXiv preprint arXiv:2504.12782","author":"Li Leyang","year":"2025","unstructured":"Leyang Li, Shilin Lu, Yan Ren, and Adams Wai-Kin Kong. 2025. Set you straight: Auto-steering denoising trajectories to sidestep unwanted concepts. arXiv preprint arXiv:2504.12782 (2025)."},{"key":"e_1_3_2_1_30_1","volume-title":"Yun Liu, Ming-Ming Cheng, and Juergen Gall.","author":"Li Shijie","year":"2020","unstructured":"Shijie Li, Yazan Abu Farha, Yun Liu, Ming-Ming Cheng, and Juergen Gall. 2020. Ms-tcn: Multi-stage temporal convolutional network for action segmentation. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 6 (2020), 6647-6658."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00930"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01927"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00218"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00615"},{"key":"e_1_3_2_1_35_1","volume-title":"Robust watermarking using generative priors against image editing: From benchmarking to advances. 
arXiv preprint arXiv:2410.18775","author":"Lu Shilin","year":"2024","unstructured":"Shilin Lu, Zihan Zhou, Jiayou Lu, Yuanzhi Zhu, and Adams Wai-Kin Kong. 2024b. Robust watermarking using generative priors against image editing: From benchmarking to advances. arXiv preprint arXiv:2410.18775 (2024)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00798"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01721"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01134"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of the Asian Conference on Computer Vision.","author":"Momeni Liliane","year":"2020","unstructured":"Liliane Momeni, Gul Varol, Samuel Albanie, Triantafyllos Afouras, and Andrew Zisserman. 2020. Watch, read and lookup: learning to spot signs from multiple supervisors. In Proceedings of the Asian Conference on Computer Vision."},{"key":"e_1_3_2_1_40_1","volume-title":"Linguistically motivated sign language segmentation. arXiv preprint arXiv:2310.13960","author":"Moryossef Amit","year":"2023","unstructured":"Amit Moryossef, Zifan Jiang, Mathias M\u00fcller, Sarah Ebling, and Yoav Goldberg. 2023. Linguistically motivated sign language segmentation. arXiv preprint arXiv:2310.13960 (2023)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2993650"},{"key":"e_1_3_2_1_42_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Shen Xin","year":"2024","unstructured":"Xin Shen, Shaozu Yuan, Hongwei Sheng, Heming Du, and Xin Yu. 2024. Auslan-daily: Australian sign language translation for daily communication and news. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01722"},{"key":"e_1_3_2_1_44_1","unstructured":"Yuhao Su and Ehsan Elhamifar. [n.d.]. 
Two-Stage Active Learning for Efficient Temporal Action Segmentation. ([n.d.])."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTARS.2015.2424457"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01658"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1088\/1757-899X\/569\/3\/032035"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02158"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01728"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01429"},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). 425-434","author":"Yasser Hamidullah","year":"2024","unstructured":"Hamidullah Yasser, Josef Genabith, and Cristina Espa\u00f1a-Bonet. 2024. Sign Language Translation with Sentence Embedding Supervision. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers). 425-434."},{"key":"e_1_3_2_1_52_1","volume-title":"Cross-modality data augmentation for end-to-end sign language translation. arXiv preprint arXiv:2305.11096","author":"Ye Jinhui","year":"2023","unstructured":"Jinhui Ye, Wenxiang Jiao, Xing Wang, Zhaopeng Tu, and Hui Xiong. 2023. Cross-modality data augmentation for end-to-end sign language translation. arXiv preprint arXiv:2305.11096 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"Improving Gloss-free Sign Language Translation by Reducing Representation Density. arXiv preprint arXiv:2405.14312","author":"Ye Jinhui","year":"2024","unstructured":"Jinhui Ye, Xing Wang, Wenxiang Jiao, Junwei Liang, and Hui Xiong. 2024. Improving Gloss-free Sign Language Translation by Reducing Representation Density. 
arXiv preprint arXiv:2405.14312 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Asformer: Transformer for action segmentation. arXiv preprint arXiv:2110.08568","author":"Yi Fangqiu","year":"2021","unstructured":"Fangqiu Yi, Hongyu Wen, and Tingting Jiang. 2021. Asformer: Transformer for action segmentation. arXiv preprint arXiv:2110.08568 (2021)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01925"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2632856.2632931"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01908"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72967-6_3"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755080","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:41Z","timestamp":1765308041000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755080"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":58,"alternative-id":["10.1145\/3746027.3755080","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755080","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}