{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,6]],"date-time":"2025-11-06T11:46:32Z","timestamp":1762429592844,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Natural Science Foundation of China","award":["U20A20183"],"award-info":[{"award-number":["U20A20183"]}]},{"name":"Supercomputing Center of the USTC"},{"name":"MCC Lab of Information Science and Technology Institution"},{"name":"National Natural Science Foundation of China","award":["62102128"],"award-info":[{"award-number":["62102128"]}]},{"name":"Youth Innovation Promotion Association CAS"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681237","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"5141-5150","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["SEDS: Semantically Enhanced Dual-Stream Encoder for Sign Language Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-7677-3346","authenticated-orcid":false,"given":"Longtao","family":"Jiang","sequence":"first","affiliation":[{"name":"MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3048-6980","authenticated-orcid":false,"given":"Min","family":"Wang","sequence":"additional","affiliation":[{"name":"Institute of Artificial Intelligence, Hefei Comprehensive National Science Center, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3537-4008","authenticated-orcid":false,"given":"Zecheng","family":"Li","sequence":"additional","affiliation":[{"name":"MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5316-5353","authenticated-orcid":false,"given":"Yao","family":"Fang","sequence":"additional","affiliation":[{"name":"Merchants Union Consumer Finance Company Limited, Shenzhen, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2188-3028","authenticated-orcid":false,"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_3"},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Learning Representations (ICLR). 2322--2331","author":"Bahdanau Dzmitry","year":"2015","unstructured":"Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. 2015. Neural machine translation by jointly learning to align and translate. In International Conference on Learning Representations (ICLR). 2322--2331."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00812"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-66823-5_18"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01004"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00506"},{"key":"e_1_3_2_1_9_1","unstructured":"Yutong Chen Ronglai Zuo Fangyun Wei Yu Wu Shujie Liu and Brian Mak. 2022. Two-stream network for sign language recognition and translation. Advances in Neural Information Processing Systems (NeurIPS) 17043--17056."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01823"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.332"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.175"},{"key":"e_1_3_2_1_13_1","article-title":"A deep neural framework for continuous sign language recognition by iterative training","author":"Cui Runpeng","year":"2019","unstructured":"Runpeng Cui, Hu Liu, and Changshui Zhang. 2019. A deep neural framework for continuous sign language recognition by iterative training. IEEE Transactions on Multimedia (TMM), 1880--1891.","journal-title":"IEEE Transactions on Multimedia (TMM), 1880--1891."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01370"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00276"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475577"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00495"},{"key":"e_1_3_2_1_20_1","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 16857--16866","author":"Gul Varol","year":"2021","unstructured":"Varol Gul, Momeni Liliane, Albanie Samuel, Afouras Triantafyllos, and Zisserman Andrew. 2021. Read and attend: Temporal localisation in sign language videos. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 16857--16866."},{"key":"e_1_3_2_1_21_1","volume-title":"Distilling Cross-Temporal Contexts for Continuous Sign Language Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 10771--10780","author":"Guo Leming","year":"2023","unstructured":"Leming Guo, Wanli Xue, Qing Guo, Bo Liu, Kaihua Zhang, Tiantian Yuan, and Shengyong Chen. 2023. Distilling Cross-Temporal Contexts for Continuous Sign Language Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 10771--10780."},{"key":"e_1_3_2_1_22_1","volume-title":"Momentum Contrast for Unsupervised Visual Representation Learning. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 9726--9735","author":"He Kaiming","year":"2020","unstructured":"Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross Girshick. 2020. Momentum Contrast for Unsupervised Visual Representation Learning. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 9726--9735."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00975"},{"key":"e_1_3_2_1_24_1","volume-title":"SignBERT: Pre-Training of Hand-Model-Aware Representation for Sign Language Recognition. In International Conference on Computer Vision (ICCV). 11067--11076","author":"Hu Hezhen","year":"2021","unstructured":"Hezhen Hu, Weichao Zhao, Wengang Zhou, Yuechen Wang, and Houqiang Li. 2021. SignBERT: Pre-Training of Hand-Model-Aware Representation for Sign Language Recognition. In International Conference on Computer Vision (ICCV). 11067--11076."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"Hezhen Hu Wengang Zhou Junfu Pu and Houqiang Li. 2021. Global-local enhancement network for NMF-aware sign language recognition. ACM Transactions on Multimedia Computing Communications and Applications (TOMM) 1--19.","DOI":"10.1145\/3436754"},{"key":"e_1_3_2_1_26_1","volume-title":"Continuous Sign Language Recognition with Correlation Network. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2529--2539","author":"Hu Lianyu","year":"2023","unstructured":"Lianyu Hu, Liqing Gao, Zekang Liu, and Wei Feng. 2023. Continuous Sign Language Recognition with Correlation Network. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 2529--2539."},{"key":"e_1_3_2_1_27_1","volume-title":"AdaBrowse: Adaptive Video Browser for Efficient Continuous Sign Language Recognition. In ACM International Conference on Multimedia (ACM MM). 709--718","author":"Hu Lianyu","year":"2023","unstructured":"Lianyu Hu, Liqing Gao, Zekang Liu, Chi-Man Pun, and Wei Feng. 2023. AdaBrowse: Adaptive Video Browser for Efficient Continuous Sign Language Recognition. In ACM International Conference on Multimedia (ACM MM). 709--718."},{"key":"e_1_3_2_1_28_1","unstructured":"Tao Jiang Peng Lu Li Zhang Ningsheng Ma Rui Han Chengqi Lyu Yining Li and Kai Chen. 2023. RTMPose: Real-Time Multi-Person Pose Estimation based on MMPose. arXiv preprint arXiv:2303.07399."},{"key":"e_1_3_2_1_29_1","volume-title":"CoSign: Exploring Co-occurrence Signals in Skeleton-based Continuous Sign Language Recognition. In International Conference on Computer Vision (ICCV). 20619--20629","author":"Jiao Peiqi","year":"2023","unstructured":"Peiqi Jiao, Yuecong Min, Yanan Li, Xiaotao Wang, Lei Lei, and Xilin Chen. 2023. CoSign: Exploring Co-occurrence Signals in Skeleton-based Continuous Sign Language Recognition. In International Conference on Computer Vision (ICCV). 20619--20629."},{"key":"e_1_3_2_1_30_1","first-page":"30291","article-title":"Expectation-maximization contrastive learning for compact video-and-language representations","volume":"35","author":"Jin Peng","year":"2022","unstructured":"Peng Jin, Jinfa Huang, Fenglin Liu, Xian Wu, Shen Ge, Guoli Song, David Clifton, and Jie Chen. 2022. Expectation-maximization contrastive learning for compact video-and-language representations. Advances in Neural Information Processing Systems (NeurIPS) 35, 30291--30306.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_31_1","volume-title":"Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment. In International Joint Conference on Artificial Intelligence (IJCAI). 938--946","author":"Jin Peng","year":"2023","unstructured":"Peng Jin, Hao Li, Zesen Cheng, Jinfa Huang, Zhennan Wang, Li Yuan, Chang Liu, and Jie Chen. 2023. Text-Video Retrieval with Disentangled Conceptualization and Set-to-Set Alignment. In International Joint Conference on Artificial Intelligence (IJCAI). 938--946."},{"key":"e_1_3_2_1_32_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_33_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classifi- cation with deep convolutional neural networks. Advances in Neural Information Processing Systems (NeurIPS) 3235--3254."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00033"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01896"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018585"},{"key":"e_1_3_2_1_37_1","volume-title":"Tspnet: Hierarchical feature learning via temporal semantic pyramid for sign language translation. Advances in Neural Information Processing Systems (NeurIPS), 12034--12045.","author":"Li Dongxu","year":"2020","unstructured":"Dongxu Li, Chenchen Xu, Xin Yu, Kaihao Zhang, Benjamin Swift, Hanna Suominen, and Hongdong Li. 2020. Tspnet: Hierarchical feature learning via temporal semantic pyramid for sign language translation. Advances in Neural Information Processing Systems (NeurIPS), 12034--12045."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00379"},{"key":"e_1_3_2_1_39_1","unstructured":"Chengzhi Lin Ancong Wu Junwei Liang Jun Zhang Wenhang Ge Wei-Shi Zheng and Chunhua Shen. 2022. Text-adaptive multiple visual prototype matching for video-text retrieval. Advances in Neural Information Processing Systems (NeurIPS) 38655--38666."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01170"},{"key":"e_1_3_2_1_41_1","volume-title":"British Machine Vision Conference (BMVC). 2035--2045","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use What You Have: Video retrieval using representations from collaborative experts. In British Machine Vision Conference (BMVC). 2035--2045."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19781-9_19"},{"key":"e_1_3_2_1_43_1","volume-title":"Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983","author":"Loshchilov Ilya","year":"2016","unstructured":"Ilya Loshchilov and Frank Hutter. 2016. Sgdr: Stochastic gradient descent with warm restarts. arXiv preprint arXiv:1608.03983 (2016)."},{"key":"e_1_3_2_1_44_1","volume-title":"Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neural Computing and Applications (Neurocomputing)","author":"Luo Huaishao","year":"2022","unstructured":"Huaishao Luo, Lei Ji, Ming Zhong, Yang Chen, Wen Lei, Nan Duan, and Tianrui Li. 2022. Clip4clip: An empirical study of clip for end to end video clip retrieval and captioning. Neural Computing and Applications (Neurocomputing) (2022), 293--304."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D15-1166"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503161.3547910"},{"key":"e_1_3_2_1_47_1","volume-title":"Asian Conference on Computer Vision (ACCV). 291--308","author":"Momeni Liliane","year":"2020","unstructured":"Liliane Momeni, Gul Varol, Samuel Albanie, Triantafyllos Afouras, and Andrew Zisserman. 2020. Watch, Read and Lookup: Learning to Spot Signs from Multiple Supervisors. In Asian Conference on Computer Vision (ACCV). 291--308."},{"volume-title":"Multimodal Sign Language Recognition via Temporal Deformable Convolutional Sequence Learning","author":"Papadimitriou Katerina","key":"e_1_3_2_1_48_1","unstructured":"Katerina Papadimitriou and Gerasimos Potamianos. 2020. Multimodal Sign Language Recognition via Temporal Deformable Convolutional Sequence Learning..In International Speech Communication Association (Interspeech). 2752--2756."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413931"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00429"},{"key":"e_1_3_2_1_51_1","volume-title":"International Conference on Machine Learning (ICML). 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML). 8748--8763."},{"key":"e_1_3_2_1_52_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning (ICML). 8339--8349","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In International Conference on Machine Learning (ICML). 8339--8349."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00584"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01173"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00264"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00054"},{"key":"e_1_3_2_1_57_1","unstructured":"Lumin Xu Yingda Guan Sheng Jin Wentao Liu Chen Qian Ping Luo Wanli Ouyang and Xiaogang Wang. 2021. ViPNAS: Efficient Video Pose Estimation via Neural Architecture Search. (2021) 3233--3243."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01429"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00505"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475544"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00251"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46478-7_27"},{"key":"e_1_3_2_1_63_1","article-title":"Conditional sentence generation and cross-modal reranking for sign language translation","author":"Zhao Jian","year":"2021","unstructured":"Jian Zhao, Weizhen Qi, Wengang Zhou, Nan Duan, Ming Zhou, and Houqiang Li. 2021. Conditional sentence generation and cross-modal reranking for sign language translation. IEEE Transactions on Multimedia (TMM), 2662--2672.","journal-title":"IEEE Transactions on Multimedia (TMM), 2662--2672."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01908"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00137"},{"key":"e_1_3_2_1_67_1","article-title":"Spatial-temporal multi-cue network for sign language recognition and translation","author":"Zhou Hao","year":"2021","unstructured":"Hao Zhou, Wengang Zhou, Yun Zhou, and Houqiang Li. 2021. Spatial-temporal multi-cue network for sign language recognition and translation. IEEE Transactions on Multimedia (TMM), 768--779.","journal-title":"IEEE Transactions on Multimedia (TMM), 768--779."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00507"},{"key":"e_1_3_2_1_69_1","volume-title":"Natural Language-Assisted Sign Language Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 14890--14900","author":"Zuo Ronglai","year":"2023","unstructured":"Ronglai Zuo, Fangyun Wei, and Brian Mak. 2023. Natural Language-Assisted Sign Language Recognition. In IEEE Conference on Computer Vision and Pattern Recognition (CVPR). 14890--14900."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Melbourne VIC Australia","acronym":"MM '24"},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681237","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681237","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:03Z","timestamp":1750295883000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681237"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":69,"alternative-id":["10.1145\/3664647.3681237","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681237","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}