{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T09:04:32Z","timestamp":1765357472265,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100018735","name":"Ant Group","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100018735","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shandong Provincial Natural Science and Foundation","award":["ZR2020QF106"],"award-info":[{"award-number":["ZR2020QF106"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62176137, 62006140"],"award-info":[{"award-number":["62176137, 62006140"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612120","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T19:52:06Z","timestamp":1698436326000},"page":"4637-4646","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["Temporal Sentence Grounding in Streaming Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3197-5698","authenticated-orcid":false,"given":"Tian","family":"Gan","sequence":"first","affiliation":[{"name":"Shandong University, Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4879-2169","authenticated-orcid":false,"given":"Xiao","family":"Wang","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4959-0929","authenticated-orcid":false,"given":"Yan","family":"Sun","sequence":"additional","affiliation":[{"name":"Shandong University, Qingdao, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0247-5221","authenticated-orcid":false,"given":"Jianlong","family":"Wu","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8638-6594","authenticated-orcid":false,"given":"Qingpei","family":"Guo","sequence":"additional","affiliation":[{"name":"Ant Group, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1476-0273","authenticated-orcid":false,"given":"Liqiang","family":"Nie","sequence":"additional","affiliation":[{"name":"Harbin Institute of Technology, Shenzhen, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"streaming videos-modeling and evaluation,\" in European Conference on Computer Vision","author":"Shou Z.","year":"2018","unstructured":"Z. Shou, J. Pan, J. Chan, K. Miyazawa, H. Mansour, A. Vetro, X. G. Nieto, and S.-F. Chang, ?Online action detection in untrimmed, streaming videos-modeling and evaluation,\" in European Conference on Computer Vision. Springer, 2018, p. 5."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3087038"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3300941"},{"key":"e_1_3_2_1_4_1","first-page":"19","volume-title":"IEEE","author":"Dong X.","year":"2022","unstructured":"X. Dong, T. Gan, X. Song, J. Wu, Y. Cheng, and L. Nie, ?Stacked hybrid-attention and group collaborative learning for unbiased scene graph generation,\" in IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE, 2022, pp. 19 405--19 414."},{"key":"e_1_3_2_1_5_1","first-page":"269","volume-title":"?Online Action Detection,\" in European Conference on Computer Vision","author":"Geest R. D.","year":"2016","unstructured":"R. D. Geest, E. Gavves, A. Ghodrati, Z. Li, C. Snoek, and T. Tuytelaars, ?Online Action Detection,\" in European Conference on Computer Vision. Springer, 2016, pp. 269--284."},{"key":"e_1_3_2_1_6_1","first-page":"1086","author":"Xu M.","year":"2021","unstructured":"M. Xu, Y. Xiong, H. Chen, X. Li, W. Xia, Z. Tu, and S. Soatto, ?Long Short-Term Transformer for Online Action Detection,\" in advances in Neural Information Processing Systems, 2021, pp. 1086--1099.","journal-title":"Long Short-Term Transformer for Online Action Detection,\" in advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3265261"},{"key":"e_1_3_2_1_8_1","volume-title":"?Antpivot: Livestream highlight detection via hierarchical attention mechanism,\" arXiv","author":"Zhao Y.","year":"2022","unstructured":"Y. Zhao, X. Lin, W. Xu, M. Zheng, Z. Liu, and Z. Zhao, ?Antpivot: Livestream highlight detection via hierarchical attention mechanism,\" arXiv, 2022."},{"key":"e_1_3_2_1_9_1","first-page":"5531","volume-title":"IEEE","author":"Xu M.","year":"2019","unstructured":"M. Xu, M. Gao, Y.-T. Chen, L. Davis, and D. J. Crandall, ?Temporal Recurrent Networks for Online Action Detection,\" in International Conference on Computer Vision. IEEE, 2019, pp. 5531--5540."},{"key":"e_1_3_2_1_10_1","first-page":"7545","volume-title":"IEEE","author":"Wang X.","year":"2021","unstructured":"X. Wang, S. Zhang, Z. Qing, Y. Shao, Z. Zuo, C. Gao, and N. Sang, ?OadTR: Online Action Detection with Transformers,\" in International Conference on Computer Vision. IEEE, 2021, pp. 7545--7555."},{"key":"e_1_3_2_1_11_1","volume-title":"?The Elements of Temporal Sentence Grounding in Videos: A Survey and Future Directions,\" arXiv","author":"Zhang H.","year":"2022","unstructured":"H. Zhang, A. Sun, W. Jing, and J. T. Zhou, ?The Elements of Temporal Sentence Grounding in Videos: A Survey and Future Directions,\" arXiv, 2022."},{"key":"e_1_3_2_1_12_1","first-page":"13","volume-title":"ACM","author":"Yuan Y.","year":"2021","unstructured":"Y. Yuan, X. Lan, X. Wang, L. Chen, Z. Wang, and W. Zhu, ?A Closer Look at Temporal Sentence Grounding in Videos: Dataset and Metric,\" in International Workshop on Human-Centric Multimedia Analysis. ACM, 2021, pp. 13--21."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3532626"},{"key":"e_1_3_2_1_14_1","first-page":"5277","volume-title":"IEEE","author":"Gao J.","year":"2017","unstructured":"J. Gao, C. Sun, Z. Yang, and R. Nevatia, ?TALL: Temporal Activity Localization via Language Query,\" in International Conference on Computer Vision. IEEE, 2017, pp. 5277--5285."},{"key":"e_1_3_2_1_15_1","first-page":"5804","volume-title":"IEEE","author":"Hendricks L. A.","year":"2017","unstructured":"L. A. Hendricks, O. Wang, E. Shechtman, J. Sivic, T. Darrell, and B. C. Russell, ?Localizing Moments in Video with Natural Language,\" in International Conference on Computer Vision. IEEE, 2017, pp. 5804--5813."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240549"},{"key":"e_1_3_2_1_17_1","first-page":"9062","volume-title":"?Multilevel Language and Vision Integration for Text-to-Clip Retrieval,\" in AAAI Conference on Artificial Intelligence","author":"Xu H.","year":"2019","unstructured":"H. Xu, K. He, B. A. Plummer, L. Sigal, S. Sclaroff, and K. Saenko, ?Multilevel Language and Vision Integration for Text-to-Clip Retrieval,\" in AAAI Conference on Artificial Intelligence. AAAI Press, 2019, pp. 9062--9069."},{"key":"e_1_3_2_1_18_1","first-page":"162","volume-title":"Conference on Empirical Methods in Natural Language Processing. ACL","author":"Chen J.","year":"2018","unstructured":"J. Chen, X. Chen, L. Ma, Z. Jie, and T.-S. Chua, ?Temporally Grounding Natural Sentence in Video,\" in Conference on Empirical Methods in Natural Language Processing. ACL, 2018, pp. 162--171."},{"key":"e_1_3_2_1_19_1","first-page":"12","volume-title":"?Learning 2D Temporal Adjacent Networks for Moment Localization with Natural Language,\" in AAAI Conference on Artificial Intelligence","author":"Zhang S.","year":"2020","unstructured":"S. Zhang, H. Peng, J. Fu, and J. Luo, ?Learning 2D Temporal Adjacent Networks for Moment Localization with Natural Language,\" in AAAI Conference on Artificial Intelligence. AAAI Press, 2020, pp. 12 870--12 877."},{"key":"e_1_3_2_1_20_1","volume-title":"?Multi-scale 2d Temporal Adjacency Networks for Moment Localization with Natural Language,\" IEEE Transactions on Pattern Analysis and Machine Intelligence","author":"Zhang S.","year":"2021","unstructured":"S. Zhang, H. Peng, J. Fu, Y. Lu, and J. Luo, ?Multi-scale 2d Temporal Adjacency Networks for Moment Localization with Natural Language,\" IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 1, 2021."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-13-7603-0"},{"key":"e_1_3_2_1_22_1","first-page":"9159","volume-title":"?To Find Where You Talk: Temporal Sentence Localization in Video with Attention Based Location Regression,\" in AAAI Conference on Artificial Intelligence","author":"Yuan Y.","year":"2019","unstructured":"Y. Yuan, T. Mei, and W. Zhu, ?To Find Where You Talk: Temporal Sentence Localization in Video with Attention Based Location Regression,\" in AAAI Conference on Artificial Intelligence. AAAI Press, 2019, pp. 9159--9166."},{"key":"e_1_3_2_1_23_1","first-page":"4280","volume-title":"ACM","author":"Qu X.","year":"2020","unstructured":"X. Qu, P. Tang, Z. Zou, Y. Cheng, J. Dong, P. Zhou, and Z. Xu, ?Fine-grained Iterative Attention Network for Temporal Language Localization in Videos,\" in International Conference on Multimedia. ACM, 2020, pp. 4280--4288."},{"key":"e_1_3_2_1_24_1","volume-title":"?Text-Visual Prompting for Efficient 2D Temporal Video Grounding,\" arXiv","author":"Zhang Y.","year":"2023","unstructured":"Y. Zhang, X. Chen, J. Jia, S. Liu, and K. Ding, ?Text-Visual Prompting for Efficient 2D Temporal Video Grounding,\" arXiv, 2023."},{"key":"e_1_3_2_1_25_1","volume-title":"?Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos,\" arXiv","author":"Pan Y.","year":"2023","unstructured":"Y. Pan, X. He, B. Gong, Y. Lv, Y. Shen, Y. Peng, and D. Zhao, ?Scanning Only Once: An End-to-end Framework for Fast Temporal Grounding in Long Videos,\" arXiv, 2023."},{"key":"e_1_3_2_1_26_1","volume-title":"?Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training,\" arXiv","author":"Luo D.","year":"2023","unstructured":"D. Luo, J. Huang, S. Gong, H. Jin, and Y. Liu, ?Towards Generalisable Video Moment Retrieval: Visual-Dynamic Injection to Image-Text Pre-Training,\" arXiv, 2023."},{"key":"e_1_3_2_1_27_1","first-page":"1984","volume-title":"ACL","author":"Ghosh S.","year":"2019","unstructured":"S. Ghosh, A. Agarwal, Z. Parekh, and A. G. Hauptmann, ?ExCL: Extractive Clip Localization Using Natural Language Descriptions,\" in Conference of the North American Chapter of the Association for Computational Linguistics. ACL, 2019, pp. 1984--1990."},{"key":"e_1_3_2_1_28_1","first-page":"8175","volume-title":"?Localizing Natural Language in Videos,\" in AAAI Conference on Artificial Intelligence","author":"Chen J.","year":"2019","unstructured":"J. Chen, L. Ma, X. Chen, Z. Jie, and J. Luo, ?Localizing Natural Language in Videos,\" in AAAI Conference on Artificial Intelligence. AAAI Press, 2019, pp. 8175--8182."},{"key":"e_1_3_2_1_29_1","first-page":"6543","volume-title":"ACL","author":"Zhang H.","year":"2020","unstructured":"H. Zhang, A. Sun, W. Jing, and J. T. Zhou, ?Span-based Localizing Network for Natural Language Video Localization,\" in Annual Meeting of the Association for Computational Linguistics. ACL, 2020, pp. 6543--6554."},{"key":"e_1_3_2_1_30_1","first-page":"4252","volume":"44","author":"Zhang H.","year":"2021","unstructured":"H. Zhang, A. Sun, W. Jing, L. Zhen, J. T. Zhou, and R. S. M. Goh, ?Natural Language Video Localization: A Revisit in Span-based Question Answering Framework,\" IEEE Transactions on Pattern Analysis and Machine Intelligence, vol. 44, pp. 4252--4266, 2021.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"e_1_3_2_1_31_1","first-page":"4197","volume-title":"IEEE","author":"Zhao Y.","year":"2021","unstructured":"Y. Zhao, Z. Zhao, Z. Zhang, and Z. Lin, ?Cascaded Prediction Network via Segment Tree for Temporal Video Grounding,\" in IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 2021, pp. 4197--4206."},{"key":"e_1_3_2_1_32_1","first-page":"8393","volume-title":"?Read, Watch, and Move: Reinforcement Learning for Temporally Grounding Natural Language Descriptions in Videos,\" in AAAI Conference on Artificial Intelligence","author":"He D.","year":"2019","unstructured":"D. He, X. Zhao, J. Huang, F. Li, X. Liu, and S. Wen, ?Read, Watch, and Move: Reinforcement Learning for Temporally Grounding Natural Language Descriptions in Videos,\" in AAAI Conference on Artificial Intelligence. AAAI Press, 2019, pp. 8393--8400."},{"key":"e_1_3_2_1_33_1","first-page":"334","volume-title":"IEEE","author":"Wang W.","year":"2019","unstructured":"W. Wang, Y. Huang, and L. Wang, ?Language-Driven Temporal Activity Localization: A Semantic Matching Reinforcement Learning Model,\" in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. IEEE, 2019, pp. 334--343."},{"key":"e_1_3_2_1_34_1","first-page":"4162","volume-title":"ACM","author":"Cao D.","year":"2020","unstructured":"D. Cao, Y. Zeng, M. Liu, X. He, M. Wang, and Z. Qin, ?STRONG: Spatio-Temporal Reinforcement Learning for Cross-Modal Video Moment Localization,\" in International Conference on Multimedia. ACM, 2020, pp. 4162--4170."},{"key":"e_1_3_2_1_35_1","first-page":"1283","volume-title":"ACM","author":"Wu J.","year":"2020","unstructured":"J. Wu, G. Li, X. Han, and L. Lin, ?Reinforcement Learning for Weakly Supervised Temporal Grounding of Natural Language in Untrimmed Videos,\" in International Conference on Multimedia. ACM, 2020, pp. 1283--1291."},{"key":"e_1_3_2_1_36_1","first-page":"12","volume-title":"?Tree-Structured Policy Based Progressive Reinforcement Learning for Temporally Language Grounding in Video,\" in AAAI Conference on Artificial Intelligence","author":"Wu J.","year":"2020","unstructured":"J. Wu, G. Li, S. Liu, and L. Lin, ?Tree-Structured Policy Based Progressive Reinforcement Learning for Temporally Language Grounding in Video,\" in AAAI Conference on Artificial Intelligence. AAAI Press, 2020, pp. 12 386--12 393."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3086591"},{"key":"e_1_3_2_1_38_1","volume-title":"?Progressive privileged knowledge distillation for online action detection,\" Pattern Recognit","author":"Zhao P.","year":"2022","unstructured":"P. Zhao, L. Xie, J. Wang, Y. Zhang, and Q. Tian, ?Progressive privileged knowledge distillation for online action detection,\" Pattern Recognit., vol. 129, p. 108741, 2022."},{"key":"e_1_3_2_1_39_1","first-page":"806","volume-title":"IEEE","author":"Eun H.","year":"2020","unstructured":"H. Eun, J. Moon, J. Park, C. Jung, and C. Kim, ?Learning to Discriminate Information for Online Action Detection,\" in Conference on Computer Vision and Pattern Recognition. IEEE, 2020, pp. 806--815."},{"key":"e_1_3_2_1_40_1","first-page":"806","volume-title":"IEEE","author":"Chen J.","year":"2022","unstructured":"J. Chen, G. Mittal, Y. Yu, Y. Kong, and M. Chen, ?GateHUB: Gated History Unit with Background Suppression for Online Action Detection,\" in Conference on Computer Vision and Pattern Recognition. IEEE, 2022, pp. 806--815."},{"key":"e_1_3_2_1_41_1","first-page":"13","volume-title":"IEEE","author":"Kang H.","year":"2021","unstructured":"H. Kang, K. Kim, Y. Ko, and S. J. Kim, ?CAG-QIL: Context-Aware Actionness Grouping via Q Imitation Learning for Online Temporal Action Localization,\" in International Conference on Computer Vision. IEEE, 2021, pp. 13 709--13 718."},{"key":"e_1_3_2_1_42_1","first-page":"786","author":"Ryoo M. S.","year":"2021","unstructured":"M. S. Ryoo, A. J. Piergiovanni, A. Arnab, M. Dehghani, and A. Angelova, ?TokenLearner: Adaptive Space-Time Tokenization for Videos,\" in Advances in Neural Information Processing Systems, 2021, pp. 12 786--12 797.","journal-title":"TokenLearner: Adaptive Space-Time Tokenization for Videos,\" in Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","first-page":"1104","volume-title":"ACM","author":"Qu L.","year":"2021","unstructured":"L. Qu, M. Liu, J. Wu, Z. Gao, and L. Nie, ?Dynamic Modality Interaction Modeling for Image-Text Retrieval,\" in SIGIR Conference on Research and Development in Information Retrieval. ACM, 2021, pp. 1104--1113."},{"key":"e_1_3_2_1_44_1","first-page":"12","volume-title":"IEEE","author":"Zhang M.","year":"2021","unstructured":"M. Zhang, Y. Yang, X. Chen, Y. Ji, X. Xu, J. Li, and H. T. Shen, ?Multi-Stage Aggregated Transformer Network for Temporal Language Localization in Videos,\" in IEEE Conference on Computer Vision and Pattern Recognition. IEEE, 2021, pp. 12 669--12 678."},{"key":"e_1_3_2_1_45_1","first-page":"776","volume-title":"ACL","author":"Zhang H.","year":"2021","unstructured":"H. Zhang, A. Sun, W. Jing, L. Zhen, J. T. Zhou, and R. S. M. Goh, ?Parallel Attention Network with Sequence Matching for Video Grounding,\" in Findings of the Association for Computational Linguistics: ACL\/IJCNLP. ACL, 2021, pp. 776--790."},{"key":"e_1_3_2_1_46_1","first-page":"7026","volume-title":"IEEE","author":"Wang H.","year":"2021","unstructured":"H. Wang, Z.-J. Zha, L. Li, D. Liu, and J. Luo, ?Structured Multi-Level Interaction Network for Video Moment Localization via Language Query,\" in Conference on Computer Vision and Pattern Recognition. IEEE, 2021, pp. 7026--7035."},{"key":"e_1_3_2_1_47_1","first-page":"706","volume-title":"IEEE","author":"Krishna R.","year":"2017","unstructured":"R. Krishna, K. Hata, F. Ren, L. Fei-Fei, and J. Carlos Niebles, ?Dense-captioning Events in Videos,\" in International Conference on Computer Vision. IEEE, 2017, pp. 706--715."},{"key":"e_1_3_2_1_48_1","first-page":"25","volume":"1","author":"Michaela R.","year":"2013","unstructured":"R. Michaela, R. Marcus, W. Dominikus, T. Stefan, S. Bernt, and P. Manfred, ?Grounding action descriptions in videos,\" Trans. Assoc. Comput. Linguistics, vol. 1, pp. 25--36, 2013.","journal-title":"Grounding action descriptions in videos,\" Trans. Assoc. Comput. Linguistics"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00207"},{"key":"e_1_3_2_1_50_1","first-page":"5026","volume-title":"?Mad: A Scalable Dataset for Language Grounding in Videos from Movie Audio Descriptions,\" in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Soldan M.","year":"2022","unstructured":"M. Soldan, A. Pardo, J. L. Alc\u00e1zar, F. Caba, C. Zhao, S. Giancola, and B. Ghanem, ?Mad: A Scalable Dataset for Language Grounding in Videos from Movie Audio Descriptions,\" in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 5026--5035."},{"key":"e_1_3_2_1_51_1","first-page":"3234","volume-title":"ACM","author":"Nie L.","year":"2022","unstructured":"L. Nie, L. Qu, D. Meng, M. Zhang, Q. Tian, and A. D. Bimbo, ?Search-oriented micro-video captioning,\" in International Conference on Multimedia. ACM, 2022, pp. 3234--3243."},{"key":"e_1_3_2_1_52_1","first-page":"4478","volume-title":"ACM","author":"Wang X.","year":"2022","unstructured":"X. Wang, T. Gan, Y. Wei, J. Wu, D. Meng, and L. Nie, ?Micro-video tagging via jointly modeling social influence and tag relation,\" in MM '22: The 30th ACM International Conference on Multimedia, Lisboa, Portugal, October 10 - 14, 2022. ACM, 2022, pp. 4478--4486."},{"key":"e_1_3_2_1_53_1","first-page":"4171","volume-title":"?BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding,\" in Conference of the North American","author":"Devlin J.","year":"2019","unstructured":"J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova, ?BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding,\" in Conference of the North American Chapter of the Association for Computational Linguistics. Association for Computational Linguistics, 2019, pp. 4171--4186."},{"key":"e_1_3_2_1_54_1","first-page":"4489","author":"Tran D.","year":"2015","unstructured":"D. Tran, L. Bourdev, R. Fergus, L. Torresani, and M. Paluri, ?Learning Spatio temporal Features with 3d Convolutional Networks,\" in International Conference on Computer Vision, 2015, pp. 4489--4497.","journal-title":"Learning Spatio temporal Features with 3d Convolutional Networks,\" in International Conference on Computer Vision"},{"key":"e_1_3_2_1_55_1","first-page":"8748","author":"Radford A.","year":"2021","unstructured":"A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark et al., ?Learning Transferable Visual Models from Natural Language Supervision,\" in International Conference on Machine Learning, 2021, pp. 8748--8763.","journal-title":"International Conference on Machine Learning"},{"key":"e_1_3_2_1_56_1","first-page":"5977","volume":"35","author":"Liu H.","year":"2023","unstructured":"H. Liu, Y. Wei, J. Yin, and L. Nie, ?Hs-gcn: Hamming spatial graph convolutional networks for recommendation,\" IEEE Transactions on Knowledge and Data Engineering, vol. 35, pp. 5977--5990, 2023.","journal-title":"Hs-gcn: Hamming spatial graph convolutional networks for recommendation,\" IEEE Transactions on Knowledge and Data Engineering"},{"key":"e_1_3_2_1_57_1","volume-title":"?Decoupled Weight Decay Regularization,\" in International Conference on Learning Representations","author":"Loshchilov I.","year":"2019","unstructured":"I. Loshchilov and F. Hutter, ?Decoupled Weight Decay Regularization,\" in International Conference on Learning Representations, 2019."},{"key":"e_1_3_2_1_58_1","first-page":"14","volume-title":"?Cnvid-3.5m: Build, filter, and pre-train the large-scale public chinese video-text dataset,\" in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Gan T.","year":"2023","unstructured":"T. Gan, Q. Wang, X. Dong, X. Ren, L. Nie, and Q. Guo, ?Cnvid-3.5m: Build, filter, and pre-train the large-scale public chinese video-text dataset,\" in Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 14 815--14 824"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Ottawa ON Canada","acronym":"MM '23"},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612120","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612120","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:03:31Z","timestamp":1755821011000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612120"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":58,"alternative-id":["10.1145\/3581783.3612120","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612120","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}