{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T06:27:44Z","timestamp":1768544864647,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,12,15]],"date-time":"2023-12-15T00:00:00Z","timestamp":1702598400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"Science and Engineering Research Board","doi-asserted-by":"publisher","award":["SRG\/2021\/001417"],"award-info":[{"award-number":["SRG\/2021\/001417"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,12,15]]},"DOI":"10.1145\/3627631.3627637","type":"proceedings-article","created":{"date-parts":[[2024,1,31]],"date-time":"2024-01-31T12:08:32Z","timestamp":1706702912000},"page":"1-7","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["ViLP: Knowledge Exploration using Vision, Language, and Pose Embeddings for Video Action Recognition"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-1797-5121","authenticated-orcid":false,"given":"Soumyabrata","family":"Chaudhuri","sequence":"first","affiliation":[{"name":"IIT Bhubaneswar, IN"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1273-7969","authenticated-orcid":false,"given":"Saumik","family":"Bhattacharya","sequence":"additional","affiliation":[{"name":"E&amp;ECE, Indian Institute of Technology Kharagpur, IN"}]}],"member":"320","published-online":{"date-parts":[[2024,1,31]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00676"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-25446-8_4"},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings of the International Conference on Machine Learning (ICML).","author":"Bertasius Gedas","year":"2021","unstructured":"Gedas Bertasius, Heng Wang, and Lorenzo Torresani. 2021. Is Space-Time Attention All You Need for Video Understanding?. In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.331"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.910878"},{"key":"e_1_3_2_1_6_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems 33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020), 1877\u20131901."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11554-013-0370-1"},{"key":"e_1_3_2_1_9_1","volume-title":"Reddit Market Sentiment Analysis with Large Language Models. In Companion Proceedings of the ACM Web Conference","author":"Deng Xiang","year":"2023","unstructured":"Xiang Deng, Vasilisa Bashlovkina, Feng Han, Simon Baumgartner, and Michael Bendersky. 2023. LLMs to the Moon? Reddit Market Sentiment Analysis with Large Language Models. In Companion Proceedings of the ACM Web Conference 2023. 1014\u20131019."},{"key":"e_1_3_2_1_10_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_11_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings Ninth IEEE International Conference on Computer Vision. IEEE, 726\u2013733","year":"2003","unstructured":"Efros, Berg, Mori, and Malik. 2003. Recognizing action at a distance. In Proceedings Ninth IEEE International Conference on Computer Vision. IEEE, 726\u2013733."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00675"},{"key":"e_1_3_2_1_14_1","volume-title":"Uatvr: Uncertainty-adaptive text-video retrieval. arXiv preprint arXiv:2301.06309","author":"Fang Bo","year":"2023","unstructured":"Bo Fang, Chang Liu, Yu Zhou, Min Yang, Yuxin Song, Fu Li, Weiping Wang, Xiangyang Ji, Wanli Ouyang, 2023. Uatvr: Uncertainty-adaptive text-video retrieval. arXiv preprint arXiv:2301.06309 (2023)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Axel Pinz and Richard Wildes. 2016. Spatiotemporal residual networks for video action recognition. In Advances in Neural Information Processing Systems (NIPS). 3468\u20133476.","DOI":"10.1109\/CVPR.2017.787"},{"key":"e_1_3_2_1_17_1","first-page":"15908","article-title":"Transformer in transformer","volume":"34","author":"Han Kai","year":"2021","unstructured":"Kai Han, An Xiao, Enhua Wu, Jianyuan Guo, Chunjing Xu, and Yunhe Wang. 2021. Transformer in transformer. Advances in Neural Information Processing Systems 34 (2021), 15908\u201315919.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"3D convolutional neural networks for human action recognition","author":"Ji Shuiwang","year":"2012","unstructured":"Shuiwang Ji, Wei Xu, Ming Yang, and Kai Yu. 2012. 3D convolutional neural networks for human action recognition. IEEE transactions on pattern analysis and machine intelligence 35, 1 (2012), 221\u2013231."},{"key":"e_1_3_2_1_19_1","volume-title":"International conference on machine learning. PMLR, 4904\u20134916","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. 2021. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning. PMLR, 4904\u20134916."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_7"},{"key":"e_1_3_2_1_21_1","volume-title":"The kinetics human action video dataset. arXiv preprint arXiv:1705.06950","author":"Kay Will","year":"2017","unstructured":"Will Kay, Joao Carreira, Karen Simonyan, Brian Zhang, Chloe Hillier, Sudheendra Vijayanarasimhan, Fabio Viola, Tim Green, Trevor Back, Paul Natsev, 2017. The kinetics human action video dataset. arXiv preprint arXiv:1705.06950 (2017)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00099"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6836"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01345"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383131"},{"key":"e_1_3_2_1_30_1","volume-title":"Are These Birds Similar: Learning Branched Networks for Fine-grained Representations. In 2019 International Conference on Image and Vision Computing New Zealand (IVCNZ","author":"Nawaz Shah","year":"2019","unstructured":"Shah Nawaz, Alessandro Calefati, Moreno Caraffini, Nicola Landro, and Ignazio Gallo. 2019. Are These Birds Similar: Learning Branched Networks for Fine-grained Representations. In 2019 International Conference on Image and Vision Computing New Zealand (IVCNZ 2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.590"},{"key":"e_1_3_2_1_33_1","volume-title":"International conference on machine learning. PMLR, 8748\u20138763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_2_1_34_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. (2019)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19839-7_10"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2505295"},{"key":"e_1_3_2_1_38_1","volume-title":"Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems 27","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Two-stream convolutional networks for action recognition in videos. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.11212"},{"key":"e_1_3_2_1_40_1","volume-title":"UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402","author":"Soomro Khurram","year":"2012","unstructured":"Khurram Soomro, Amir\u00a0Roshan Zamir, and Mubarak Shah. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.214"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.441"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299059"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_2"},{"key":"e_1_3_2_1_46_1","volume-title":"Actionclip: A new paradigm for video action recognition. arXiv preprint arXiv:2109.08472","author":"Wang Mengmeng","year":"2021","unstructured":"Mengmeng Wang, Jiazheng Xing, and Yong Liu. 2021. Actionclip: A new paradigm for video action recognition. arXiv preprint arXiv:2109.08472 (2021)."},{"key":"e_1_3_2_1_47_1","volume-title":"Free viewpoint action recognition using motion history volumes. Computer vision and image understanding 104, 2-3","author":"Weinland Daniel","year":"2006","unstructured":"Daniel Weinland, Remi Ronfard, and Edmond Boyer. 2006. Free viewpoint action recognition using motion history volumes. Computer vision and image understanding 104, 2-3 (2006), 249\u2013257."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01031"},{"key":"e_1_3_2_1_49_1","volume-title":"Revisiting Classifier: Transferring Vision-Language Models for Video Recognition.","author":"Wu Wenhao","year":"2023","unstructured":"Wenhao Wu, Zhun Sun, and Wanli Ouyang. 2023. Revisiting Classifier: Transferring Vision-Language Models for Video Recognition. (2023)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00640"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.5244\/C.25.67"},{"key":"e_1_3_2_1_53_1","volume-title":"Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917","author":"Yu Jiahui","year":"2022","unstructured":"Jiahui Yu, Zirui Wang, Vijay Vasudevan, Legg Yeung, Mojtaba Seyedhosseini, and Yonghui Wu. 2022. Coca: Contrastive captioners are image-text foundation models. arXiv preprint arXiv:2205.01917 (2022)."},{"key":"e_1_3_2_1_54_1","volume-title":"Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432","author":"Yuan Lu","year":"2021","unstructured":"Lu Yuan, Dongdong Chen, Yi-Ling Chen, Noel Codella, Xiyang Dai, Jianfeng Gao, Houdong Hu, Xuedong Huang, Boxin Li, Chunyuan Li, 2021. Florence: A new foundation model for computer vision. arXiv preprint arXiv:2111.11432 (2021)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2896631"},{"key":"e_1_3_2_1_56_1","volume-title":"ERNIE: Enhanced language representation with informative entities. arXiv preprint arXiv:1905.07129","author":"Zhang Zhengyan","year":"2019","unstructured":"Zhengyan Zhang, Xu Han, Zhiyuan Liu, Xin Jiang, Maosong Sun, and Qun Liu. 2019. ERNIE: Enhanced language representation with informative entities. arXiv preprint arXiv:1905.07129 (2019)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531950"}],"event":{"name":"ICVGIP '23: Indian Conference on Computer Vision, Graphics and Image Processing","location":"Rupnagar India","acronym":"ICVGIP '23"},"container-title":["Proceedings of the Fourteenth Indian Conference on Computer Vision, Graphics and Image Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627637","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3627631.3627637","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:50:50Z","timestamp":1755892250000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3627631.3627637"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,15]]},"references-count":57,"alternative-id":["10.1145\/3627631.3627637","10.1145\/3627631"],"URL":"https:\/\/doi.org\/10.1145\/3627631.3627637","relation":{},"subject":[],"published":{"date-parts":[[2023,12,15]]},"assertion":[{"value":"2024-01-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}