{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,25]],"date-time":"2025-09-25T18:20:57Z","timestamp":1758824457363},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2022,8,21]],"date-time":"2022-08-21T00:00:00Z","timestamp":1661040000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2022,8,21]],"date-time":"2022-08-21T00:00:00Z","timestamp":1661040000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2022,8,21]]},"DOI":"10.1109\/icpr56361.2022.9956607","type":"proceedings-article","created":{"date-parts":[[2022,11,29]],"date-time":"2022-11-29T19:34:13Z","timestamp":1669750453000},"page":"5002-5009","source":"Crossref","is-referenced-by-count":5,"title":["Cross-modal Contrastive Distillation for Instructional Activity Anticipation"],"prefix":"10.1109","author":[{"given":"Zhengyuan","family":"Yang","sequence":"first","affiliation":[{"name":"University of Rochester,Department of Computer Science"}]},{"given":"Jingen","family":"Liu","sequence":"additional","affiliation":[{"name":"JD AI Research"}]},{"given":"Jing","family":"Huang","sequence":"additional","affiliation":[{"name":"JD AI Research"}]},{"given":"Xiaodong","family":"He","sequence":"additional","affiliation":[{"name":"JD AI Research"}]},{"given":"Tao","family":"Mei","sequence":"additional","affiliation":[{"name":"JD AI Research"}]},{"given":"Chenliang","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Rochester,Department of Computer Science"}]},{"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"University of Rochester,Department of Computer Science"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"article-title":"Attention is all you need","year":"2017","author":"vaswani","key":"ref38"},{"article-title":"Vse++: Improving visual-semantic embeddings with hard negatives","year":"2017","author":"faghri","key":"ref33"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01088"},{"article-title":"Language models are few-shot learners","year":"2020","author":"brown","key":"ref37"},{"article-title":"An empirical study of gpt-3 for few-shot knowledge-based vqa","year":"2021","author":"yang","key":"ref36"},{"article-title":"Multimodal few-shot learning with frozen language models","year":"2021","author":"tsimpoukelli","key":"ref35"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00475"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.110"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.493"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0683-3"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10578-9_45"},{"key":"ref14","first-page":"0","article-title":"Action anticipation by predicting future dynamic images","author":"rodriguez","year":"2018","journal-title":"Proceedings of the European Conference on Computer Vision (ECCV) Workshops"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01025"},{"article-title":"Distilling the knowledge in a neural network","year":"2015","author":"hinton","key":"ref16"},{"article-title":"Fitnets: Hints for thin deep nets","year":"2014","author":"romero","key":"ref17"},{"key":"ref18","article-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer","author":"zagoruyko","year":"2017","journal-title":"ICLRE"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.309"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00146"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00095"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00048"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.18"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01016"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00483"},{"key":"ref5","first-page":"5343","article-title":"When will you do what?-anticipating temporal occurrences of activities","author":"abu farha","year":"2018","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00895"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58621-8_20"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.365"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00868"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126349"},{"key":"ref46","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"papineni","year":"2002","journal-title":"Proceedings of the 40th Annual Meeting on Association for Computational Linguistics  - ACL '02"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.211"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00911"},{"article-title":"Meal v2: Boosting vanilla resnet-50 to 80%+ top-1 accuracy on imagenet without tricks","year":"2020","author":"shen","key":"ref48"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00635"},{"article-title":"Adam: A method for stochastic optimization","year":"2014","author":"kingma","key":"ref47"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3021497"},{"key":"ref41","first-page":"121","article-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks","author":"li","year":"2020","journal-title":"European Conference on Computer Vision"},{"article-title":"Long-term anticipation of activities with cycle consistency","year":"2020","author":"farha","key":"ref23"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.327"},{"key":"ref26","first-page":"8081","article-title":"Learning to specialize with knowledge distillation for visual question answering","author":"mun","year":"2018","journal-title":"Advances in neural information processing systems"},{"article-title":"Professor forcing: A new algorithm for training recurrent networks","year":"2016","author":"lamb","key":"ref43"},{"article-title":"Contrastive representation distillation","year":"2019","author":"tian","key":"ref25"}],"event":{"name":"2022 26th International Conference on Pattern Recognition (ICPR)","start":{"date-parts":[[2022,8,21]]},"location":"Montreal, QC, Canada","end":{"date-parts":[[2022,8,25]]}},"container-title":["2022 26th International Conference on Pattern Recognition (ICPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/9956007\/9955631\/09956607.pdf?arnumber=9956607","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,19]],"date-time":"2022-12-19T20:06:34Z","timestamp":1671480394000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9956607\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,21]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/icpr56361.2022.9956607","relation":{},"subject":[],"published":{"date-parts":[[2022,8,21]]}}}