{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T01:44:40Z","timestamp":1705542280045},"reference-count":74,"publisher":"IEEE","license":[{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,6,1]],"date-time":"2023-06-01T00:00:00Z","timestamp":1685577600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"NSFC","doi-asserted-by":"publisher","award":["62176008"]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023,6]]},"DOI":"10.1109\/cvpr52729.2023.00631","type":"proceedings-article","created":{"date-parts":[[2023,8,22]],"date-time":"2023-08-22T17:30:52Z","timestamp":1692725452000},"source":"Crossref","is-referenced-by-count":2,"title":["Iterative Proposal Refinement for Weakly-Supervised Video Grounding"],"prefix":"10.1109","author":[{"given":"Meng","family":"Cao","sequence":"first","affiliation":[{"name":"School of Electronic and Computer Engineering, Peking University"}]},{"given":"Fangyun","family":"Wei","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}]},{"given":"Can","family":"Xu","sequence":"additional","affiliation":[{"name":"Microsoft"}]},{"given":"Xiubo","family":"Geng","sequence":"additional","affiliation":[{"name":"Microsoft"}]},{"given":"Long","family":"Chen","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology"}]},{"given":"Can","family":"Zhang","sequence":"additional","affiliation":[{"name":"School of Electronic and Computer Engineering, Peking University"}]},{"given":"Yuexian","family":"Zou","sequence":"additional","affiliation":[{"name":"School of Electronic and Computer Engineering, Peking University"}]},{"given":"Tao","family":"Shen","sequence":"additional","affiliation":[{"name":"Microsoft"}]},{"given":"Daxin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Microsoft"}]}],"member":"263","reference":[{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10096514"},{"key":"ref57","article-title":"Actionclip: A new paradigm for video action recognition","author":"wang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref12","article-title":"Look closer to ground better: Weakly-supervised temporal grounding of sentence in video","author":"chen","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00331"},{"key":"ref15","first-page":"14084","article-title":"Learning to prompt for open-vocabulary ob-ject detection with vision-language model","author":"du","year":"0","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475278"},{"key":"ref14","article-title":"An image is worth 16x16 words: Trans-formers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"ICLRE"},{"key":"ref58","article-title":"Weakly supervised temporal adjacent network for language grounding","author":"wang","year":"2021","journal-title":"IEEE Transactions on Multimedia"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.326"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/WACV48630.2021.00213"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00832"},{"key":"ref55","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"NeurIPS"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007379606734"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"ref17","first-page":"1473","article-title":"From captions to vi-sual concepts and back","author":"fang","year":"0","journal-title":"Proceedings of the IEEE Con-ference on Computer Vision and Pattern Recognition"},{"key":"ref16","article-title":"Weakly supervised dense event captioning in videos","volume":"31","author":"duan","year":"2018","journal-title":"Advances in Neural Infor-Mation Processing Systems"},{"key":"ref19","first-page":"214","article-title":"Multi modal transformer for video retrieval","author":"gabeur","year":"2020","journal-title":"European Conference on Computer Vision"},{"key":"ref18","article-title":"Weak supervision and referring attention for temporal-textual association learning","author":"fang","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"ref50","article-title":"Weakly-supervised multi-level attentional reconstruction network for grounding textual queries in videos","author":"song","year":"2020","journal-title":"ArXiv Preprint"},{"key":"ref46","article-title":"Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter","author":"sanh","year":"2019","journal-title":"ArXiv Preprint"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01061"},{"key":"ref48","first-page":"510","article-title":"Hollywood in homes: Crowdsourcing data collection for activity under-standing","author":"sigurdsson","year":"2016","journal-title":"ECCV"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.155"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.111"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01186"},{"key":"ref44","first-page":"8748","article-title":"Learning transferable visual models from natural language super-vision","author":"radford","year":"2021","journal-title":"International Conference on Machine Learning"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-016-2525-z"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3193752"},{"key":"ref7","first-page":"38","article-title":"Locvtp: Video-text pre-training for temporal localization","author":"cao","year":"0","journal-title":"Computer Vision-ECCV 2022 17th European Conference Tel Aviv Israel October 23&#x2013;27 2022 Proceedings Part XXVI"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.502"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.311"},{"key":"ref6","first-page":"4967","article-title":"Correspondence matters for video referring expression compre-hension","author":"cao","year":"0","journal-title":"Proceedings of the 30th ACM International Con-ference on Multimedia"},{"key":"ref5","first-page":"9810","article-title":"On pursuit of designing multi-modal trans-former for video grounding","author":"cao","year":"0","journal-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Pro-cessing"},{"key":"ref40","first-page":"156","article-title":"Vlanet: Video-language alignment network for weakly-supervised video moment re-trieval","author":"ma","year":"0","journal-title":"European Conference on Computer Vision"},{"key":"ref35","article-title":"Generating templated caption for video grounding","author":"li","year":"2023","journal-title":"ArXiv Preprint"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.382"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6820"},{"key":"ref36","article-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm","author":"li","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref31","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014","journal-title":"ArXiv Preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01653-1"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.381"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref1","first-page":"2425","article-title":"V qa: Visual question answering","author":"antol","year":"0","journal-title":"Proceedings of the IEEE International Conference on Computer Vision"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.07.028"},{"key":"ref38","first-page":"1298","article-title":"Completeness modeling and context separation for weakly su-pervised temporal action localization","author":"liu","year":"0","journal-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.317"},{"key":"ref70","first-page":"18123","article-title":"Counterfactual contrastive learning for weakly-supervised vision-language grounding","volume":"33","author":"zhang","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01511"},{"key":"ref72","article-title":"Weakly supervised video moment localization with con-trastive negative sample mining","volume":"1","author":"zheng","year":"0","journal-title":"Proceedings of the AAAI Conference on Artificial Intelligence"},{"key":"ref24","first-page":"9729","article-title":"Momentum contrast for unsupervised visual rep-resentation learning","author":"he","year":"0","journal-title":"Proceedings of the IEEE\/CVF Con-ference on Computer Vision and Pattern Recognition"},{"key":"ref68","article-title":"Tip-adapter: Training-free clip-adapter for better vision-language modeling","author":"zhang","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref23","article-title":"Open-vocabulary object detection via vision and language knowledge distillation","author":"gu","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01575"},{"key":"ref26","first-page":"4555","article-title":"Naturallanguage object re-trieval","author":"hu","year":"0","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413967"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.563"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"ref63","doi-asserted-by":"crossref","first-page":"3252","DOI":"10.1109\/TIP.2021.3058614","article-title":"Local correspondence network for weakly supervised temporal sentence grounding","volume":"30","author":"yang","year":"2021","journal-title":"IEEE Transactions on Image Processing"},{"key":"ref22","article-title":"Clip-adapter: Better vision-language models with feature adapters","author":"gao","year":"2021","journal-title":"ArXiv Preprint"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2019.2922108"},{"key":"ref21","first-page":"1481","article-title":"Wslln: Weakly supervised natural language local-ization networks","author":"gao","year":"0","journal-title":"Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)"},{"key":"ref65","article-title":"Marginalized average attentional network for weakly-supervised learning","author":"yuan","year":"0","journal-title":"ICLR 2019-Seventh International Conference on Learning Representations"},{"key":"ref28","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"jia","year":"2021","journal-title":"International Conference on Machine Learning"},{"key":"ref27","first-page":"7199","article-title":"Cross-sentence temporal and semantic relations in video ac-tivity localisation","author":"huang","year":"0","journal-title":"Proceedings of the IEEE\/CVF Inter-national Conference on Computer Vision"},{"key":"ref29","first-page":"1377","article-title":"Deep self-taught learning for weakly supervised ob-ject localization","author":"jie","year":"0","journal-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition"},{"key":"ref60","first-page":"1283","article-title":"Rein-forcement learning for weakly supervised temporal grounding of natural language in untrimmed videos","author":"wu","year":"0","journal-title":"Proceedings of the 28th ACM International Conference on Multimedia"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00846"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"}],"event":{"name":"2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Vancouver, BC, Canada","start":{"date-parts":[[2023,6,17]]},"end":{"date-parts":[[2023,6,24]]}},"container-title":["2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/10203037\/10203050\/10203782.pdf?arnumber=10203782","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,11]],"date-time":"2023-09-11T17:54:00Z","timestamp":1694454840000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10203782\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6]]},"references-count":74,"URL":"http:\/\/dx.doi.org\/10.1109\/cvpr52729.2023.00631","relation":{},"subject":[],"published":{"date-parts":[[2023,6]]}}}