{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:21:03Z","timestamp":1750220463788,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475245","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T21:45:34Z","timestamp":1634593534000},"page":"760-768","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":7,"title":["Weakly-Supervised Video Object Grounding via Stable Context Learning"],"prefix":"10.1145","author":[{"given":"Wei","family":"Wang","sequence":"first","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Junyu","family":"Gao","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences &amp; University of Chinese Academy of Sciences, Beijing, China"}]},{"given":"Changsheng","family":"Xu","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences; University of Chinese Academy of Sciences; &amp; Peng Cheng Laboratory, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3298023.3298199"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413614"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00425"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3078976"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2019.00177"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00269"},{"volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","year":"2018","author":"Devlin Jacob","key":"e_1_3_2_1_7_1"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059295"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2019.2947442"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00478"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Junyu Gao Tianzhu Zhang and Changsheng Xu. 2019 b. I Know the Relationships: Zero-Shot Action Recognition via Two-Stream Graph Convolutional Networks and Knowledge Graphs. In AAAI .  Junyu Gao Tianzhu Zhang and Changsheng Xu. 2019 b. I Know the Relationships: Zero-Shot Action Recognition via Two-Stream Graph Convolutional Networks and Knowledge Graphs. In AAAI .","DOI":"10.1609\/aaai.v33i01.33018303"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.2985708"},{"volume-title":"Contrastive learning for weakly supervised phrase grounding. arXiv preprint arXiv:2006.09920","year":"2020","author":"Gupta Tanmay","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.493"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00623"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"volume-title":"Spatio-temporal grounding for video question answering. arXiv preprint arXiv:1904.11574","year":"2019","author":"Lei Jie","key":"e_1_3_2_1_19_1"},{"volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","year":"2019","author":"Li Liunian Harold","key":"e_1_3_2_1_20_1"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351074"},{"volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265","year":"2019","author":"Lu Jiasen","key":"e_1_3_2_1_22_1"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.213"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.303"},{"volume-title":"Faster r-cnn: Towards real-time object detection with region proposal networks. arXiv preprint arXiv:1506.01497","year":"2015","author":"Ren Shaoqing","key":"e_1_3_2_1_26_1"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01043"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.509"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01069"},{"volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","year":"2019","author":"Tan Hao","key":"e_1_3_2_1_32_1"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2797921"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_42"},{"volume-title":"2020 b. MAF: Multimodal Alignment Framework for Weakly-Supervised Phrase Grounding. arXiv preprint arXiv:2010.05379","year":"2020","author":"Wang Qinxin","key":"e_1_3_2_1_35_1"},{"volume-title":"2020 a. Learning coarse-to-fine graph neural networks for video-text retrieval","year":"2020","author":"Wang Wei","key":"e_1_3_2_1_36_1"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.558"},{"volume-title":"Visual relation grounding in videos","author":"Xiao Junbin","key":"e_1_3_2_1_38_1"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462823"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413610"},{"key":"e_1_3_2_1_42_1","first-page":"2987","article-title":"Person reidentification via structural deep metric learning","volume":"30","author":"Yang Xun","year":"2018","journal-title":"IEEE Transactions on NNLS"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-017-1018-6"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00437"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3091882"},{"volume-title":"Object-aware multi-branch relation networks for spatio-temporal video grounding. arXiv preprint arXiv:2008.06941","year":"2020","author":"Zhang Zhu","key":"e_1_3_2_1_46_1"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00597"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00674"},{"volume-title":"Weakly-supervised video object grounding from text by loss weighting and object interaction. arXiv preprint arXiv:1805.02834","year":"2018","author":"Zhou Luowei","key":"e_1_3_2_1_49_1"}],"event":{"name":"MM '21: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Virtual Event China","acronym":"MM '21"},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475245","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475245","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:16Z","timestamp":1750193296000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475245"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":49,"alternative-id":["10.1145\/3474085.3475245","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475245","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}