{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:47:47Z","timestamp":1778082467218,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Zhejiang Natural Science Foundation","award":["LR19F020006"],"award-info":[{"award-number":["LR19F020006"]}]},{"name":"National Key R&D Program of China","award":["No.2020YFC0832505"],"award-info":[{"award-number":["No.2020YFC0832505"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No.61836002, No.62072397"],"award-info":[{"award-number":["No.61836002, No.62072397"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3462974","type":"proceedings-article","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T02:41:54Z","timestamp":1626057714000},"page":"1114-1124","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":44,"title":["Hierarchical Cross-Modal Graph Consistency Learning for Video-Text Retrieval"],"prefix":"10.1145","author":[{"given":"Weike","family":"Jin","sequence":"first","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhou","family":"Zhao","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Pengcheng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jieming","family":"Zhu","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiuqiang","family":"He","sequence":"additional","affiliation":[{"name":"Huawei Noah's Ark Lab, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yueting","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Zhejiang University, Hangzhou, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"e_1_3_2_2_2_1","unstructured":"Xiaojun Chang Yi Yang Alexander Hauptmann Eric P Xing and Yao-Liang Yu. 2015. Semantic concept discovery for large-scale zero-shot event detection. In Twenty-fourth international joint conference on artificial intelligence."},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/2002472.2002497"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2507880"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_2_9_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612 (2017)."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/140"},{"key":"e_1_3_2_2_12_1","volume-title":"Devise: A deep visual-semantic embedding model. In Advances in Neural Information Processing Systems. 2121--2129.","author":"Frome Andrea","year":"2013","unstructured":"Andrea Frome, Greg S Corrado, Jon Shlens, Samy Bengio, Jeff Dean, Marc'Aurelio Ranzato, and Tomas Mikolov. 2013. Devise: A deep visual-semantic embedding model. In Advances in Neural Information Processing Systems. 2121--2129."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0658-4"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00750"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.337"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578746"},{"key":"e_1_3_2_2_17_1","volume-title":"Video2vec embeddings recognize events when examples are scarce","author":"Habibian Amirhossein","year":"2016","unstructured":"Amirhossein Habibian, Thomas Mensink, and Cees GM Snoek. 2016. Video2vec embeddings recognize events when examples are scarce. IEEE transactions on pattern analysis and machine intelligence 39, 10 (2016), 2089--2103."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_2_20_1","volume-title":"Unifying visualsemantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539","author":"Kiros Ryan","year":"2014","unstructured":"Ryan Kiros, Ruslan Salakhutdinov, and Richard S Zemel. 2014. Unifying visualsemantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539 (2014)."},{"key":"e_1_3_2_2_21_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems (2012)."},{"key":"e_1_3_2_2_22_1","volume-title":"Minh-Triet Tran, Yuki Watanabe, Martin Klinkigt, et al.","author":"Le Duy-Dinh","year":"2016","unstructured":"Duy-Dinh Le, Sang Phan, Vinh-Tiep Nguyen, Benjamin Renoust, Tuan A Nguyen, Van-Nam Hoang, Thanh Duc Ngo, Minh-Triet Tran, Yuki Watanabe, Martin Klinkigt, et al. 2016. NII-HITACHI-UIT at TRECVID 2016.. In TRECVID."},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350906"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.502"},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.340"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01093"},{"key":"e_1_3_2_2_27_1","volume-title":"Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487","author":"Liu Yang","year":"2019","unstructured":"Yang Liu, Samuel Albanie, Arsha Nagrani, and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487 (2019)."},{"key":"e_1_3_2_2_28_1","volume-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265","author":"Lu Jiasen","year":"2019","unstructured":"Jiasen Lu, Dhruv Batra, Devi Parikh, and Stefan Lee. 2019. Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. arXiv preprint arXiv:1908.02265 (2019)."},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078971.3079041"},{"key":"e_1_3_2_2_30_1","volume-title":"Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781","author":"Mikolov Tomas","year":"2013","unstructured":"Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)."},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.497"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10599-4_35"},{"key":"e_1_3_2_2_35_1","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition. 2046--2054","author":"Nhi Tran Thi Quynh","year":"2016","unstructured":"Thi Quynh Nhi Tran, Herv\u00e9 Le Borgne, and Michel Crucianu. 2016. Aggregating image and text quantized correlated components. In Proceedings of the IEEE conference on computer vision and pattern recognition. 2046--2054."},{"key":"e_1_3_2_2_36_1","volume-title":"Simple bert models for relation extraction and semantic role labeling. arXiv preprint arXiv:1904.05255","author":"Shi Peng","year":"2019","unstructured":"Peng Shi and Jimmy Lin. 2019. Simple bert models for relation extraction and semantic role labeling. arXiv preprint arXiv:1904.05255 (2019)."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000014"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00208"},{"key":"e_1_3_2_2_39_1","volume-title":"VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations.","author":"Su Weijie","year":"2020","unstructured":"Weijie Su, Xizhou Zhu, Yue Cao, Bin Li, Lewei Lu, Furu Wei, and Jifeng Dai. 2020. VL-BERT: Pre-training of Generic Visual-Linguistic Representations. In International Conference on Learning Representations."},{"key":"e_1_3_2_2_40_1","volume-title":"Florida International University-University of Miami TRECVID 2019","author":"Tao Yudong","year":"2019","unstructured":"Yudong Tao, TianyiWang, Diana Machado, Raul Garcia, Yuexuan Tu, Maria Presa Reyes, Yeda Chen, Haiman Tian, Mei-Ling Shyu, and Shu-Ching Chen. 2019. Florida International University-University of Miami TRECVID 2019. TRECVID. NIST, USA (2019)."},{"key":"e_1_3_2_2_41_1","unstructured":"Kazuya Ueki Koji Hirakawa Kotaro Kikuchi Tetsuji Ogawa and Tetsunori Kobayashi. 2017. Waseda_Meisei at TRECVID 2017: Ad-hoc Video Search.. In TRECVID."},{"key":"e_1_3_2_2_42_1","volume-title":"Orderembeddings of images and language. arXiv preprint arXiv:1511.06361","author":"Vendrov Ivan","year":"2015","unstructured":"Ivan Vendrov, Ryan Kiros, Sanja Fidler, and Raquel Urtasun. 2015. Orderembeddings of images and language. arXiv preprint arXiv:1511.06361 (2015)."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00677"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_2_49_1","volume-title":"Utilizing related samples to enhance interactive concept-based video search. 13, 6","author":"Yuan Jin","year":"2011","unstructured":"Jin Yuan, Zheng-Jun Zha, Yan-Tao Zheng, MengWang, Xiangdong Zhou, and Tat-Seng Chua. 2011. Utilizing related samples to enhance interactive concept-based video search. 13, 6 (2011), 1343--1355."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462908"}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Virtual Event Canada","acronym":"SIGIR '21","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462974","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3462974","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:20Z","timestamp":1750191500000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462974"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":50,"alternative-id":["10.1145\/3404835.3462974","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3462974","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}