{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T01:11:44Z","timestamp":1780708304503,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,10,15]],"date-time":"2019-10-15T00:00:00Z","timestamp":1571097600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Beijing Natural Science Foundation","award":["4192029"],"award-info":[{"award-number":["4192029"]}]},{"name":"the Fundamental Research Funds for the Central Universities and the Research Funds of Renmin University of China","award":["18XNLG19"],"award-info":[{"award-number":["18XNLG19"]}]},{"DOI":"10.13039\/501100012659","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61672523, 61771468"],"award-info":[{"award-number":["61672523, 61771468"]}],"id":[{"id":"10.13039\/501100012659","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,10,15]]},"DOI":"10.1145\/3343031.3350906","type":"proceedings-article","created":{"date-parts":[[2019,10,21]],"date-time":"2019-10-21T16:32:26Z","timestamp":1571675546000},"page":"1786-1794","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":95,"title":["W2VV++"],"prefix":"10.1145","author":[{"given":"Xirong","family":"Li","sequence":"first","affiliation":[{"name":"Renmin University of China &amp; University of Science and Technology of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chaoxi","family":"Xu","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Gang","family":"Yang","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhineng","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Automation, Chinese Academy of Sciences, Beijing, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianfeng","family":"Dong","sequence":"additional","affiliation":[{"name":"Zhejiang Gongshang University, Hangzhou, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2019,10,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TRECVID 2018: Benchmarking Video Activity Detection, Video Captioning and Matching, Video Storytelling Linking and Video Search. In TRECVID .","author":"Awad G."},{"key":"e_1_3_2_1_2_1","volume-title":"TRECVID 2017: Evaluating ad-hoc and instance video search, events detection, video captioning and hyperlinking. In TRECVID .","author":"Awad G."},{"key":"e_1_3_2_1_3_1","volume-title":"TRECVID 2016: Evaluating Video Search, Video Event Detection, Localization, and Hyperlinking. In TRECVID .","author":"Awad G."},{"key":"e_1_3_2_1_4_1","unstructured":"M. Bastan X. Shi J. Gu Z. Heng C. Zhuo D. Sng and A. Kot. 2018. NTU ROSE Lab at TRECVID 2018: Ad-hoc Video Search and Video to Text. In TRECVID .  M. Bastan X. Shi J. Gu Z. Heng C. Zhuo D. Sng and A. Kot. 2018. NTU ROSE Lab at TRECVID 2018: Ad-hoc Video Search and Video to Text. In TRECVID ."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"K. Cho B. Van Merri\u00ebnboer C. Gulcehre D. Bahdanau F. Bougares H. Schwenk and Y. Bengio. 2014. Learning Phrase Representations using RNN Encoder-decoder for Statistical Machine Translation. In EMNLP .  K. Cho B. Van Merri\u00ebnboer C. Gulcehre D. Bahdanau F. Bougares H. Schwenk and Y. Bengio. 2014. Learning Phrase Representations using RNN Encoder-decoder for Statistical Machine Translation. In EMNLP .","DOI":"10.3115\/v1\/D14-1179"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"crossref","unstructured":"J. Deng W. Dong R. Socher L.-J. Li K. Li and L. Fei-Fei. 2009. ImageNet: a Large-scale Hierarchical Image Database. In CVPR .  J. Deng W. Dong R. Socher L.-J. Li K. Li and L. Fei-Fei. 2009. ImageNet: a Large-scale Hierarchical Image Database. In CVPR .","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_7_1","unstructured":"J. Dong S. Huang D. Xu and D. Tao. 2017. DL-61--86 at TRECVID 2017: Video-to-Text Description. In TRECVID .  J. Dong S. Huang D. Xu and D. Tao. 2017. DL-61--86 at TRECVID 2017: Video-to-Text Description. In TRECVID ."},{"key":"e_1_3_2_1_8_1","first-page":"3377","article-title":"Predicting Visual Features from Text for Image and Video Caption Retrieval","volume":"20","author":"Dong J.","year":"2018","journal-title":"T-MM"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"J. Dong X. Li C. Xu S. Ji Y. He G. Yang and X. Wang. 2019. Dual Encoding for Zero-Example Video Retrieval. In CVPR .  J. Dong X. Li C. Xu S. Ji Y. He G. Yang and X. Wang. 2019. Dual Encoding for Zero-Example Video Retrieval. In CVPR .","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_1_10_1","first-page":"2371","article-title":"Cross-Media Similarity Evaluation for Web Image Retrieval in the Wild","volume":"20","author":"Dong J.","year":"2018","journal-title":"T-MM"},{"key":"e_1_3_2_1_11_1","unstructured":"F. Faghri D. J. Fleet J. R. Kiros and S. Fidler. 2018. VSE  F. Faghri D. J. Fleet J. R. Kiros and S. Fidler. 2018. VSE"},{"key":"e_1_3_2_1_12_1","unstructured":": Improving Visual-Semantic Embeddings with Hard Negatives. In BMVC .  : Improving Visual-Semantic Embeddings with Hard Negatives. In BMVC ."},{"key":"e_1_3_2_1_13_1","unstructured":"E. Gabrilovich and S. Markovitch. 2007. Computing Semantic Relatedness Using Wikipedia-based Explicit Semantic Analysis. In IJCAI .  E. Gabrilovich and S. Markovitch. 2007. Computing Semantic Relatedness Using Wikipedia-based Explicit Semantic Analysis. In IJCAI ."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2627563"},{"key":"e_1_3_2_1_15_1","unstructured":"P.-Y. Huang J. Liang V. Vaibhav X. Chang and A. Hauptmann. 2018. Informedia@TRECVID 2018: Ad-hoc Video Search with Discrete and Continuous Representations. In TRECVID .  P.-Y. Huang J. Liang V. Vaibhav X. Chang and A. Hauptmann. 2018. Informedia@TRECVID 2018: Ad-hoc Video Search with Discrete and Continuous Representations. In TRECVID ."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2670560"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"A. Joulin L. van der Maaten A. Jabri and N. Vasilache. 2016. Learning Visual Features from Large Weakly Supervised Data. In ECCV .  A. Joulin L. van der Maaten A. Jabri and N. Vasilache. 2016. Learning Visual Features from Large Weakly Supervised Data. In ECCV .","DOI":"10.1007\/978-3-319-46478-7_5"},{"key":"e_1_3_2_1_18_1","first-page":"1291","article-title":"Best practices for learning video concept detectors from social media examples","volume":"74","author":"Kordumova S.","year":"2015","journal-title":"MTAP"},{"key":"e_1_3_2_1_19_1","volume-title":"et almbox","author":"Le D.-D.","year":"2016"},{"key":"e_1_3_2_1_20_1","volume-title":"TGIF: A New Dataset and Benchmark on Animated GIF Description. In CVPR .","author":"Li Y.","year":"2016"},{"key":"e_1_3_2_1_21_1","volume-title":"et almbox","author":"Liang J.","year":"2016"},{"key":"e_1_3_2_1_22_1","unstructured":"J. Liang L. Jiang D. Meng and A. Hauptmann. 2016b. Learning to Detect Concepts from Webly-labeled Video Data. In IJCAI .  J. Liang L. Jiang D. Meng and A. Hauptmann. 2016b. Learning to Detect Concepts from Webly-labeled Video Data. In IJCAI ."},{"key":"e_1_3_2_1_23_1","first-page":"3361","article-title":"On Influential Trends in Interactive Video Retrieval: Video Browser Showdown 2015--2017","volume":"20","author":"Lokovc J.","year":"2018","journal-title":"T-MM"},{"key":"e_1_3_2_1_24_1","unstructured":"Y.-J. Lu H. Zhang M. de Boer and C.-W. Ngo. 2016. Event Detection with Zero Example: Select the Right and Suppress the Wrong Concepts. In ICMR .  Y.-J. Lu H. Zhang M. de Boer and C.-W. Ngo. 2016. Event Detection with Zero Example: Select the Right and Suppress the Wrong Concepts. In ICMR ."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"crossref","unstructured":"F. Markatopoulou D. Galanopoulos V. Mezaris and I. Patras. 2017. Query and Keyframe Representations for Ad-hoc Video Search. In ICMR .  F. Markatopoulou D. Galanopoulos V. Mezaris and I. Patras. 2017. Query and Keyframe Representations for Ad-hoc Video Search. In ICMR .","DOI":"10.1145\/3078971.3079041"},{"key":"e_1_3_2_1_26_1","volume-title":"ITI-CERTH Participation in TRECVID","author":"Markatopoulou F.","year":"2016"},{"key":"e_1_3_2_1_27_1","volume-title":"Semantic Model Vectors for Complex Video Event Recognition. T-MM","volume":"14","author":"Merler M.","year":"2012"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"N. Mithun J. Li F. Metze and A. K. Roy-Chowdhury. 2018. Learning Joint Embedding with Multimodal Cues for Cross-Modal Video-Text Retrieval. In ICMR .  N. Mithun J. Li F. Metze and A. K. Roy-Chowdhury. 2018. Learning Joint Embedding with Multimodal Cues for Cross-Modal Video-Text Retrieval. In ICMR .","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_1_29_1","unstructured":"P. Nguyen Q. Li Z.-Q. Cheng Y.-J. Lu H. Zhang X. Wu and C.-W. Ngo. 2017. VIREO @ TRECVID 2017: Video-to-Text Ad-hoc Video Search and Video Hyperlinking. In TRECVID .  P. Nguyen Q. Li Z.-Q. Cheng Y.-J. Lu H. Zhang X. Wu and C.-W. Ngo. 2017. VIREO @ TRECVID 2017: Video-to-Text Ad-hoc Video Search and Video Hyperlinking. In TRECVID ."},{"key":"e_1_3_2_1_30_1","unstructured":"A. Paszke S. Gross S. Chintala G. Chanan E. Yang Z. DeVito Z. Lin A. Desmaison L. Antiga and A. Lerer. 2017. Automatic differentiation in PyTorch. In NIPS-W .  A. Paszke S. Gross S. Chintala G. Chanan E. Yang Z. DeVito Z. Lin A. Desmaison L. Antiga and A. Lerer. 2017. Automatic differentiation in PyTorch. In NIPS-W ."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"L. Rossetto M. Amiri Parian R. Gasser I. Giangreco S. Heller and H. Schuldt. 2019. Deep Learning-Based Concept Detection in vitrivr. In MMM .  L. Rossetto M. Amiri Parian R. Gasser I. Giangreco S. Heller and H. Schuldt. 2019. Deep Learning-Based Concept Detection in vitrivr. In MMM .","DOI":"10.1007\/978-3-030-05716-9_55"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"M. Smucker J. Allan and B. Carterette. 2007. A Comparison of Statistical Significance Tests for Information Retrieval Evaluation. In CIKM .  M. Smucker J. Allan and B. Carterette. 2007. A Comparison of Statistical Significance Tests for Information Retrieval Evaluation. In CIKM .","DOI":"10.1145\/1321440.1321528"},{"key":"e_1_3_2_1_33_1","unstructured":"C. G. M. Snoek J. Dong X. Li X. Wang Q. Wei W. Lan E. Gavves N. Hussein D. C. Koelma and A. W. M. Smeulders. 2016. University of Amsterdam and Renmin University at TRECVID 2016: Searching Video Detecting Events and Describing Video. In TRECVID .  C. G. M. Snoek J. Dong X. Li X. Wang Q. Wei W. Lan E. Gavves N. Hussein D. C. Koelma and A. W. M. Smeulders. 2016. University of Amsterdam and Renmin University at TRECVID 2016: Searching Video Detecting Events and Describing Video. In TRECVID ."},{"key":"e_1_3_2_1_34_1","unstructured":"C. G. M. Snoek X. Li C. Xu and D. C. Koelma. 2017. University of Amsterdam and Renmin University at TRECVID 2017: Searching Video Detecting Events and Describing Video. In TRECVID .  C. G. M. Snoek X. Li C. Xu and D. C. Koelma. 2017. University of Amsterdam and Renmin University at TRECVID 2017: Searching Video Detecting Events and Describing Video. In TRECVID ."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1561\/1500000014"},{"key":"e_1_3_2_1_36_1","unstructured":"K. Ueki K. Hirakawa K. Kikuchi T. Ogawa and T. Kobayashi. 2017. Waseda_Meisei at TRECVID 2017: Ad-hoc Video Search. In TRECVID .  K. Ueki K. Hirakawa K. Kikuchi T. Ogawa and T. Kobayashi. 2017. Waseda_Meisei at TRECVID 2017: Ad-hoc Video Search. In TRECVID ."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"J. Xu T. Mei T. Yao and Y. Rui. 2016. MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In CVPR .  J. Xu T. Mei T. Yao and Y. Rui. 2016. MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In CVPR .","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"G. Ye Y. Li H. Xu D. Liu and S.-F. Chang. 2015. EventNet: A Large Scale Structured Concept Library for Complex Event Detection in Video. In ACMMM .  G. Ye Y. Li H. Xu D. Liu and S.-F. Chang. 2015. EventNet: A Large Scale Structured Concept Library for Complex Event Detection in Video. In ACMMM .","DOI":"10.1145\/2733373.2806221"},{"key":"e_1_3_2_1_39_1","unstructured":"S.-I. Yu L. Jiang Z. Xu Y. Yang and A. G. Hauptmann. 2015. Content-Based Video Search over 1 Million Videos with 1 Core in 1 Second. In ICMR .  S.-I. Yu L. Jiang Z. Xu Y. Yang and A. G. Hauptmann. 2015. Content-Based Video Search over 1 Million Videos with 1 Core in 1 Second. In ICMR ."}],"event":{"name":"MM '19: The 27th ACM International Conference on Multimedia","location":"Nice France","acronym":"MM '19","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 27th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350906","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3343031.3350906","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:13:17Z","timestamp":1750201997000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3343031.3350906"}},"subtitle":["Fully Deep Learning for Ad-hoc Video Search"],"short-title":[],"issued":{"date-parts":[[2019,10,15]]},"references-count":39,"alternative-id":["10.1145\/3343031.3350906","10.1145\/3343031"],"URL":"https:\/\/doi.org\/10.1145\/3343031.3350906","relation":{},"subject":[],"published":{"date-parts":[[2019,10,15]]},"assertion":[{"value":"2019-10-15","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}