{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:07:15Z","timestamp":1755839235084,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,26]],"date-time":"2021-10-26T00:00:00Z","timestamp":1635206400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,26]]},"DOI":"10.1145\/3459637.3481937","type":"proceedings-article","created":{"date-parts":[[2021,11,15]],"date-time":"2021-11-15T15:53:43Z","timestamp":1636991623000},"page":"4341-4351","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Multi-modal Dictionary BERT for Cross-modal Video Search in Baidu Advertising"],"prefix":"10.1145","author":[{"given":"Tan","family":"Yu","sequence":"first","affiliation":[{"name":"Baidu Research, Bellevue, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Yang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yi","family":"Li","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lin","family":"Liu","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mingming","family":"Sun","sequence":"additional","affiliation":[{"name":"Baidu Research, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ping","family":"Li","sequence":"additional","affiliation":[{"name":"Baidu Research, Bellevue, WA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,10,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.5555\/3042817.3043076"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939812"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/3016387.3016390"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2339530.2339652"},{"volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)","year":"2019","author":"Devlin Jacob","key":"e_1_3_2_1_6_1"},{"volume-title":"Proceedings of the British Machine Vision Conference (BMVC)","year":"2018","author":"Faghri Fartash","key":"e_1_3_2_1_7_1"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330651"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3340531.3412162"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.285"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.5555\/2999792.2999849"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58548-8_13"},{"key":"e_1_3_2_1_13_1","unstructured":"Zhe Gan Yen-Chun Chen Linjie Li Chen Zhu Yu Cheng and Jingjing Liu. 2020. Large-Scale Adversarial Training for Vision-and-Language Representation Learning. In Advances in Neural Information Processing Systems (NeurIPS). virtual. Zhe Gan Yen-Chun Chen Linjie Li Chen Zhu Yu Cheng and Jingjing Liu. 2020. Large-Scale Adversarial Training for Vision-and-Language Representation Learning. In Advances in Neural Information Processing Systems (NeurIPS). virtual."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.240"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3272007"},{"volume-title":"Retrieve Fast","year":"1920","author":"Geigle Gregor","key":"e_1_3_2_1_16_1"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-013-0658-4"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/0899766042321814"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.195"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.767"},{"volume-title":"Pixel-BERT: Aligning image pixels with text by deep multi-modal transformers. arXiv preprint arXiv:2004.00849","year":"2020","author":"Huang Zhicheng","key":"e_1_3_2_1_22_1"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2010.57"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"volume-title":"Fahad Shahbaz Khan, and Mubarak Shah.","year":"2021","author":"Khan Salman","key":"e_1_3_2_1_25_1"},{"volume-title":"Proceedings of the NeurIPS 2019 Workshop on Visually Grounded Interaction and Language (ViGIL)","year":"2019","author":"Kiela Douwe","key":"e_1_3_2_1_26_1"},{"key":"e_1_3_2_1_27_1","unstructured":"Douwe Kiela Hamed Firooz Aravind Mohan Vedanuj Goswami Amanpreet Singh Pratik Ringshia and Davide Testuggine. 2020. The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes. In Advances in Neural Information Processing Systems (NeurIPS). Virtual. Douwe Kiela Hamed Firooz Aravind Mohan Vedanuj Goswami Amanpreet Singh Pratik Ringshia and Davide Testuggine. 2020. The Hateful Memes Challenge: Detecting Hate Speech in Multimodal Memes. In Advances in Neural Information Processing Systems (NeurIPS). Virtual."},{"volume-title":"Stacked Cross Attention for Image-Text Matching. In Computer Vision - ECCV 2018 Proceedings of the 15th European Conference on Computer Vision (ECCV), Part IV","year":"2018","author":"Lee Kuang-Huei","key":"e_1_3_2_1_28_1"},{"volume-title":"Proceedings of the 9th International Conference on Learning Representations (ICLR)","year":"2021","author":"Lee Sangho","key":"e_1_3_2_1_29_1"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401238"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.emnlp-main.161"},{"volume-title":"A Closer Look at the Robustness of Vision-and-Language Pre-trained Models. arXiv preprint arXiv:2012.08673","year":"2020","author":"Li Linjie","key":"e_1_3_2_1_33_1"},{"volume-title":"VisualBERT: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","year":"2019","author":"Li Liunian Harold","key":"e_1_3_2_1_34_1"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454289"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01045"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Xiaopeng Lu Tiancheng Zhao and Kyusong Lee. 2021. VisualSparta: An Embarrassingly Simple Approach to Large-scale Text-to-Image Search with Weighted Bag-of-words. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL\/IJCNLP). Virtual Event 5020--5029. Xiaopeng Lu Tiancheng Zhao and Kyusong Lee. 2021. VisualSparta: An Embarrassingly Simple Approach to Large-scale Text-to-Image Search with Weighted Bag-of-words. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (ACL\/IJCNLP). Virtual Event 5020--5029.","DOI":"10.18653\/v1\/2021.acl-long.389"},{"volume-title":"UniVL: A unified video and language pre-training model for multimodal understanding and generation. arXiv preprint arXiv:2002.06353","year":"2020","author":"Luo Huaishao","key":"e_1_3_2_1_39_1"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00970"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/2832747.2832769"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.232"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.208"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.5555\/2986459.2986587"},{"volume-title":"ImageBERT: Cross-modal pre-training with large-scale weak-supervised image- text data. arXiv preprint arXiv:2001.07966","year":"2020","author":"Qi Di","key":"e_1_3_2_1_45_1"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"volume-title":"Learning video representations using contrastive bidirectional transformer. arXiv preprint arXiv:1906.05743","year":"2019","author":"Sun Chen","key":"e_1_3_2_1_47_1"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00756"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.naacl-main.77"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"volume-title":"MiniVLM: A Smaller and Faster Vision-Language Model. arXiv preprint arXiv:2012.06946","year":"2020","author":"Wang Jianfeng","key":"e_1_3_2_1_51_1"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457236"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357833"},{"volume-title":"Proceedings of the Thirty-Fifth AAAI Conference on Artificial Intelligence (AAAI). Virtual Event, 3208--3216","year":"2021","author":"Yu Fei","key":"e_1_3_2_1_54_1"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE51399.2021.00225"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482233"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403297"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462924"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE48307.2020.00094"},{"volume-title":"Proceedings of the 3rd Conference on Machine Learning and Systems (MLSys)","year":"2020","author":"Zhao Weijie","key":"e_1_3_2_1_61_1"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3358045"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00414"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3455025"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00877"}],"event":{"name":"CIKM '21: The 30th ACM International Conference on Information and Knowledge Management","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGIR ACM Special Interest Group on Information Retrieval"],"location":"Virtual Event Queensland Australia","acronym":"CIKM '21"},"container-title":["Proceedings of the 30th ACM International Conference on Information &amp; Knowledge Management"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3459637.3481937","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3459637.3481937","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:58Z","timestamp":1750193338000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3459637.3481937"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,26]]},"references-count":66,"alternative-id":["10.1145\/3459637.3481937","10.1145\/3459637"],"URL":"https:\/\/doi.org\/10.1145\/3459637.3481937","relation":{},"subject":[],"published":{"date-parts":[[2021,10,26]]},"assertion":[{"value":"2021-10-30","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}