{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T05:06:53Z","timestamp":1755839213553,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Project supported by the Foundation for Innovative Research Groups of the National Natural Science Foundation of China","award":["61921003"],"award-info":[{"award-number":["61921003"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3475637","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T06:09:05Z","timestamp":1634537345000},"page":"5203-5212","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["Dense Contrastive Visual-Linguistic Pretraining"],"prefix":"10.1145","author":[{"given":"Lei","family":"Shi","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Kai","family":"Shuang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"given":"Shijie","family":"Geng","sequence":"additional","affiliation":[{"name":"Rutgers University, Piscataway, NJ, USA"}]},{"given":"Peng","family":"Gao","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory, Shanghai, China"}]},{"given":"Zuohui","family":"Fu","sequence":"additional","affiliation":[{"name":"Rutgers University, Piscataway, NJ, USA"}]},{"given":"Gerard","family":"de 
Melo","sequence":"additional","affiliation":[{"name":"Hasso Plattner Institute, University of Potsdam, Potsdam, Germany"}]},{"given":"Yunpeng","family":"Chen","sequence":"additional","affiliation":[{"name":"YITU Technology, Beijing, China"}]},{"given":"Sen","family":"Su","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"volume-title":"Learning Visual Representations with Caption Annotations. arXiv e-prints","year":"2020","author":"Sariyildiz Mert Bulent","key":"e_1_3_2_1_2_1"},{"volume-title":"A simple framework for contrastive learning of visual representations. arXiv preprint arXiv:2002.05709","year":"2020","author":"Chen Ting","key":"e_1_3_2_1_3_1"},{"volume-title":"Improved Baselines with Momentum Contrastive Learning. arXiv preprint arXiv:2003.04297","year":"2020","author":"Chen Xinlei","key":"e_1_3_2_1_4_1"},{"volume-title":"Faisal Ahmed, Zhe Gan, Yu Cheng, and Jingjing Liu.","year":"2019","author":"Chen Yen-Chun","key":"e_1_3_2_1_5_1"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"volume-title":"VirTex: Learning Visual Representations from Textual Annotations. arXiv preprint arXiv:2006.06666","year":"2020","author":"Desai Karan","key":"e_1_3_2_1_7_1"},{"volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","year":"2018","author":"Devlin Jacob","key":"e_1_3_2_1_8_1"},{"volume-title":"Large-Scale Adversarial Training for Vision-and-Language Representation Learning. 
arXiv preprint arXiv:2006.06195","year":"2020","author":"Gan Zhe","key":"e_1_3_2_1_9_1"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00680"},{"volume-title":"AAAI Conference on Artificial Intelligence.","year":"2021","author":"Geng Shijie","key":"e_1_3_2_1_11_1"},{"volume-title":"Explaining and harnessing adversarial examples. arXiv preprint arXiv:1412.6572","year":"2014","author":"Goodfellow Ian J","key":"e_1_3_2_1_12_1"},{"volume-title":"Making the V in VQA Matter: Elevating the Role of Image Understanding in Visual Question Answering. In Conference on Computer Vision and Pattern Recognition (CVPR).","year":"2017","author":"Goyal Yash","key":"e_1_3_2_1_13_1"},{"volume-title":"Zhaohan Daniel Guo, Mohammad Gheshlaghi Azar, et al.","year":"2020","author":"Grill Jean-Bastien","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.100"},{"volume-title":"Momentum contrast for unsupervised visual representation learning. arXiv preprint arXiv:1911.05722","year":"2019","author":"He Kaiming","key":"e_1_3_2_1_16_1"},{"volume-title":"AdCo: Adversarial Contrast for Efficient Learning of Unsupervised Representations from Self-Trained Negative Adversaries. arXiv preprint arXiv:2011.08435","year":"2020","author":"Hu Qianjiang","key":"e_1_3_2_1_17_1"},{"volume-title":"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering. Conference on Computer Vision and Pattern Recognition (CVPR)","year":"2019","author":"Hudson Drew A","key":"e_1_3_2_1_18_1"},{"volume-title":"Robust pre-training by adversarial contrastive learning. 
arXiv preprint arXiv:2010.13337","year":"2020","author":"Jiang Ziyu","key":"e_1_3_2_1_19_1"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00202"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.5555\/2999134.2999257"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01041"},{"volume-title":"Visualbert: A simple and performant baseline for vision and language. arXiv preprint arXiv:1908.03557","year":"2019","author":"Li Liunian Harold","key":"e_1_3_2_1_25_1"},{"volume-title":"Oscar: Object-semantics aligned pre-training for vision-language tasks. arXiv preprint arXiv:2004.06165","year":"2020","author":"Li Xiujun","key":"e_1_3_2_1_26_1"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","year":"2019","author":"Liu Yinhan","key":"e_1_3_2_1_28_1"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454289"},{"volume-title":"International Conference on Learning Representations. 
https:\/\/openreview.net\/forum?id=rJzIBfZAb","year":"2018","author":"Madry Aleksander","key":"e_1_3_2_1_30_1"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.17"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_48"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46466-4_5"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.5555\/2986459.2986587"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al.","year":"2021","author":"Radford Alec","key":"e_1_3_2_1_36_1"},{"key":"e_1_3_2_1_37_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. 2018. Improving language understanding by generative pre-training. (2018).  Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2979270"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969239.2969250"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/3454287.3454589"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"volume-title":"Multi-Layer Content Interaction Through Quaternion Product For Visual Question Answering. In IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 4412--4416","year":"2020","author":"Shi Lei","key":"e_1_3_2_1_42_1"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-2034"},{"volume-title":"A corpus for reasoning about natural language grounded in photographs. 
arXiv preprint arXiv:1811.00491","year":"2018","author":"Suhr Alane","key":"e_1_3_2_1_44_1"},{"volume-title":"Intriguing properties of neural networks. arXiv preprint arXiv:1312.6199","year":"2013","author":"Szegedy Christian","key":"e_1_3_2_1_45_1"},{"volume-title":"Lxmert: Learning cross-modality encoder representations from transformers. arXiv preprint arXiv:1908.07490","year":"2019","author":"Tan Hao","key":"e_1_3_2_1_46_1"},{"volume-title":"Contrastive multiview coding. arXiv preprint arXiv:1906.05849","year":"2019","author":"Tian Yonglong","key":"e_1_3_2_1_47_1"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_49_1","unstructured":"Yonghui Wu Mike Schuster Zhifeng Chen Quoc V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016).  Yonghui Wu Mike Schuster Zhifeng Chen Quoc V Le Mohammad Norouzi Wolfgang Macherey Maxim Krikun Yuan Cao Qin Gao Klaus Macherey et al. 2016. Google's neural machine translation system: Bridging the gap between human and machine translation. arXiv preprint arXiv:1609.08144 (2016)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00393"},{"volume-title":"Visual entailment: A novel task for fine-grained image understanding. arXiv preprint arXiv:1901.06706","year":"2019","author":"Xie Ning","key":"e_1_3_2_1_51_1"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00644"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00688"},{"key":"e_1_3_2_1_54_1","unstructured":"Chen Zhu Yu Cheng Zhe Gan Siqi Sun Tom Goldstein and Jingjing Liu. 2020. FreeLB: Enhanced Adversarial Training for Natural Language Understanding. In ICLR. 
https:\/\/openreview.net\/forum?id=BygzbyHFvB  Chen Zhu Yu Cheng Zhe Gan Siqi Sun Tom Goldstein and Jingjing Liu. 2020. FreeLB: Enhanced Adversarial Training for Natural Language Understanding. In ICLR. https:\/\/openreview.net\/forum?id=BygzbyHFvB"}],"event":{"name":"MM '21: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Virtual Event China","acronym":"MM '21"},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475637","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3475637","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:48:24Z","timestamp":1750193304000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3475637"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":54,"alternative-id":["10.1145\/3474085.3475637","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3475637","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}