{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T19:48:58Z","timestamp":1759693738844,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,10,17]],"date-time":"2021-10-17T00:00:00Z","timestamp":1634428800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,10,17]]},"DOI":"10.1145\/3474085.3481541","type":"proceedings-article","created":{"date-parts":[[2021,10,18]],"date-time":"2021-10-18T06:57:34Z","timestamp":1634540254000},"page":"1148-1156","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Once and for All"],"prefix":"10.1145","author":[{"given":"Lianghua","family":"Huang","sequence":"first","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Xiangzeng","family":"Zhou","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Ansheng","family":"You","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Ming","family":"Li","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Bin","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Yingya","family":"Zhang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Pan","family":"Pan","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]},{"given":"Xu","family":"Yinghui","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,17]]},"reference":[{"volume-title":"Youtube-8m: A large-scale video classification benchmark. arXiv","year":"2016","author":"Abu-El-Haija Sami","key":"e_1_3_2_1_1_1"},{"volume-title":"Lucas Smaira, Sander Dieleman, and Andrew Zisserman.","year":"2020","author":"Alayrac Jean-Baptiste","key":"e_1_3_2_1_2_1"},{"volume-title":"Self-supervised learning by cross-modal audio-video clustering. NeurIPS","year":"2020","author":"Alwassel Humam","key":"e_1_3_2_1_3_1"},{"key":"e_1_3_2_1_4_1","unstructured":"Yuki Markus Asano Christian Rupprecht and Andrea Vedaldi. 2020. Self-labelling via simultaneous clustering and representation learning. (2020).  Yuki Markus Asano Christian Rupprecht and Andrea Vedaldi. 2020. Self-labelling via simultaneous clustering and representation learning. (2020)."},{"volume-title":"wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. NeurIPS","year":"2020","author":"Baevski Alexei","key":"e_1_3_2_1_5_1"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/279943.279962"},{"volume-title":"et almbox","year":"2020","author":"Brown Tom B","key":"e_1_3_2_1_7_1"},{"volume-title":"A short note about kinetics-600. arXiv","year":"2018","author":"Carreira Joao","key":"e_1_3_2_1_8_1"},{"key":"e_1_3_2_1_9_1","unstructured":"Ting Chen Simon Kornblith Mohammad Norouzi and Geoffrey Hinton. 2020 b. A Simple Framework for Contrastive Learning of Visual Representations. In ICML. 1597--1607.  Ting Chen Simon Kornblith Mohammad Norouzi and Geoffrey Hinton. 2020 b. A Simple Framework for Contrastive Learning of Visual Representations. In ICML. 1597--1607."},{"volume-title":"2020 c. Big Self-Supervised Models are Strong Semi-Supervised Learners. NeurIPS","year":"2020","author":"Chen Ting","key":"e_1_3_2_1_10_1"},{"volume-title":"2020 a. Improved baselines with momentum contrastive learning. arXiv","year":"2020","author":"Chen Xinlei","key":"e_1_3_2_1_11_1"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Yu-An Chung Wei-Ning Hsu Hao Tang and James Glass. 2019. An unsupervised autoregressive model for speech representation learning. In Interspeech.  Yu-An Chung Wei-Ning Hsu Hao Tang and James Glass. 2019. An unsupervised autoregressive model for speech representation learning. In Interspeech.","DOI":"10.21437\/Interspeech.2019-1473"},{"volume-title":"Imagenet: A large-scale hierarchical image database. In CVPR. 248--255.","year":"2009","author":"Deng Jia","key":"e_1_3_2_1_13_1"},{"volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In ACL.","year":"2019","author":"Devlin Jacob","key":"e_1_3_2_1_14_1"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Jianfeng Dong Xirong Li Chaoxi Xu Shouling Ji Yuan He Gang Yang and Xun Wang. 2019. Dual encoding for zero-example video retrieval. In CVPR. 9346--9355.  Jianfeng Dong Xirong Li Chaoxi Xu Shouling Ji Yuan He Gang Yang and Xun Wang. 2019. Dual encoding for zero-example video retrieval. In CVPR. 9346--9355.","DOI":"10.1109\/CVPR.2019.00957"},{"volume-title":"et almbox","year":"2021","author":"Dosovitskiy Alexey","key":"e_1_3_2_1_16_1"},{"volume-title":"E. V. D. Westhuizen, Lisa van Staden, and H. Kamper.","year":"2019","author":"Eloff Ryan","key":"e_1_3_2_1_17_1"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"Christoph Feichtenhofer Haoqi Fan Jitendra Malik and Kaiming He. 2019. Slowfast networks for video recognition. In ICCV. 6202--6211.  Christoph Feichtenhofer Haoqi Fan Jitendra Malik and Kaiming He. 2019. Slowfast networks for video recognition. In ICCV. 6202--6211.","DOI":"10.1109\/ICCV.2019.00630"},{"volume-title":"Dylan Freedman, Aren Jansen, Wade Lawrence, R Channing Moore, Manoj Plakal, and Marvin Ritter.","year":"2017","author":"Gemmeke Jort F","key":"e_1_3_2_1_19_1"},{"volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv","year":"2017","author":"Goyal Priya","key":"e_1_3_2_1_20_1"},{"volume-title":"Memory-augmented Dense Predictive Coding for Video Representation Learning. In European Conference on Computer Vision.","year":"2020","author":"Han Tengda","key":"e_1_3_2_1_21_1"},{"volume-title":"2020 b. Self-supervised co-training for video representation learning. NeurIPS","year":"2020","author":"Han Tengda","key":"e_1_3_2_1_22_1"},{"key":"e_1_3_2_1_23_1","unstructured":"Kaiming He Haoqi Fan Yuxin Wu Saining Xie and Ross Girshick. 2020. Momentum contrast for unsupervised visual representation learning. In CVPR.  Kaiming He Haoqi Fan Yuxin Wu Saining Xie and Ross Girshick. 2020. Momentum contrast for unsupervised visual representation learning. In CVPR."},{"key":"e_1_3_2_1_24_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2011.6126543"},{"volume-title":"Automatic Curation of Large-Scale Datasets for Audio-Visual Representation Learning. arXiv","year":"2021","author":"Lee Sangho","key":"e_1_3_2_1_26_1"},{"key":"e_1_3_2_1_27_1","unstructured":"Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Doll\u00e1r. 2017. Focal loss for dense object detection. In ICCV. 2980--2988.  Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Doll\u00e1r. 2017. Focal loss for dense object detection. In ICCV. 2980--2988."},{"volume-title":"Train a One-Million-Way Instance Classifier for Unsupervised Visual Representation Learning. AAAI","year":"2021","author":"Liu Yu","key":"e_1_3_2_1_28_1"},{"key":"e_1_3_2_1_29_1","unstructured":"Chenxu Luo and Alan L Yuille. 2019. Grouped spatial-temporal aggregation for efficient action recognition. In ICCV. 5512--5521.  Chenxu Luo and Alan L Yuille. 2019. Grouped spatial-temporal aggregation for efficient action recognition. In ICCV. 5512--5521."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.5555\/3305890.3305916"},{"key":"e_1_3_2_1_31_1","unstructured":"Shuang Ma Daniel McDuff and Yale Song. 2019. Unpaired image-to-speech synthesis with multimodal information bottleneck. In ICCV. 7598--7607.  Shuang Ma Daniel McDuff and Yale Song. 2019. Unpaired image-to-speech synthesis with multimodal information bottleneck. In ICCV. 7598--7607."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"crossref","unstructured":"Antoine Miech Jean-Baptiste Alayrac Lucas Smaira Ivan Laptev Josef Sivic and Andrew Zisserman. 2020. End-to-end learning of visual representations from uncurated instructional videos. In CVPR. 9879--9889.  Antoine Miech Jean-Baptiste Alayrac Lucas Smaira Ivan Laptev Josef Sivic and Andrew Zisserman. 2020. End-to-end learning of visual representations from uncurated instructional videos. In CVPR. 9879--9889.","DOI":"10.1109\/CVPR42600.2020.00990"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Antoine Miech Dimitri Zhukov Jean-Baptiste Alayrac Makarand Tapaswi Ivan Laptev and Josef Sivic. 2019. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In ICCV. 2630--2640.  Antoine Miech Dimitri Zhukov Jean-Baptiste Alayrac Makarand Tapaswi Ivan Laptev and Josef Sivic. 2019. Howto100m: Learning a text-video embedding by watching hundred million narrated video clips. In ICCV. 2630--2640.","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240712"},{"volume-title":"Audio-visual instance discrimination with cross-modal agreement. arXiv","year":"2020","author":"Morgado Pedro","key":"e_1_3_2_1_35_1"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"crossref","unstructured":"Mehdi Noroozi Ananth Vinjimoor Paolo Favaro and Hamed Pirsiavash. 2018. Boosting self-supervised learning via knowledge transfer. In CVPR. 9359--9367.  Mehdi Noroozi Ananth Vinjimoor Paolo Favaro and Hamed Pirsiavash. 2018. Boosting self-supervised learning via knowledge transfer. In CVPR. 9359--9367.","DOI":"10.1109\/CVPR.2018.00975"},{"volume-title":"Multi-modal self-supervision from generalized data transformations. arXiv","year":"2020","author":"Patrick Mandela","key":"e_1_3_2_1_37_1"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2733373.2806390"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"AJ Piergiovanni Anelia Angelova and Michael S Ryoo. 2020. Evolving losses for unsupervised video representation learning. In CVPR. 133--142.  AJ Piergiovanni Anelia Angelova and Michael S Ryoo. 2020. Evolving losses for unsupervised video representation learning. In CVPR. 133--142.","DOI":"10.1109\/CVPR42600.2020.00021"},{"volume-title":"Spatiotemporal contrastive video representation learning. arXiv","year":"2020","author":"Qian Rui","key":"e_1_3_2_1_40_1"},{"key":"e_1_3_2_1_41_1","unstructured":"Zhaofan Qiu Ting Yao and Tao Mei. 2017. Learning spatio-temporal representation with pseudo-3d residual networks. In ICCV. 5533--5541.  Zhaofan Qiu Ting Yao and Tao Mei. 2017. Learning spatio-temporal representation with pseudo-3d residual networks. In ICCV. 5533--5541."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Mark Sandler Andrew Howard Menglong Zhu Andrey Zhmoginov and Liang-Chieh Chen. 2018. Mobilenetv2: Inverted residuals and linear bottlenecks. In CVPR. 4510--4520.  Mark Sandler Andrew Howard Menglong Zhu Andrey Zhmoginov and Liang-Chieh Chen. 2018. Mobilenetv2: Inverted residuals and linear bottlenecks. In CVPR. 4510--4520.","DOI":"10.1109\/CVPR.2018.00474"},{"volume-title":"Amir Roshan Zamir, and Mubarak Shah","year":"2012","author":"Soomro Khurram","key":"e_1_3_2_1_43_1"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Du Tran Heng Wang Lorenzo Torresani Jamie Ray Yann LeCun and Manohar Paluri. 2018. A closer look at spatiotemporal convolutions for action recognition. In CVPR. 6450--6459.  Du Tran Heng Wang Lorenzo Torresani Jamie Ray Yann LeCun and Manohar Paluri. 2018. A closer look at spatiotemporal convolutions for action recognition. In CVPR. 6450--6459.","DOI":"10.1109\/CVPR.2018.00675"},{"volume-title":"J Zico Kolter, Louis-Philippe Morency, and Ruslan Salakhutdinov.","year":"2019","author":"Hubert Tsai Yao-Hung","key":"e_1_3_2_1_45_1"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_30"},{"volume-title":"2020 a. Structbert: Incorporating language structures into pre-training for deep language understanding. ICLR","year":"2020","author":"Wang Wei","key":"e_1_3_2_1_47_1"},{"key":"e_1_3_2_1_48_1","unstructured":"Chao-Yuan Wu Manzil Zaheer Hexiang Hu R Manmatha Alexander J Smola and Philipp Kr\u00e4henb\u00fchl. 2018. Compressed video action recognition. In CVPR. 6026--6035.  Chao-Yuan Wu Manzil Zaheer Hexiang Hu R Manmatha Alexander J Smola and Philipp Kr\u00e4henb\u00fchl. 2018. Compressed video action recognition. In CVPR. 6026--6035."},{"key":"e_1_3_2_1_49_1","unstructured":"Dejing Xu Jun Xiao Zhou Zhao Jian Shao Di Xie and Yueting Zhuang. 2019. Self-supervised spatiotemporal learning via video clip order prediction. In CVPR. 10334--10343.  Dejing Xu Jun Xiao Zhou Zhao Jian Shao Di Xie and Yueting Zhuang. 2019. Self-supervised spatiotemporal learning via video clip order prediction. In CVPR. 10334--10343."},{"volume-title":"Large batch training of convolutional networks. arXiv","year":"2017","author":"You Yang","key":"e_1_3_2_1_50_1"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Liheng Zhang Guo-Jun Qi Liqiang Wang and Jiebo Luo. 2019. Aet vs. aed: Unsupervised representation learning by auto-encoding transformations rather than data. In CVPR. 2547--2555.  Liheng Zhang Guo-Jun Qi Liqiang Wang and Jiebo Luo. 2019. Aet vs. aed: Unsupervised representation learning by auto-encoding transformations rather than data. In CVPR. 2547--2555.","DOI":"10.1109\/CVPR.2019.00265"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Richard Zhang Phillip Isola and Alexei A Efros. 2016. Colorful image colorization. In ECCV. 649--666.  Richard Zhang Phillip Isola and Alexei A Efros. 2016. Colorful image colorization. In ECCV. 649--666.","DOI":"10.1007\/978-3-319-46487-9_40"},{"volume-title":"Shufflenet: An extremely efficient convolutional neural network for mobile devices. In CVPR. 6848--6856.","year":"2018","author":"Zhang Xiangyu","key":"e_1_3_2_1_53_1"},{"volume-title":"Places: A 10 million image database for scene recognition. PAMI","year":"2017","author":"Zhou Bolei","key":"e_1_3_2_1_54_1"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.5555\/1619645.1619754"}],"event":{"name":"MM '21: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Virtual Event China","acronym":"MM '21"},"container-title":["Proceedings of the 29th ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3481541","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3474085.3481541","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:35Z","timestamp":1750191455000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3474085.3481541"}},"subtitle":["Self-supervised Multi-modal Co-training on One-billion Videos at Alibaba"],"short-title":[],"issued":{"date-parts":[[2021,10,17]]},"references-count":55,"alternative-id":["10.1145\/3474085.3481541","10.1145\/3474085"],"URL":"https:\/\/doi.org\/10.1145\/3474085.3481541","relation":{},"subject":[],"published":{"date-parts":[[2021,10,17]]},"assertion":[{"value":"2021-10-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}