{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:39:19Z","timestamp":1777891159876,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,7,11]],"date-time":"2021-07-11T00:00:00Z","timestamp":1625961600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,7,11]]},"DOI":"10.1145\/3404835.3462927","type":"proceedings-article","created":{"date-parts":[[2021,7,12]],"date-time":"2021-07-12T02:41:54Z","timestamp":1626057714000},"page":"1359-1368","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":21,"title":["Improving Video Retrieval by Adaptive Margin"],"prefix":"10.1145","author":[{"given":"Feng","family":"He","sequence":"first","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qi","family":"Wang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhifan","family":"Feng","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenbin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yajuan","family":"L\u00fc","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong","family":"Zhu","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xiao","family":"Tan","sequence":"additional","affiliation":[{"name":"Baidu Inc., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,7,11]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Xiaojun Chang Yi Yang Alexander Hauptmann Eric P Xing and Yao-Liang Yu. 2015. Semantic concept discovery for large-scale zero-shot event detection. In Twenty-fourth international joint conference on artificial intelligence."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01065"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.149"},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2505515.2507880"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_6_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT (1).","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. In NAACL-HLT (1)."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00957"},{"key":"e_1_3_2_2_9_1","volume-title":"Jamie Ryan Kiros, and Sanja Fidler","author":"Faghri Fartash","year":"2017","unstructured":"Fartash Faghri, David J Fleet, Jamie Ryan Kiros, and Sanja Fidler. 2017. Vse++: Improving visual-semantic embeddings with hard negatives. arXiv preprint arXiv:1707.05612(2017)."},{"key":"e_1_3_2_2_10_1","volume-title":"Multi-modal Transformer for Video Retrieval. In European Conference on Computer Vision (ECCV).","author":"Gabeur Valentin","year":"2020","unstructured":"Valentin Gabeur, Chen Sun, Karteek Alahari, and Cordelia Schmid. 2020. Multi-modal Transformer for Video Retrieval. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578746"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"crossref","unstructured":"Sangchul Hahn and Heeyoul Choi. 2019. Self-knowledge distillation in natural language processing. arXiv preprint arXiv:1908.01851(2019).","DOI":"10.26615\/978-954-452-056-4_050"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_14_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531(2015)."},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00758"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"e_1_3_2_2_18_1","volume-title":"International conference on machine learning. PMLR, 448--456","author":"Ioffe Sergey","year":"2015","unstructured":"Sergey Ioffe and Christian Szegedy. 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift. In International conference on machine learning. PMLR, 448--456."},{"key":"e_1_3_2_2_19_1","unstructured":"Ryan Kiros Ruslan Salakhutdinov and Richard S Zemel. 2014. Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539(2014)."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.83"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5894"},{"key":"e_1_3_2_2_23_1","unstructured":"Yang Liu Samuel Albanie Arsha Nagrani and Andrew Zisserman. 2019. Use what you have: Video retrieval using representations from collaborative experts. arXiv preprint arXiv:1907.13487(2019)."},{"key":"e_1_3_2_2_24_1","unstructured":"Antoine Miech Ivan Laptev and Josef Sivic. 2018. Learning a text-video embed-ding from incomplete and heterogeneous data. arXiv preprint arXiv:1804.02516(2018)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00272"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3206025.3206064"},{"key":"e_1_3_2_2_27_1","unstructured":"Hossein Mobahi Mehrdad Farajtabar and Peter L Bartlett. 2020. Self-distillation amplifies regularization in hilbert space. arXiv preprint arXiv:2002.05715(2020)."},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1410"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351030"},{"key":"e_1_3_2_2_31_1","unstructured":"Kaiye Wang Qiyue Yin Wei Wang Shu Wu and Liang Wang. 2016. A comprehensive survey on cross-modal retrieval. arXiv preprint arXiv:1607.06215(2016)."},{"key":"e_1_3_2_2_32_1","article-title":"Distance metric learning for large margin nearest neighbor classification","volume":"10","author":"Weinberger Kilian Q","year":"2009","unstructured":"Kilian Q Weinberger and Lawrence K Saul. 2009. Distance metric learning for large margin nearest neighbor classification. Journal of machine learning research 10, 2 (2009).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00054"},{"key":"e_1_3_2_2_34_1","volume-title":"Proceedings of the European Conference on Computer Vision(ECCV). 305--321","author":"Xie Saining","year":"2018","unstructured":"Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, and Kevin Murphy. 2018. Rethinking spatio-temporal feature learning: Speed-accuracy trade-offs in video classification. In Proceedings of the European Conference on Computer Vision(ECCV). 305--321."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401151"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_29"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.347"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/IROS.2017.8202207"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_23"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00381"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33019243"}],"event":{"name":"SIGIR '21: The 44th International ACM SIGIR Conference on Research and Development in Information Retrieval","location":"Virtual Event Canada","acronym":"SIGIR '21","sponsor":["SIGIR ACM Special Interest Group on Information Retrieval"]},"container-title":["Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462927","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3404835.3462927","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:20Z","timestamp":1750191500000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3404835.3462927"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,7,11]]},"references-count":42,"alternative-id":["10.1145\/3404835.3462927","10.1145\/3404835"],"URL":"https:\/\/doi.org\/10.1145\/3404835.3462927","relation":{},"subject":[],"published":{"date-parts":[[2021,7,11]]},"assertion":[{"value":"2021-07-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}