{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T01:26:07Z","timestamp":1777339567882,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":49,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372054, 62006005"],"award-info":[{"award-number":["62372054, 62006005"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFC3302200"],"award-info":[{"award-number":["2022YFC3302200"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681266","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"2370-2378","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["AVHash: Joint Audio-Visual Hashing for Video Retrieval"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-4124-4151","authenticated-orcid":false,"given":"Yuxiang","family":"Zhou","sequence":"first","affiliation":[{"name":"School of Computer Science, BUPT, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1040-5126","authenticated-orcid":false,"given":"Zhe","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Computer Science, BUPT, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1373-6108","authenticated-orcid":false,"given":"Rui","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, BUAA, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1348-9218","authenticated-orcid":false,"given":"Yong","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Computer Science, BUPT, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8774-3725","authenticated-orcid":false,"given":"Dell","family":"Zhang","sequence":"additional","affiliation":[{"name":"TeleAI, China Telecom, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patrec.2019.07.003"},{"key":"e_1_3_2_1_2_1","first-page":"1","article-title":"A Supervised Video Hashing Method Based on a Deep 3D Convolutional Neural Network for Large-Scale Video Retrieval","volume":"21","author":"Chen Hanqing","year":"2021","unstructured":"Hanqing Chen, Chunyan Hu, Feifei Lee, Chaowei Lin, Wei Yao, Lu Chen, and Qiu Chen. 2021. A Supervised Video Hashing Method Based on a Deep 3D Convolutional Neural Network for Large-Scale Video Retrieval. Sensors, Vol. 21, 9 (2021), 1--15.","journal-title":"Sensors"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Yong Chen Yuqing Hou Shu Leng Qing Zhang Zhouchen Lin and Dell Zhang. 2021. Long-Tail Hashing. In SIGIR. ACM 1328--1338.","DOI":"10.1145\/3404835.3462888"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2963952"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.2995195"},{"key":"e_1_3_2_1_6_1","volume-title":"Enhanced Discrete Multi-modal Hashing: More Constraints yet Less Time to Learn (Extended Abstract)","author":"Chen Yong","unstructured":"Yong Chen, Hui Zhang, Zhibao Tian, Jun Wang, Dell Zhang, and Xuelong Li. 2023. Enhanced Discrete Multi-modal Hashing: More Constraints yet Less Time to Learn (Extended Abstract). In ICDE. IEEE, 3857--3858."},{"key":"e_1_3_2_1_7_1","volume-title":"KyungHyun Cho, and Yoshua Bengio.","author":"Chung Junyoung","year":"2014","unstructured":"Junyoung Chung, cCaglar G\u00fclccehre, KyungHyun Cho, and Yoshua Bengio. 2014. Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling. CoRR, Vol. abs\/1412.3555 (2014)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1038\/38278"},{"key":"e_1_3_2_1_9_1","volume-title":"Mirrokni","author":"Datar Mayur","year":"2004","unstructured":"Mayur Datar, Nicole Immorlica, Piotr Indyk, and Vahab S. Mirrokni. 2004. Locality-sensitive hashing scheme based on p-stable distributions. In SCG. 253--262."},{"key":"e_1_3_2_1_10_1","volume-title":"Glass","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James R. Glass. 2021. AST: Audio Spectrogram Transformer. In Interspeech. 571--575."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.193"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Yun Gu Chao Ma and Jie Yang. 2016. Supervised Recurrent Hashing for Large Scale Video Retrieval. In ACM Multimedia. 272--276.","DOI":"10.1145\/2964284.2967225"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Yanbin Hao Jingru Duan Hao Zhang Bin Zhu Pengyuan Zhou and Xiangnan He. 2022. Unsupervised Video Hashing with Multi-granularity Contextualization and Multi-structure Preservation. In ACM Multimedia. 3754--3763.","DOI":"10.1145\/3503161.3547836"},{"key":"e_1_3_2_1_14_1","volume-title":"Girshick","author":"He Kaiming","year":"2020","unstructured":"Kaiming He, Haoqi Fan, Yuxin Wu, Saining Xie, and Ross B. Girshick. 2020. Momentum Contrast for Unsupervised Visual Representation Learning. In CVPR. 9726--9735."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Fabian Caba Heilbron Victor Escorcia Bernard Ghanem and Juan Carlos Niebles. 2015. ActivityNet: A large-scale video benchmark for human activity understanding. In CVPR. 961--970.","DOI":"10.1109\/CVPR.2015.7298698"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_17_1","unstructured":"Yu Huang Chenzhuang Du Zihui Xue Xuanyao Chen Hang Zhao and Longbo Huang. 2021. What Makes Multi-Modal Learning Better than Single (Provably). In NeurIPS. 10944--10956."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2670560"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Wang-Cheng Kang Wu-Jun Li and Zhi-Hua Zhou. 2016. Column Sampling Based Discrete Supervised Hashing. In AAAI. 1230--1236.","DOI":"10.1609\/aaai.v30i1.10176"},{"key":"e_1_3_2_1_20_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In ICLR. 1--15."},{"key":"e_1_3_2_1_21_1","volume-title":"Hinton","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E. Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In NeurIPS. 1106--1114."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"e_1_3_2_1_23_1","first-page":"181","article-title":"Dual-Stream Knowledge-Preserving Hashing for Unsupervised Video Retrieval","volume":"13674","author":"Li Pandeng","year":"2022","unstructured":"Pandeng Li, Hongtao Xie, Jiannan Ge, Lei Zhang, Shaobo Min, and Yongdong Zhang. 2022. Dual-Stream Knowledge-Preserving Hashing for Unsupervised Video Retrieval. In ECCV, Vol. 13674. 181--197.","journal-title":"ECCV"},{"key":"e_1_3_2_1_24_1","unstructured":"Qi Li Zhenan Sun Ran He and Tieniu Tan. 2017. Deep Supervised Discrete Hashing. In NeurIPS. 2482--2491."},{"key":"e_1_3_2_1_25_1","unstructured":"Shuyan Li Xiu Li Jiwen Lu and Jie Zhou. 2021. Self-Supervised Video Hashing via Bidirectional Transformers. In CVPR. 13549--13558."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3093258"},{"key":"e_1_3_2_1_27_1","first-page":"115","article-title":"Exploring the Impact of Short Videos on Society and Culture","volume":"6","author":"Lin Chen","year":"2023","unstructured":"Chen Lin. 2023. Exploring the Impact of Short Videos on Society and Culture: An Analysis of Social Dynamics and Cultural Expression. , Vol. 6, 3 (2023), 115--118.","journal-title":"An Analysis of Social Dynamics and Cultural Expression."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2645404"},{"key":"e_1_3_2_1_29_1","volume-title":"Do You Even Need Attention? A Stack of Feed-Forward Layers Does Surprisingly Well on ImageNet. CoRR","author":"Melas-Kyriazi Luke","year":"2021","unstructured":"Luke Melas-Kyriazi. 2021. Do You Even Need Attention? A Stack of Feed-Forward Layers Does Surprisingly Well on ImageNet. CoRR, Vol. abs\/2105.02723 (2021)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3293104"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.3048680"},{"key":"e_1_3_2_1_32_1","first-page":"8748","article-title":"Learning Transferable Visual Models From Natural Language Supervision","volume":"139","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. In ICML, Vol. 139. 8748--8763.","journal-title":"ICML"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Fumin Shen Chunhua Shen Wei Liu and Heng Tao Shen. 2015. Supervised Discrete Hashing. In CVPR. 37--45.","DOI":"10.1109\/CVPR.2015.7298598"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3326994"},{"key":"e_1_3_2_1_35_1","volume-title":"Heng Tao Shen, and Richang Hong","author":"Song Jingkuan","year":"2011","unstructured":"Jingkuan Song, Yi Yang, Zi Huang, Heng Tao Shen, and Richang Hong. 2011. Multiple feature hashing for real-time large scale near-duplicate video retrieval. In ACM Multimedia. 423--432."},{"key":"e_1_3_2_1_36_1","unstructured":"Ilya O. Tolstikhin Neil Houlsby Alexander Kolesnikov Lucas Beyer Xiaohua Zhai Thomas Unterthiner Jessica Yung Andreas Steiner Daniel Keysers Jakob Uszkoreit Mario Lucic and Alexey Dosovitskiy. 2021. MLP-Mixer: An all-MLP Architecture for Vision. In NeurIPS. 24261--24272."},{"key":"e_1_3_2_1_37_1","volume-title":"Representation Learning with Contrastive Predictive Coding. CoRR","author":"van den Oord A\u00e4ron","year":"2018","unstructured":"A\u00e4ron van den Oord, Yazhe Li, and Oriol Vinyals. 2018. Representation Learning with Contrastive Predictive Coding. CoRR, Vol. abs\/1807.03748 (2018)."},{"key":"e_1_3_2_1_38_1","first-page":"2579","article-title":"Visualizing High-Dimensional Data Using t-SNE","volume":"9","author":"van der Maaten Laurens","year":"2008","unstructured":"Laurens van der Maaten and Geoffrey Hinton. 2008. Visualizing High-Dimensional Data Using t-SNE. Journal of Machine Learning Research, Vol. 9 (2008), 2579--2605.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_39_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NeurIPS. 5998--6008."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Liangdao Wang Yan Pan Cong Liu Hanjiang Lai Jian Yin and Ye Liu. 2023. Deep Hashing with Minimal-Distance-Separated Hash Centers. In CVPR. 23455--23464.","DOI":"10.1109\/CVPR52729.2023.02246"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCDS.2019.2963339"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Yuting Wang Jinpeng Wang Bin Chen Ziyun Zeng and Shu-Tao Xia. 2023. Contrastive Masked Autoencoders for Self-Supervised Video Hashing. In AAAI. 2733--2741.","DOI":"10.1609\/aaai.v37i3.25373"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2882155"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Ke Xia Yuqing Ma Xianglong Liu Yadong Mu and Li Liu. 2017. Temporal Binary Coding for Large-Scale Video Search. In ACM Multimedia. 333--341.","DOI":"10.1145\/3123266.3123273"},{"key":"e_1_3_2_1_45_1","volume-title":"Large-scale video analysis and understanding. Ph.,D. Dissertation","author":"Zhongwen Xu.","unstructured":"Zhongwen Xu. 2017. Large-scale video analysis and understanding. Ph.,D. Dissertation. University of Technology Sydney, Australia."},{"key":"e_1_3_2_1_46_1","unstructured":"Guangnan Ye Dong Liu Jun Wang and Shih-Fu Chang. 2013. Large-Scale Video Hashing via Structure Learning. In ICCV. 2272--2279."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Li Yuan Tao Wang Xiaopeng Zhang Francis E. H. Tay Zequn Jie Wei Liu and Jiashi Feng. 2020. Central Similarity Quantization for Efficient Image and Video Retrieval. In CVPR. 3080--3089.","DOI":"10.1109\/CVPR42600.2020.00315"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Hanwang Zhang Meng Wang Richang Hong and Tat-Seng Chua. 2016. Play and Rewind: Optimizing Binary Representations of Videos by Self-Supervised Temporal Hashing. In ACM Multimedia. 781--790.","DOI":"10.1145\/2964284.2964308"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Yaoxin Zhuo Yikang Li Jenhao Hsiao Chiuman Ho and Baoxin Li. 2022. CLIP4Hashing: Unsupervised Deep Hashing for Cross-Modal Video-Text Retrieval. In ICMR. 158--166.","DOI":"10.1145\/3512527.3531381"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681266","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681266","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681266"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":49,"alternative-id":["10.1145\/3664647.3681266","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681266","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}