{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,7]],"date-time":"2026-07-07T15:58:25Z","timestamp":1783439905272,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":96,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Key R\\&D Program of China","award":["2020AAA0107100"],"award-info":[{"award-number":["2020AAA0107100"]}]},{"DOI":"10.13039\/100017052","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62325604, 62276271"],"award-info":[{"award-number":["62325604, 62276271"]}],"id":[{"id":"10.13039\/100017052","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3611853","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:12Z","timestamp":1698391632000},"page":"3365-3374","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":34,"title":["TMac: Temporal Multi-Modal Graph Learning for Acoustic Event Classification"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3900-4204","authenticated-orcid":false,"given":"Meng","family":"Liu","sequence":"first","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4837-455X","authenticated-orcid":false,"given":"Ke","family":"Liang","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9369-7390","authenticated-orcid":false,"given":"Dayu","family":"Hu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9044-4841","authenticated-orcid":false,"given":"Hao","family":"Yu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9894-0062","authenticated-orcid":false,"given":"Yue","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2489-573X","authenticated-orcid":false,"given":"Lingyuan","family":"Meng","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1353-2968","authenticated-orcid":false,"given":"Wenxuan","family":"Tu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1491-4594","authenticated-orcid":false,"given":"Sihang","family":"Zhou","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9066-1475","authenticated-orcid":false,"given":"Xinwang","family":"Liu","sequence":"additional","affiliation":[{"name":"National University of Defense Technology, Changsha, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. NeurIPS","author":"Akbari Hassan","year":"2021","unstructured":"Hassan Akbari, Liangzhe Yuan, Rui Qian, Wei-Hong Chuang, Shih-Fu Chang, Yin Cui, and Boqing Gong. 2021. Vatt: Transformers for multimodal self-supervised learning from raw video, audio and text. NeurIPS (2021)."},{"key":"e_1_3_2_1_2_1","first-page":"25","article-title":"Self-supervised multimodal versatile networks","volume":"33","author":"Alayrac Jean-Baptiste","year":"2020","unstructured":"Jean-Baptiste Alayrac, Adria Recasens, Rosalia Schneider, Relja Arandjelovi\u0107, Jason Ramapuram, Jeffrey De Fauw, Lucas Smaira, Sander Dieleman, and Andrew Zisserman. 2020. Self-supervised multimodal versatile networks. NeurIPS, Vol. 33 (2020), 25--37.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_3_1","volume-title":"Self-supervised learning by cross-modal audio-video clustering. NeurIPS","author":"Alwassel Humam","year":"2020","unstructured":"Humam Alwassel, Dhruv Mahajan, Bruno Korbar, Lorenzo Torresani, Bernard Ghanem, and Du Tran. 2020. Self-supervised learning by cross-modal audio-video clustering. NeurIPS (2020)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovic and Andrew Zisserman. 2017. Look listen and learn. In ICCV.","DOI":"10.1109\/ICCV.2017.73"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Relja Arandjelovic and Andrew Zisserman. 2018. Objects that sound. In ECCV. 435--451.","DOI":"10.1007\/978-3-030-01246-5_27"},{"key":"e_1_3_2_1_6_1","volume-title":"Adrian KC Lee, and Jennifer K Bizley","author":"Atilgan Huriye","year":"2018","unstructured":"Huriye Atilgan, Stephen M Town, Katherine C Wood, Gareth P Jones, Ross K Maddox, Adrian KC Lee, and Jennifer K Bizley. 2018. Integration of visual information in auditory cortex promotes auditory scene analysis through multisensory binding. Neuron (2018)."},{"key":"e_1_3_2_1_7_1","volume-title":"Soundnet: Learning sound representations from unlabeled video. NeurIPS","author":"Aytar Yusuf","year":"2016","unstructured":"Yusuf Aytar, Carl Vondrick, and Antonio Torralba. 2016. Soundnet: Learning sound representations from unlabeled video. NeurIPS (2016)."},{"key":"e_1_3_2_1_8_1","volume-title":"wav2vec 2.0: A framework for self-supervised learning of speech representations. NeurIPS","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A framework for self-supervised learning of speech representations. NeurIPS (2020)."},{"key":"e_1_3_2_1_9_1","volume-title":"Generalizing to Unseen Elements: A Survey on Knowledge Extrapolation for Knowledge Graphs. arXiv preprint arXiv:2302.01859","author":"Chen Mingyang","year":"2023","unstructured":"Mingyang Chen, Wen Zhang, Yuxia Geng, Zezhong Xu, Jeff Z Pan, and Huajun Chen. 2023. Generalizing to Unseen Elements: A Survey on Knowledge Extrapolation for Knowledge Graphs. arXiv preprint arXiv:2302.01859 (2023)."},{"key":"e_1_3_2_1_10_1","volume-title":"Masked spectrogram prediction for self-supervised audio pre-training. arXiv preprint arXiv:2204.12768","author":"Chong Dading","year":"2022","unstructured":"Dading Chong, Helin Wang, Peilin Zhou, and Qingcheng Zeng. 2022. Masked spectrogram prediction for self-supervised audio pre-training. arXiv preprint arXiv:2204.12768 (2022)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Wei Dai Chia Dai Shuhui Qu Juncheng Li and Samarjit Das. 2017. Very deep convolutional neural networks for raw waveforms. In ICASSP.","DOI":"10.1109\/ICASSP.2017.7952190"},{"key":"e_1_3_2_1_12_1","unstructured":"Difei Gao Ke Li Ruiping Wang Shiguang Shan and Xilin Chen. [n.d.]. Multi-modal graph neural network for joint reasoning on vision and scene text. In CVPR."},{"key":"e_1_3_2_1_13_1","volume-title":"On the equivalence between temporal and static graph representations for observational predictions. arXiv preprint arXiv:2103.07016","author":"Gao Jianfei","year":"2021","unstructured":"Jianfei Gao and Bruno Ribeiro. 2021. On the equivalence between temporal and static graph representations for observational predictions. arXiv preprint arXiv:2103.07016 (2021)."},{"key":"e_1_3_2_1_14_1","volume-title":"Dylan Freedman, Aren Jansen, Wade Lawrence, R Channing Moore, Manoj Plakal, and Marvin Ritter.","author":"Gemmeke Jort F","year":"2017","unstructured":"Jort F Gemmeke, Daniel PW Ellis, Dylan Freedman, Aren Jansen, Wade Lawrence, R Channing Moore, Manoj Plakal, and Marvin Ritter. 2017. Audio set: An ontology and human-labeled dataset for audio events. In ICASSP."},{"key":"e_1_3_2_1_15_1","unstructured":"Oguzhan Gencoglu Tuomas Virtanen and Heikki Huttunen. 2014. Recognition of acoustic events using deep neural networks. In EUSIPCO."},{"key":"e_1_3_2_1_16_1","volume-title":"Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778","author":"Gong Yuan","year":"2021","unstructured":"Yuan Gong, Yu-An Chung, and James Glass. 2021. Ast: Audio spectrogram transformer. arXiv preprint arXiv:2104.01778 (2021)."},{"key":"e_1_3_2_1_17_1","volume-title":"Self-supervised co-training for video representation learning. NeurIPS","author":"Han Tengda","year":"2020","unstructured":"Tengda Han, Weidi Xie, and Andrew Zisserman. 2020. Self-supervised co-training for video representation learning. NeurIPS (2020)."},{"key":"e_1_3_2_1_18_1","volume-title":"Acoustic scene classification using convolutional neural network and multiple-width frequency-delta data augmentation. arXiv preprint arXiv:1607.02383","author":"Han Yoonchang","year":"2016","unstructured":"Yoonchang Han and Kyogu Lee. 2016. Acoustic scene classification using convolutional neural network and multiple-width frequency-delta data augmentation. arXiv preprint arXiv:1607.02383 (2016)."},{"key":"e_1_3_2_1_19_1","volume-title":"Point spectra of some mutually exciting point processes. Journal of the Royal Statistical Society: Series B (Methodological)","author":"Hawkes Alan G","year":"1971","unstructured":"Alan G Hawkes. 1971. Point spectra of some mutually exciting point processes. Journal of the Royal Statistical Society: Series B (Methodological) (1971)."},{"key":"e_1_3_2_1_20_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR."},{"key":"e_1_3_2_1_21_1","volume-title":"Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al.","author":"Hershey Shawn","year":"2017","unstructured":"Shawn Hershey, Sourish Chaudhuri, Daniel PW Ellis, Jort F Gemmeke, Aren Jansen, R Channing Moore, Manoj Plakal, Devin Platt, Rif A Saurous, Bryan Seybold, et al. 2017. CNN architectures for large-scale audio classification. In ICASSP."},{"key":"e_1_3_2_1_22_1","volume-title":"scDFC: A deep fusion clustering method for single-cell RNA-seq data. BIB","author":"Hu Dayu","year":"2023","unstructured":"Dayu Hu, Ke Liang, Sihang Zhou, Wenxuan Tu, Meng Liu, and Xinwang Liu. 2023. scDFC: A deep fusion clustering method for single-cell RNA-seq data. BIB (2023)."},{"key":"e_1_3_2_1_23_1","first-page":"28708","article-title":"Masked autoencoders that listen","volume":"35","author":"Huang Po-Yao","year":"2022","unstructured":"Po-Yao Huang, Hu Xu, Juncheng Li, Alexei Baevski, Michael Auli, Wojciech Galuba, Florian Metze, and Christoph Feichtenhofer. 2022. Masked autoencoders that listen. NeurIPS, Vol. 35 (2022), 28708--28720.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_24_1","volume-title":"2023 c. Deep Incomplete Multi-view Clustering with Cross-view Partial Sample and Prototype Alignment. arXiv preprint arXiv:2303.15689","author":"Jin Jiaqi","year":"2023","unstructured":"Jiaqi Jin, Siwei Wang, Zhibin Dong, Xinwang Liu, and En Zhu. 2023 c. Deep Incomplete Multi-view Clustering with Cross-view Partial Sample and Prototype Alignment. arXiv preprint arXiv:2303.15689 (2023)."},{"key":"e_1_3_2_1_25_1","unstructured":"Yiqiao Jin Yeon-Chang Lee Kartik Sharma Meng Ye Karan Sikka Ajay Divakaran and Srijan Kumar. 2023 a. Predicting Information Pathways Across Online Communities. In KDD."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Yeying Jin Ruoteng Li Wenhan Yang and Robby T Tan. 2023 b. Estimating reflectance layer from a single image: Integrating reflectance guidance and shadow\/specular aware learning. In AAAI. 1069--1077.","DOI":"10.1609\/aaai.v37i1.25188"},{"key":"e_1_3_2_1_27_1","volume-title":"Tan","author":"Jin Yeying","year":"2021","unstructured":"Yeying Jin, Aashish Sharma, and Robby T. Tan. 2021. DC-ShadowNet: Single-Image Hard and Soft Shadow Removal Using Unsupervised Domain-Classifier Guided Network. In ICCV. 5027--5036."},{"key":"e_1_3_2_1_28_1","unstructured":"Yiqiao Jin Xiting Wang Yaru Hao Yizhou Sun and Xing Xie. 2023 d. Prototypical Fine-tuning: Towards Robust Performance Under Varying Data Sizes. In AAAI."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Yiqiao Jin Xiting Wang Ruichao Yang Yizhou Sun Wei Wang Hao Liao and Xing Xie. 2022a. Towards fine-grained reasoning for fake news detection. In AAAI. 5746--5754.","DOI":"10.1609\/aaai.v36i5.20517"},{"key":"e_1_3_2_1_30_1","unstructured":"Yeying Jin Wenhan Yang and Robby T Tan. 2022b. Unsupervised night image enhancement: When layer decomposition meets light-effects suppression. In ECCV."},{"key":"e_1_3_2_1_31_1","volume-title":"Acoustic Event Detection in Multichannel Audio Using Gated Recurrent Neural Networks with High-Resolution Spectral Features. ETRI Journal","author":"Kim Hyoung-Gook","year":"2017","unstructured":"Hyoung-Gook Kim and Jin Young Kim. 2017. Acoustic Event Detection in Multichannel Audio Using Gated Recurrent Neural Networks with High-Resolution Spectral Features. ETRI Journal (2017)."},{"key":"e_1_3_2_1_32_1","volume-title":"Panns: Large-scale pretrained audio neural networks for audio pattern recognition","author":"Kong Qiuqiang","year":"2020","unstructured":"Qiuqiang Kong, Yin Cao, Turab Iqbal, Yuxuan Wang, Wenwu Wang, and Mark D Plumbley. 2020. Panns: Large-scale pretrained audio neural networks for audio pattern recognition. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2020)."},{"key":"e_1_3_2_1_33_1","volume-title":"NeurIPS","volume":"31","author":"Korbar Bruno","year":"2018","unstructured":"Bruno Korbar, Du Tran, and Lorenzo Torresani. 2018. Cooperative learning of audio and video models from self-supervised synchronization. NeurIPS, Vol. 31 (2018)."},{"key":"e_1_3_2_1_34_1","volume-title":"Efficient training of audio transformers with patchout. arXiv preprint arXiv:2110.05069","author":"Koutini Khaled","year":"2021","unstructured":"Khaled Koutini, Jan Schl\u00fcter, Hamid Eghbal-zadeh, and Gerhard Widmer. 2021. Efficient training of audio transformers with patchout. arXiv preprint arXiv:2110.05069 (2021)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Srijan Kumar Xikun Zhang and Jure Leskovec. 2019. Predicting dynamic embedding trajectory in temporal interaction networks. In KDD.","DOI":"10.1145\/3292500.3330895"},{"key":"e_1_3_2_1_36_1","volume-title":"Enhancing speaking styles in conversational text-to-speech synthesis with graph-based multi-modal context modeling","author":"Li Jingbei","unstructured":"Jingbei Li, Yi Meng, Chenyi Li, Zhiyong Wu, Helen Meng, Chao Weng, and Dan Su. 2022. Enhancing speaking styles in conversational text-to-speech synthesis with graph-based multi-modal context modeling. In ICASSP. IEEE, 7917--7921."},{"key":"e_1_3_2_1_37_1","volume-title":"2023 b. Multi-View Bipartite Graph Clustering With Coupled Noisy Feature Filter. TKDE","author":"Li Liang","year":"2023","unstructured":"Liang Li, Junpu Zhang, Siwei Wang, Xinwang Liu, Kenli Li, and Keqin Li. 2023 b. Multi-View Bipartite Graph Clustering With Coupled Noisy Feature Filter. TKDE (2023), 1--13."},{"key":"e_1_3_2_1_38_1","volume-title":"2023 a. Attribute-Consistent Knowledge Graph Representation Learning for Multi-Modal Entity Alignment. arXiv preprint arXiv:2304.01563","author":"Li Qian","year":"2023","unstructured":"Qian Li, Shu Guo, Yangyifei Luo, Cheng Ji, Lihong Wang, Jiawei Sheng, and Jianxin Li. 2023 a. Attribute-Consistent Knowledge Graph Representation Learning for Multi-Modal Entity Alignment. arXiv preprint arXiv:2304.01563 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Search from history and reason for future: Two-stage reasoning on temporal knowledge graphs. arXiv preprint arXiv:2106.00327","author":"Li Zixuan","year":"2021","unstructured":"Zixuan Li, Xiaolong Jin, Saiping Guan, Wei Li, Jiafeng Guo, Yuanzhuo Wang, and Xueqi Cheng. 2021a. Search from history and reason for future: Two-stage reasoning on temporal knowledge graphs. arXiv preprint arXiv:2106.00327 (2021)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Zixuan Li Xiaolong Jin Wei Li Saiping Guan Jiafeng Guo Huawei Shen Yuanzhuo Wang and Xueqi Cheng. 2021b. Temporal knowledge graph reasoning based on evolutional representation learning. In SIGIR. 408--417.","DOI":"10.1145\/3404835.3462963"},{"key":"e_1_3_2_1_41_1","volume-title":"2023 a. Knowledge Graph Contrastive Learning Based on Relation-Symmetrical Structure. TKDE","author":"Liang Ke","year":"2023","unstructured":"Ke Liang, Yue Liu, Sihang Zhou, Wenxuan Tu, Yi Wen, Xihong Yang, Xiangjun Dong, and Xinwang Liu. 2023 a. Knowledge Graph Contrastive Learning Based on Relation-Symmetrical Structure. TKDE (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Ke Liang Lingyuan Meng Meng Liu Yue Liu Wenxuan Tu Siwei Wang Sihang Zhou Xinwang Liu and Fuchun Sun. 2022. A Survey of Knowledge Graph Reasoning on Graph Types: Static Dynamic and Multimodal. (2022)."},{"key":"e_1_3_2_1_43_1","volume-title":"2023 b. Message Intercommunication for Inductive Relation Reasoning. arXiv preprint arXiv:2305.14074","author":"Liang Ke","year":"2023","unstructured":"Ke Liang, Lingyuan Meng, Sihang Zhou, Siwei Wang, Wenxuan Tu, Yue Liu, Meng Liu, and Xinwang Liu. 2023 b. Message Intercommunication for Inductive Relation Reasoning. arXiv preprint arXiv:2305.14074 (2023)."},{"key":"e_1_3_2_1_44_1","volume-title":"2023 c. Structure Guided Multi-modal Pre-trained Transformer for Knowledge Graph Reasoning. arXiv preprint arXiv:2307.03591","author":"Liang Ke","year":"2023","unstructured":"Ke Liang, Sihang Zhou, Yue Liu, Lingyuan Meng, Meng Liu, and Xinwang Liu. 2023 c. Structure Guided Multi-modal Pre-trained Transformer for Knowledge Graph Reasoning. arXiv preprint arXiv:2307.03591 (2023)."},{"key":"e_1_3_2_1_45_1","unstructured":"Tsung-Yi Lin Priya Goyal Ross Girshick Kaiming He and Piotr Doll\u00e1r. [n.d.]. Focal loss for dense object detection. In ICCV."},{"key":"e_1_3_2_1_46_1","volume-title":"2023 b. Self-Supervised Temporal Graph learning with Temporal and Structural Intensity Alignment. arXiv preprint arXiv:2302.07491","author":"Liu Meng","year":"2023","unstructured":"Meng Liu, Ke Liang, Bin Xiao, Sihang Zhou, Wenxuan Tu, Yue Liu, Xihong Yang, and Xinwang Liu. 2023 b. Self-Supervised Temporal Graph learning with Temporal and Structural Intensity Alignment. arXiv preprint arXiv:2302.07491 (2023)."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"crossref","unstructured":"Meng Liu and Yong Liu. 2021. Inductive representation learning in temporal networks via mining neighborhood and community influences. In SIGIR.","DOI":"10.1145\/3404835.3463052"},{"key":"e_1_3_2_1_48_1","volume-title":"2023 c. Deep Temporal Graph Clustering. arXiv preprint arXiv:2305.10738","author":"Liu Meng","year":"2023","unstructured":"Meng Liu, Yue Liu, Ke Liang, Siwei Wang, Sihang Zhou, and Xinwang Liu. 2023 c. Deep Temporal Graph Clustering. arXiv preprint arXiv:2305.10738 (2023)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Meng Liu Jiaming Wu and Yong Liu. 2022b. Embedding global and local influences for dynamic graphs. In CIKM. 4249--4253.","DOI":"10.1145\/3511808.3557594"},{"key":"e_1_3_2_1_50_1","unstructured":"Yue Liu Ke Liang Jun Xia Sihang Zhou Xihong Yang Xinwang Liu and Z. Stan Li. 2023 a. Dink-Net: Neural Clustering on Large Graphs. In ICML."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Yue Liu Wenxuan Tu Sihang Zhou Xinwang Liu Linxuan Song Xihong Yang and En Zhu. 2022a. Deep Graph Clustering via Dual Correlation Reduction. In AAAI. 7603--7611.","DOI":"10.1609\/aaai.v36i7.20726"},{"key":"e_1_3_2_1_52_1","volume-title":"A Survey of Deep Graph Clustering: Taxonomy, Challenge, and Application. arXiv preprint arXiv:2211.12875","author":"Liu Yue","year":"2022","unstructured":"Yue Liu, Jun Xia, Sihang Zhou, Siwei Wang, Xifeng Guo, Xihong Yang, Ke Liang, Wenxuan Tu, Z. Stan Li, and Xinwang Liu. 2022c. A Survey of Deep Graph Clustering: Taxonomy, Challenge, and Application. arXiv preprint arXiv:2211.12875 (2022)."},{"key":"e_1_3_2_1_53_1","volume-title":"2023 d. Simple contrastive graph clustering. TNNLS","author":"Liu Yue","year":"2023","unstructured":"Yue Liu, Xihong Yang, Sihang Zhou, and Xinwang Liu. 2023 d. Simple contrastive graph clustering. TNNLS (2023)."},{"key":"e_1_3_2_1_54_1","volume-title":"Active contrastive learning of audio-visual video representations. arXiv preprint arXiv:2009.09805","author":"Ma Shuang","year":"2020","unstructured":"Shuang Ma, Zhaoyang Zeng, Daniel McDuff, and Yale Song. 2020. Active contrastive learning of audio-visual video representations. arXiv preprint arXiv:2009.09805 (2020)."},{"key":"e_1_3_2_1_55_1","volume-title":"Hearing lips and seeing voices. Nature","author":"McGurk Harry","year":"1976","unstructured":"Harry McGurk and John MacDonald. 1976. Hearing lips and seeing voices. Nature (1976)."},{"key":"e_1_3_2_1_56_1","volume-title":"Heng Tao Shen, and Xiaofeng Zhu","author":"Mo Yujie","year":"2023","unstructured":"Yujie Mo, Yajie Lei, Jialie Shen, Xiaoshuang Shi, Heng Tao Shen, and Xiaofeng Zhu. 2023. Disentangled Multiplex Graph Representation Learning. In ICML."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"crossref","unstructured":"Pedro Morgado Nuno Vasconcelos and Ishan Misra. 2021. Audio-visual instance discrimination with cross-modal agreement. In CVPR. 12475--12486.","DOI":"10.1109\/CVPR46437.2021.01274"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"crossref","unstructured":"Andrew Owens Jiajun Wu Josh H McDermott William T Freeman and Antonio Torralba. 2016. Ambient sound provides supervision for visual learning. In ECCV.","DOI":"10.1007\/978-3-319-46448-0_48"},{"key":"e_1_3_2_1_59_1","volume-title":"Evolvegcn: Evolving graph convolutional networks for dynamic graphs. In AAAI.","author":"Pareja Aldo","year":"2020","unstructured":"Aldo Pareja, Giacomo Domeniconi, Jie Chen, Tengfei Ma, Toyotaro Suzumura, Hiroki Kanezashi, Tim Kaler, Tao Schardl, and Charles Leiserson. 2020. Evolvegcn: Evolving graph convolutional networks for dynamic graphs. In AAAI."},{"key":"e_1_3_2_1_60_1","volume-title":"Random regression forests for acoustic event detection and classification","author":"Phan Huy","year":"2014","unstructured":"Huy Phan, Marco Maa\u00df, Radoslaw Mazur, and Alfred Mertins. 2014. Random regression forests for acoustic event detection and classification. IEEE\/ACM Transactions on Audio, Speech, and Language Processing (2014)."},{"key":"e_1_3_2_1_61_1","unstructured":"AJ Piergiovanni Anelia Angelova and Michael S Ryoo. [n.d.]. Evolving losses for unsupervised video representation learning. In CVPR."},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380073"},{"key":"e_1_3_2_1_63_1","volume-title":"Temporal graph networks for deep learning on dynamic graphs. arXiv preprint arXiv:2006.10637","author":"Rossi Emanuele","year":"2020","unstructured":"Emanuele Rossi, Ben Chamberlain, Fabrizio Frasca, Davide Eynard, Federico Monti, and Michael Bronstein. 2020. Temporal graph networks for deep learning on dynamic graphs. arXiv preprint arXiv:2006.10637 (2020)."},{"key":"e_1_3_2_1_64_1","volume-title":"Avlnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199","author":"Rouditchenko Andrew","year":"2020","unstructured":"Andrew Rouditchenko, Angie Boggust, David Harwath, Brian Chen, Dhiraj Joshi, Samuel Thomas, Kartik Audhkhasi, Hilde Kuehne, Rameswar Panda, Rogerio Feris, et al. 2020. Avlnet: Learning audio-visual language representations from instructional videos. arXiv preprint arXiv:2006.09199 (2020)."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"crossref","unstructured":"Aaqib Saeed David Grangier and Neil Zeghidour. 2021. Contrastive learning of general-purpose audio representations. In ICASSP.","DOI":"10.1109\/ICASSP39728.2021.9413528"},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3336191.3371845"},{"key":"e_1_3_2_1_67_1","volume-title":"Heterogeneous Graph Learning for Acoustic Event Classification. arXiv preprint arXiv:2303.02665","author":"Shirian Amir","year":"2023","unstructured":"Amir Shirian, Mona Ahmadian, Krishna Somandepalli, and Tanaya Guha. 2023. Heterogeneous Graph Learning for Acoustic Event Classification. arXiv preprint arXiv:2303.02665 (2023)."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTSP.2022.3190083"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2022-10670"},{"key":"e_1_3_2_1_70_1","volume-title":"Learning speech representations from raw audio by joint audiovisual self-supervision. arXiv preprint arXiv:2007.04134","author":"Shukla Abhinav","year":"2020","unstructured":"Abhinav Shukla, Stavros Petridis, and Maja Pantic. 2020. Learning speech representations from raw audio by joint audiovisual self-supervision. arXiv preprint arXiv:2007.04134 (2020)."},{"key":"e_1_3_2_1_71_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_72_1","volume-title":"Improved convolutional neural networks for acoustic event classification. Multimedia Tools and Applications","author":"Tang Guichen","year":"2019","unstructured":"Guichen Tang, Ruiyu Liang, Yue Xie, Yongqiang Bao, and Shijia Wang. 2019. Improved convolutional neural networks for acoustic event classification. Multimedia Tools and Applications (2019)."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"crossref","unstructured":"Du Tran Heng Wang Lorenzo Torresani Jamie Ray Yann LeCun and Manohar Paluri. 2018. A closer look at spatiotemporal convolutions for action recognition. In CVPR.","DOI":"10.1109\/CVPR.2018.00675"},{"key":"e_1_3_2_1_74_1","unstructured":"Xinhang Wan Jiyuan Liu Weixuan Liang Xinwang Liu Yi Wen and En Zhu. 2022. Continual Multi-View Clustering. In ACM MM."},{"key":"e_1_3_2_1_75_1","unstructured":"Xinhang Wan Xinwang Liu Jiyuan Liu Siwei Wang Yi Wen Weixuan Liang En Zhu Zhe Liu and Lu Zhou. 2023. Auto-weighted Multi-view Clustering for Large-scale Data. arxiv: 2303.01983"},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380186"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3351034"},{"key":"e_1_3_2_1_78_1","volume-title":"Unpaired Multi-View Graph Clustering with Cross-View Structure Matching. arXiv preprint arXiv:2307.03476","author":"Wen Yi","year":"2023","unstructured":"Yi Wen, Siwei Wang, Qing Liao, Weixuan Liang, Ke Liang, Xinhang Wan, and Xinwang Liu. 2023. Unpaired Multi-View Graph Clustering with Cross-View Structure Matching. arXiv preprint arXiv:2307.03476 (2023)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3512164"},{"key":"e_1_3_2_1_80_1","unstructured":"Martin Weyssow Houari Sahraoui and Bang Liu. [n.d.]. Better modeling the programming world with code concept graphs-augmented multi-modal learning. In ICSE."},{"key":"e_1_3_2_1_81_1","volume-title":"2023 a. Adversarial Auto-encoder Domain Adaptation for Cold-start Recommendation with Positive and Negative Hypergraphs. TOIS","author":"Wu Hanrui","year":"2023","unstructured":"Hanrui Wu, Jinyi Long, Nuosi Li, Dahai Yu, and Michael K Ng. 2023 a. Adversarial Auto-encoder Domain Adaptation for Cold-start Recommendation with Positive and Negative Hypergraphs. TOIS (2023), 1--25."},{"key":"e_1_3_2_1_82_1","volume-title":"2023 b. Hypergraph Collaborative Network on Vertices and Hyperedges. TPAMI","author":"Wu Hanrui","year":"2023","unstructured":"Hanrui Wu, Yuguang Yan, and Michael Kwok-Po Ng. 2023 b. Hypergraph Collaborative Network on Vertices and Hyperedges. TPAMI (2023), 3245--3258."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"crossref","unstructured":"Liang Xiang Quan Yuan Shiwan Zhao Li Chen Xiatian Zhang Qing Yang and Jimeng Sun. 2010. Temporal recommendation on graphs via long-and short-term preference fusion. In KDD.","DOI":"10.1145\/1835804.1835896"},{"key":"e_1_3_2_1_84_1","volume-title":"Rethinking spatiotemporal feature learning for video understanding. arXiv preprint arXiv:1712.04851","author":"Xie Saining","year":"2017","unstructured":"Saining Xie, Chen Sun, Jonathan Huang, Zhuowen Tu, and Kevin Murphy. 2017. Rethinking spatiotemporal feature learning for video understanding. arXiv preprint arXiv:1712.04851 (2017)."},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"crossref","unstructured":"Ruichao Yang Xiting Wang Yiqiao Jin Chaozhuo Li Jianxun Lian and Xing Xie. 2022d. Reinforcement subgraph reasoning for fake news detection. In KDD. 2253--2262.","DOI":"10.1145\/3534678.3539277"},{"key":"e_1_3_2_1_86_1","volume-title":"Interpolation-based contrastive learning for few-label semi-supervised learning. TNNLS","author":"Yang Xihong","year":"2022","unstructured":"Xihong Yang, Xiaochang Hu, Sihang Zhou, Xinwang Liu, and En Zhu. 2022a. Interpolation-based contrastive learning for few-label semi-supervised learning. TNNLS (2022)."},{"key":"e_1_3_2_1_87_1","volume-title":"Mixed Graph Contrastive Network for Semi-Supervised Node Classification. arXiv preprint arXiv:2206.02796","author":"Yang Xihong","year":"2022","unstructured":"Xihong Yang, Yue Liu, Sihang Zhou, Xinwang Liu, and En Zhu. 2022b. Mixed Graph Contrastive Network for Semi-Supervised Node Classification. arXiv preprint arXiv:2206.02796 (2022)."},{"key":"e_1_3_2_1_88_1","volume-title":"Contrastive Deep Graph Clustering with Learnable Augmentation. arXiv preprint arXiv:2212.03559","author":"Yang Xihong","year":"2022","unstructured":"Xihong Yang, Yue Liu, Sihang Zhou, Siwei Wang, Xinwang Liu, and En Zhu. 2022c. Contrastive Deep Graph Clustering with Learnable Augmentation. arXiv preprint arXiv:2212.03559 (2022)."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"crossref","unstructured":"Xihong Yang Yue Liu Sihang Zhou Siwei Wang Wenxuan Tu Qun Zheng Xinwang Liu Liming Fang and En Zhu. 2023. Cluster-guided Contrastive Graph Clustering Network. In AAAI.","DOI":"10.1609\/aaai.v37i9.26285"},{"key":"e_1_3_2_1_90_1","volume-title":"A novel graph-based multi-modal fusion encoder for neural machine translation. arXiv preprint arXiv:2007.08742","author":"Yin Yongjing","year":"2020","unstructured":"Yongjing Yin, Fandong Meng, Jinsong Su, Chulun Zhou, Zhengyuan Yang, Jie Zhou, and Jiebo Luo. 2020. A novel graph-based multi-modal fusion encoder for neural machine translation. arXiv preprint arXiv:2007.08742 (2020)."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"crossref","unstructured":"Haomin Zhang Ian McLoughlin and Yan Song. 2015. Robust sound event recognition using convolutional neural networks. In ICASSP.","DOI":"10.1109\/ICASSP.2015.7178031"},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"crossref","unstructured":"Junpu Zhang Liang Li Siwei Wang Jiyuan Liu Yue Liu Xinwang Liu and En Zhu. 2022. Multiple Kernel Clustering with Dual Noise Minimization. In ACM MM. 3440--3450.","DOI":"10.1145\/3503161.3548334"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"crossref","unstructured":"Mengqi Zhang Yuwei Xia Qiang Liu Shu Wu and Liang Wang. 2023. Learning Latent Relations for Temporal Knowledge Graph Reasoning. In ACL. 12617--12631.","DOI":"10.18653\/v1\/2023.acl-long.705"},{"key":"e_1_3_2_1_94_1","volume-title":"HTNet: Dynamic WLAN Performance Prediction using Heterogenous Temporal GNN. arXiv preprint arXiv:2304.10013","author":"Zhou Hongkuan","year":"2023","unstructured":"Hongkuan Zhou, Rajgopal Kannan, Ananthram Swami, and Viktor Prasanna. 2023. HTNet: Dynamic WLAN Performance Prediction using Heterogenous Temporal GNN. arXiv preprint arXiv:2304.10013 (2023)."},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2008-26"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"crossref","unstructured":"Yuan Zuo Guannan Liu Hao Lin Jia Guo Xiaoqian Hu and Junjie Wu. 2018. Embedding temporal network via neighborhood formation. In KDD.","DOI":"10.1145\/3219819.3220054"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611853","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3611853","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T23:58:25Z","timestamp":1755820705000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3611853"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":96,"alternative-id":["10.1145\/3581783.3611853","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3611853","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}