{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:20:59Z","timestamp":1765308059151,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755007","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:47:42Z","timestamp":1761371262000},"page":"7616-7624","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Video Instance Segmentation by Weighted Structure Inference"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2564-071X","authenticated-orcid":false,"given":"Zheyun","family":"Qin","sequence":"first","affiliation":[{"name":"School of Computer Science and Technology, Shandong University, Qingdao, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2343-3688","authenticated-orcid":false,"given":"Deng","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Shandong University, Jinan, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2515-1588","authenticated-orcid":false,"given":"Yang","family":"Shi","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Shandong University, Jinan, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3707-1761","authenticated-orcid":false,"given":"Qiangchang","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Software, Shandong University, Jinan, Shandong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4592-4074","authenticated-orcid":false,"given":"Zhumin","family":"Chen","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Shandong University, Jinan, Shandong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neumar.2024.100034"},{"key":"e_1_3_2_1_2_1","volume-title":"Person re-identification by multi-hypergraph fusion","author":"An Le","year":"2016","unstructured":"Le An, Xiaojing Chen, Songfan Yang, and Xuelong Li. 2016. Person re-identification by multi-hypergraph fusion. IEEE transactions on neural networks and learning systems, Vol. 28, 11 (2016), 2763-2774."},{"key":"e_1_3_2_1_3_1","volume-title":"TarViS: A Unified Approach for Target-based Video Segmentation. arXiv preprint arXiv:2301.02657","author":"Athar Ali","year":"2023","unstructured":"Ali Athar, Alexander Hermans, Jonathon Luiten, Deva Ramanan, and Bastian Leibe. 2023. TarViS: A Unified Approach for Target-based Video Segmentation. arXiv preprint arXiv:2301.02657 (2023)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10851-014-0506-3"},{"key":"e_1_3_2_1_5_1","volume-title":"Hisham Cholakkal, Fahad Shahbaz Khan, Yanwei Pang, and Ling Shao.","author":"Cao Jiale","year":"2020","unstructured":"Jiale Cao, Rao Muhammad Anwer, Hisham Cholakkal, Fahad Shahbaz Khan, Yanwei Pang, and Ling Shao. 2020. Sipmask: Spatial information preservation for fast image and video instance segmentation. In ECCV."},{"key":"e_1_3_2_1_6_1","volume-title":"HourVideo: 1-Hour Video-Language Understanding. arXiv preprint arXiv:2411.04998","author":"Chandrasegaran Keshigeyan","year":"2024","unstructured":"Keshigeyan Chandrasegaran, Agrim Gupta, Lea M Hadzic, Taran Kota, Jimming He, Crist\u00f3bal Eyzaguirre, Zane Durante, Manling Li, Jiajun Wu, and Li Fei-Fei. 2024. HourVideo: 1-Hour Video-Language Understanding. arXiv preprint arXiv:2411.04998 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Lu Chen Qiangchang Wang Zhaohui Li and Yilong Yin. 2024. Hypergraph-guided Intra-and Inter-category Relation Modeling for Fine-grained Visual Recognition. In ACM MM.","DOI":"10.1145\/3664647.3680589"},{"key":"e_1_3_2_1_8_1","volume-title":"Mask2former for video instance segmentation. arXiv preprint arXiv:2112.10764","author":"Cheng Bowen","year":"2021","unstructured":"Bowen Cheng, Anwesa Choudhuri, Ishan Misra, Alexander Kirillov, Rohit Girdhar, and Alexander G Schwing. 2021. Mask2former for video instance segmentation. arXiv preprint arXiv:2112.10764 (2021)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Bowen Cheng Ishan Misra Alexander G Schwing Alexander Kirillov and Rohit Girdhar. 2022. Masked-attention mask transformer for universal image segmentation. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"e_1_3_2_1_10_1","volume-title":"Sheaf hypergraph networks. NeurIPS","author":"Duta Iulia","year":"2023","unstructured":"Iulia Duta, Giulia Cassar\u00e0, Fabrizio Silvestri, and Pietro Li\u00f2. 2023. Sheaf hypergraph networks. NeurIPS (2023)."},{"key":"e_1_3_2_1_11_1","volume-title":"Learning better video query with sam for video instance segmentation","author":"Fang Hao","year":"2024","unstructured":"Hao Fang, Tong Zhang, Xiaofei Zhou, and Xinxin Zhang. 2024. Learning better video query with sam for video instance segmentation. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Yifan Feng Haoxuan You Zizhao Zhang Rongrong Ji and Yue Gao. 2019. Hypergraph neural networks. In AAAI.","DOI":"10.1609\/aaai.v33i01.33013558"},{"key":"e_1_3_2_1_13_1","volume-title":"Yeonchool Park, Hyunwoo Kim, Min-Jung Kim, and Seon Joo Kim.","author":"Han Su Ho","year":"2022","unstructured":"Su Ho Han, Sukjun Hwang, Seoung Wug Oh, Yeonchool Park, Hyunwoo Kim, Min-Jung Kim, and Seon Joo Kim. 2022. Visolo: Grid-based space-time aggregation for efficient online video instance segmentation. In CVPR."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Yan Han Peihao Wang Souvik Kundu Ying Ding and Zhangyang Wang. 2023. Vision HGNN: An Image is More than a Graph of Nodes. In ICCV.","DOI":"10.1109\/ICCV51070.2023.01820"},{"key":"e_1_3_2_1_15_1","volume-title":"Joon-Young Lee, and Seon Joo Kim.","author":"Heo Miran","year":"2022","unstructured":"Miran Heo, Sukjun Hwang, Jeongseok Hyun, Hanjung Kim, Seoung Wug Oh, Joon-Young Lee, and Seon Joo Kim. 2022a. A Generalized Framework for Video Instance Segmentation. arXiv preprint arXiv:2211.08834 (2022)."},{"key":"e_1_3_2_1_16_1","volume-title":"Joon-Young Lee, and Seon Joo Kim.","author":"Heo Miran","year":"2023","unstructured":"Miran Heo, Sukjun Hwang, Jeongseok Hyun, Hanjung Kim, Seoung Wug Oh, Joon-Young Lee, and Seon Joo Kim. 2023. A generalized framework for video instance segmentation. In CVPR."},{"key":"e_1_3_2_1_17_1","volume-title":"Joon-Young Lee, and Seon Joo Kim.","author":"Heo Miran","year":"2022","unstructured":"Miran Heo, Sukjun Hwang, Seoung Wug Oh, Joon-Young Lee, and Seon Joo Kim. 2022b. Vita: Video instance segmentation via object token association. arXiv preprint arXiv:2206.04403 (2022)."},{"key":"e_1_3_2_1_18_1","volume-title":"Minvis: A minimal video instance segmentation framework without video-based training. arXiv preprint arXiv:2208.02245","author":"Huang De-An","year":"2022","unstructured":"De-An Huang, Zhiding Yu, and Anima Anandkumar. 2022. Minvis: A minimal video instance segmentation framework without video-based training. arXiv preprint arXiv:2208.02245 (2022)."},{"key":"e_1_3_2_1_19_1","volume-title":"Seoung Wug Oh, and Seon Joo Kim","author":"Hwang Sukjun","year":"2021","unstructured":"Sukjun Hwang, Miran Heo, Seoung Wug Oh, and Seon Joo Kim. 2021. Video instance segmentation using inter-frame communication transformers. In NeurIPS."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"crossref","unstructured":"Jianwen Jiang Yuxuan Wei Yifan Feng Jingxuan Cao and Yue Gao. 2019. Dynamic hypergraph neural networks.. In IJCAI.","DOI":"10.24963\/ijcai.2019\/366"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.4103\/AGINGADVANCES.AGINGADV-D-24-00004"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Dahun Kim Sanghyun Woo Joon-Young Lee and In So Kweon. 2020. Video panoptic segmentation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00988"},{"key":"e_1_3_2_1_23_1","volume-title":"Seoung Wug Oh, and Seon Joo Kim","author":"Kim Hanjung","year":"2025","unstructured":"Hanjung Kim, Jaehyun Kang, Miran Heo, Sukjun Hwang, Seoung Wug Oh, and Seon Joo Kim. 2025. VISAGE: Video Instance Segmentation with Appearance-Guided Enhancement. In ECCV."},{"key":"e_1_3_2_1_24_1","volume-title":"Tcovis: Temporally consistent online video instance segmentation. In ICCV.","author":"Li Junlong","year":"2023","unstructured":"Junlong Li, Bingyao Yu, Yongming Rao, Jie Zhou, and Jiwen Lu. 2023a. Tcovis: Temporally consistent online video instance segmentation. In ICCV."},{"key":"e_1_3_2_1_25_1","volume-title":"Tube-link: A flexible cross tube baseline for universal video segmentation. arXiv preprint arXiv:2303.12782","author":"Li Xiangtai","year":"2023","unstructured":"Xiangtai Li, Haobo Yuan, Wenwei Zhang, Guangliang Cheng, Jiangmiao Pang, and Chen Change Loy. 2023b. Tube-link: A flexible cross tube baseline for universal video segmentation. arXiv preprint arXiv:2303.12782 (2023)."},{"key":"e_1_3_2_1_26_1","unstructured":"Xiangtai Li Wenwei Zhang Jiangmiao Pang Kai Chen Guangliang Cheng Yunhai Tong and Chen Change Loy. 2022. Video k-net: A simple strong and unified baseline for video segmentation. In CVPR."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Jiaxu Miao Xiaohan Wang Yu Wu Wei Li Xu Zhang Yunchao Wei and Yi Yang. 2022. Large-scale video panoptic segmentation in the wild: A benchmark. In CVPR.","DOI":"10.1109\/CVPR52688.2022.02036"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neumar.2024.100033"},{"key":"e_1_3_2_1_29_1","volume-title":"Sliced Wasserstein with random-path projecting directions. arXiv preprint arXiv:2401.15889","author":"Nguyen Khai","year":"2024","unstructured":"Khai Nguyen, Shujian Zhang, Tam Le, and Nhat Ho. 2024. Sliced Wasserstein with random-path projecting directions. arXiv preprint arXiv:2401.15889 (2024)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3183388"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-022-01629-1"},{"key":"e_1_3_2_1_32_1","volume-title":"Vip-deeplab: Learning visual perception with depth-aware video panoptic segmentation. In CVPR.","author":"Qiao Siyuan","year":"2021","unstructured":"Siyuan Qiao, Yukun Zhu, Hartwig Adam, Alan Yuille, and Liang-Chieh Chen. 2021. Vip-deeplab: Learning visual perception with depth-aware video panoptic segmentation. In CVPR."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3328485"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JAS.2023.123456"},{"key":"e_1_3_2_1_35_1","unstructured":"Zheyun Qin Xiankai Lu Xiushan Nie Xiantong Zhen and Yilong Yin. 2021. Learning hierarchical embedding for video instance segmentation. In ACM MM."},{"key":"e_1_3_2_1_36_1","volume-title":"So Kweon, Kuk-Jin Yoon, and Liang-Chieh Chen. 2023","author":"Shin Inkyu","year":"2023","unstructured":"Inkyu Shin, Dahun Kim, Qihang Yu, Jun Xie, Hong-Seok Kim, Bradley Green, In So Kweon, Kuk-Jin Yoon, and Liang-Chieh Chen. 2023. Video-kMaX: A simple unified approach for online and near-online video panoptic segmentation. arXiv preprint arXiv:2304.04694 (2023)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Yuqing Wang Zhaoliang Xu Xinlong Wang Chunhua Shen Baoshan Cheng Hao Shen and Huaxia Xia. 2021. End-to-end video instance segmentation with transformers. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00863"},{"key":"e_1_3_2_1_38_1","unstructured":"Sanghyun Woo Dahun Kim Joon-Young Lee and In So Kweon. 2021. Learning to associate every segment for video panoptic segmentation. In CVPR."},{"key":"e_1_3_2_1_39_1","volume-title":"Seqformer: Sequential transformer for video instance segmentation. In ECCV.","author":"Wu Junfeng","year":"2022","unstructured":"Junfeng Wu, Yi Jiang, Song Bai, Wenqing Zhang, and Xiang Bai. 2022a. Seqformer: Sequential transformer for video instance segmentation. In ECCV."},{"key":"e_1_3_2_1_40_1","unstructured":"Junfeng Wu Qihao Liu Yi Jiang Song Bai Alan Yuille and Xiang Bai. 2022b. In defense of online models for video instance segmentation. In ECCV."},{"key":"e_1_3_2_1_41_1","unstructured":"Jialian Wu Sudhir Yarram Hui Liang Tian Lan Junsong Yuan Jayan Eledath and Gerard Medioni. 2022c. Efficient video instance segmentation via tracklet query and proposal. In CVPR."},{"key":"e_1_3_2_1_42_1","volume-title":"Hypergcn: A new method for training graph convolutional networks on hypergraphs. NeurIPS","author":"Yadati Naganand","year":"2019","unstructured":"Naganand Yadati, Madhav Nimishakavi, Prateek Yadav, Vikram Nitin, Anand Louis, and Partha Talukdar. 2019. Hypergcn: A new method for training graph convolutional networks on hypergraphs. NeurIPS (2019)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Linjie Yang Yuchen Fan and Ning Xu. 2019. Video instance segmentation. In ICCV.","DOI":"10.1109\/ICCV.2019.00529"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Shusheng Yang Yuxin Fang Xinggang Wang Yu Li Chen Fang Ying Shan Bin Feng and Wenyu Liu. 2021. Crossover learning for fast online video instance segmentation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00794"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00089"},{"key":"e_1_3_2_1_46_1","volume-title":"Robust Online Video Instance Segmentation with Track Queries. arXiv preprint arXiv:2211.09108","author":"Zhan Zitong","year":"2022","unstructured":"Zitong Zhan, Daniel McKee, and Svetlana Lazebnik. 2022. Robust Online Video Instance Segmentation with Track Queries. arXiv preprint arXiv:2211.09108 (2022)."},{"key":"e_1_3_2_1_47_1","volume-title":"Mobileinst: Video instance segmentation on the mobile. In AAAI.","author":"Zhang Renhong","year":"2024","unstructured":"Renhong Zhang, Tianheng Cheng, Shusheng Yang, Haoyi Jiang, Shuai Zhang, Jiancheng Lyu, Xin Li, Xiaowen Ying, Dashan Gao, Wenyu Liu, et al., 2024. Mobileinst: Video instance segmentation on the mobile. In AAAI."},{"key":"e_1_3_2_1_48_1","volume-title":"Dvis: Decoupled video instance segmentation framework. In ICCV.","author":"Zhang Tao","year":"2023","unstructured":"Tao Zhang, Xingye Tian, Yu Wu, Shunping Ji, Xuebo Wang, Yuan Zhang, and Pengfei Wan. 2023. Dvis: Decoupled video instance segmentation framework. In ICCV."},{"key":"e_1_3_2_1_49_1","volume-title":"Dvis: Improved decoupled framework for universal video segmentation","author":"Zhang Tao","year":"2025","unstructured":"Tao Zhang, Xingye Tian, Yikang Zhou, Shunping Ji, Xuebo Wang, Xin Tao, Yuan Zhang, Pengfei Wan, Zhongyuan Wang, and Yu Wu. 2025. Dvis: Improved decoupled framework for universal video segmentation. IEEE Transactions on Pattern Analysis and Machine Intelligence (2025)."},{"key":"e_1_3_2_1_50_1","volume-title":"Luc Van Gool, and Wenguan Wang","author":"Zhou Tianfei","year":"2022","unstructured":"Tianfei Zhou, Fatih Porikli, David J Crandall, Luc Van Gool, and Wenguan Wang. 2022. A survey on deep learning technique for video segmentation. IEEE transactions on pattern analysis and machine intelligence, Vol. 45, 6 (2022), 7099-7122."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Yikang Zhou Tao Zhang Shunping Ji Shuicheng Yan and Xiangtai Li. 2024. DVIS-DAQ: Improving Video Segmentation via Dynamic Anchor Queries. In ECCV.","DOI":"10.1007\/978-3-031-72973-7_26"},{"key":"e_1_3_2_1_52_1","volume-title":"Dynamical Attention Hypergraph Convolutional Network for Group Activity Recognition","author":"Zhu Xiaolin","year":"2024","unstructured":"Xiaolin Zhu, Dongli Wang, Jianxun Li, Rui Su, Qin Wan, and Yan Zhou. 2024. Dynamical Attention Hypergraph Convolutional Network for Group Activity Recognition. IEEE Transactions on Neural Networks and Learning Systems (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755007","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:18:30Z","timestamp":1765307910000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755007"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":52,"alternative-id":["10.1145\/3746027.3755007","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755007","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}