{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T18:34:52Z","timestamp":1776882892354,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":54,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSFC","award":["No. 62276141"],"award-info":[{"award-number":["No. 62276141"]}]},{"name":"NSFC","award":["No. 61825601"],"award-info":[{"award-number":["No. 61825601"]}]},{"name":"National Key Research and Development Program of China","award":["2021ZD0112200"],"award-info":[{"award-number":["2021ZD0112200"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612017","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"3394-3402","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["Temporally Efficient Gabor Transformer for Unsupervised Video Object Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5659-7457","authenticated-orcid":false,"given":"Jiaqing","family":"Fan","sequence":"first","affiliation":[{"name":"Nanjing University of Aeronautics and Astronautics, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6470-8230","authenticated-orcid":false,"given":"Tiankang","family":"Su","sequence":"additional","affiliation":[{"name":"Nanjing University of Information Science and Technology, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1613-3401","authenticated-orcid":false,"given":"Kaihua","family":"Zhang","sequence":"additional","affiliation":[{"name":"Ministry of Education &amp; Nanjing University of Information Science and Technology, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2113-7853","authenticated-orcid":false,"given":"Bo","family":"Liu","sequence":"additional","affiliation":[{"name":"Walmart Global Tech, Sunnyvale, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5512-6984","authenticated-orcid":false,"given":"Qingshan","family":"Liu","sequence":"additional","affiliation":[{"name":"Nanjing University of Posts and Telecommunications, Nanjing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Face recognition: The problem of compensating for changes in illumination direction. TPAMI","author":"Adini Yael","year":"1997","unstructured":"Yael Adini, Yael Moses, and Shimon Ullman. 1997. Face recognition: The problem of compensating for changes in illumination direction. TPAMI (1997)."},{"key":"e_1_3_2_2_2_1","volume-title":"Mario Luvc i\u0107, and Cordelia Schmid","author":"Arnab Anurag","year":"2021","unstructured":"Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Luvc i\u0107, and Cordelia Schmid. 2021. Vivit: A video vision transformer. In ICCV."},{"key":"e_1_3_2_2_3_1","unstructured":"Gedas Bertasius Heng Wang and Lorenzo Torresani. 2021. Is space-time attention all you need for video understanding?. In ICML."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"crossref","unstructured":"Adam Botach Evgenii Zheltonozhskii and Chaim Baskin. 2022. End-to-end referring video object segmentation with multimodal transformers. In CVPR.","DOI":"10.1109\/CVPR52688.2022.00493"},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Yi-Wen Chen Xiaojie Jin Xiaohui Shen and Ming-Hsuan Yang. 2022. Video Salient Object Detection via Contrastive Features and Attention Modules. In WACV.","DOI":"10.1109\/WACV51458.2022.00061"},{"key":"e_1_3_2_2_6_1","unstructured":"Deng-Ping Fan Wenguan Wang Ming-Ming Cheng and Jianbing Shen. 2019. Shifting more attention to video salient object detection. In CVPR."},{"key":"e_1_3_2_2_7_1","unstructured":"Jiaqing Fan Tiankang Su Kaihua Zhang and Qingshan Liu. 2022. Bidirectionally Learning Dense Spatio-temporal Feature Propagation Network for Unsupervised Video Object Segmentation. In ACMMM."},{"key":"e_1_3_2_2_8_1","unstructured":"Yuchao Gu Lijuan Wang Ziqin Wang Yun Liu Ming-Ming Cheng and Shao-Ping Lu. 2020. Pyramid constrained self-attention network for fast video salient object detection. In AAAI."},{"key":"e_1_3_2_2_9_1","unstructured":"John Guibas Morteza Mardani Zongyi Li Andrew Tao Anima Anandkumar and Bryan Catanzaro. 2022. Adaptive fourier neural operators: Efficient token mixers for transformers. In ICLR."},{"key":"e_1_3_2_2_10_1","unstructured":"Yuan-Ting Hu Jia-Bin Huang and Alexander G Schwing. 2018. Unsupervised video object segmentation using motion saliency-guided spatio-temporal propagation. In ECCV."},{"key":"e_1_3_2_2_11_1","volume-title":"Fusionseg: Learning to combine motion and appearance for fully automatic segmentation of generic objects in videos. In CVPR.","author":"Jain Suyog Dutt","year":"2017","unstructured":"Suyog Dutt Jain, Bo Xiong, and Kristen Grauman. 2017. Fusionseg: Learning to combine motion and appearance for fully automatic segmentation of generic objects in videos. In CVPR."},{"key":"e_1_3_2_2_12_1","unstructured":"Ge-Peng Ji Keren Fu Zhe Wu Deng-Ping Fan Jianbing Shen and Ling Shao. 2021. Full-duplex strategy for video object segmentation. In ICCV."},{"key":"e_1_3_2_2_13_1","volume-title":"3D convolutional neural networks for human action recognition. TPAMI","author":"Ji Shuiwang","year":"2012","unstructured":"Shuiwang Ji, Wei Xu, Ming Yang, and Kai Yu. 2012. 3D convolutional neural networks for human action recognition. TPAMI (2012)."},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"Liming Jiang Bo Dai Wayne Wu and Chen Change Loy. 2021. Focal frequency loss for image reconstruction and synthesis. In ICCV.","DOI":"10.1109\/ICCV48922.2021.01366"},{"key":"e_1_3_2_2_15_1","volume-title":"Spatiotemporal saliency detection for video sequences based on random walk with restart. TIP","author":"Kim Hansang","year":"2015","unstructured":"Hansang Kim, Youngbae Kim, Jae-Young Sim, and Chang-Su Kim. 2015. Spatiotemporal saliency detection for video sequences based on random walk with restart. TIP (2015)."},{"key":"e_1_3_2_2_16_1","unstructured":"Youngjo Lee Hongje Seong and Euntai Kim. 2022. Iteratively selecting an easy reference frame makes unsupervised video object segmentation easier. In AAAI."},{"key":"e_1_3_2_2_17_1","unstructured":"Shuai Li Wanqing Li Chris Cook Ce Zhu and Yanbo Gao. 2018a. Independently recurrent neural network (indrnn): Building a longer and deeper rnn. In CVPR."},{"key":"e_1_3_2_2_18_1","unstructured":"Siyang Li Bryan Seybold Alexey Vorobyov Alireza Fathi Qin Huang and C-C Jay Kuo. 2018b. Instance embedding transfer to unsupervised video object segmentation. In CVPR."},{"key":"e_1_3_2_2_19_1","volume-title":"Gabor feature based classification using the enhanced fisher linear discriminant model for face recognition. TIP","author":"Liu Chengjun","year":"2002","unstructured":"Chengjun Liu and Harry Wechsler. 2002. Gabor feature based classification using the enhanced fisher linear discriminant model for face recognition. TIP (2002)."},{"key":"e_1_3_2_2_20_1","unstructured":"Daizong Liu Dongdong Yu Changhu Wang and Pan Zhou. 2021. F2Net: Learning to Focus on the Foreground for Unsupervised Video Object Segmentation. In AAAI."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","unstructured":"Yong Liu Ran Yu Fei Yin Xinyuan Zhao Wei Zhao Weihao Xia and Yujiu Yang. 2022. Learning Quality-aware Dynamic Memory for Video Object Segmentation. In ECCV.","DOI":"10.1007\/978-3-031-19818-2_27"},{"key":"e_1_3_2_2_22_1","unstructured":"Xiankai Lu Wenguan Wang Martin Danelljan Tianfei Zhou Jianbing Shen and Luc Van Gool. 2020a. Video object segmentation with episodic graph memory networks. In ECCV."},{"key":"e_1_3_2_2_23_1","unstructured":"Xiankai Lu Wenguan Wang Chao Ma Jianbing Shen Ling Shao and Fatih Porikli. 2019. See more know more: Unsupervised video object segmentation with co-attention siamese networks. In CVPR."},{"key":"e_1_3_2_2_24_1","unstructured":"Xiankai Lu Wenguan Wang Jianbing Shen Yu-Wing Tai David J Crandall and Steven CH Hoi. 2020b. Learning video object segmentation from unlabeled videos. In CVPR."},{"key":"e_1_3_2_2_25_1","unstructured":"Sachin Mehta and Mohammad Rastegari. 2022. Mobilevit: light-weight general-purpose and mobile-friendly vision transformer. In ICLR."},{"key":"e_1_3_2_2_26_1","volume-title":"Segmentation of moving objects by long term video analysis. TPAMI","author":"Ochs Peter","year":"2013","unstructured":"Peter Ochs, Jitendra Malik, and Thomas Brox. 2013. Segmentation of moving objects by long term video analysis. TPAMI (2013)."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Youwei Pang Xiaoqi Zhao Lihe Zhang and Huchuan Lu. 2020. Multi-scale interactive network for salient object detection. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00943"},{"key":"e_1_3_2_2_28_1","unstructured":"Namuk Park and Songkuk Kim. 2022. How Do Vision Transformers Work?. In ICLR."},{"key":"e_1_3_2_2_29_1","unstructured":"Gensheng Pei Yazhou Yao Guo-Sen Xie Fumin Shen Zhenmin Tang and Jinhui Tang. 2022. Hierarchical Feature Alignment Network for Unsupervised Video Object Segmentation. In ECCV."},{"key":"e_1_3_2_2_30_1","volume-title":"Markus Gross, and Alexander Sorkine-Hornung.","author":"Perazzi Federico","year":"2016","unstructured":"Federico Perazzi, Jordi Pont-Tuset, Brian McWilliams, Luc Van Gool, Markus Gross, and Alexander Sorkine-Hornung. 2016. A benchmark dataset and evaluation methodology for video object segmentation. In CVPR."},{"key":"e_1_3_2_2_31_1","unstructured":"Yongming Rao Wenliang Zhao Zheng Zhu Jiwen Lu and Jie Zhou. 2021. Global filter networks for image classification. In NeurIPS."},{"key":"e_1_3_2_2_32_1","volume-title":"Tenet: Triple excitation network for video salient object detection. In ECCV.","author":"Ren Sucheng","year":"2020","unstructured":"Sucheng Ren, Chu Han, Xin Yang, Guoqiang Han, and Shengfeng He. 2020. Tenet: Triple excitation network for video salient object detection. In ECCV."},{"key":"e_1_3_2_2_33_1","unstructured":"Sucheng Ren Wenxi Liu Yongtuo Liu Haoxin Chen Guoqiang Han and Shengfeng He. 2021. Reciprocal transformations for unsupervised video object segmentation. In CVPR."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Christian Schmidt Ali Athar Sabarinath Mahadevan and Bastian Leibe. 2022. D2Conv3D: Dynamic Dilated Convolutions for Object Segmentation in Videos. In WACV.","DOI":"10.1109\/WACV51458.2022.00199"},{"key":"e_1_3_2_2_35_1","volume-title":"Sequencer: Deep LSTM for Image Classification. In NeurIPS.","author":"Tatsunami Yuki","year":"2022","unstructured":"Yuki Tatsunami and Masato Taki. 2022. Sequencer: Deep LSTM for Image Classification. In NeurIPS."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Wenguan Wang Xiankai Lu Jianbing Shen David J Crandall and Ling Shao. 2019a. Zero-shot video object segmentation via attentive graph neural networks. In ICCV.","DOI":"10.1109\/ICCV.2019.00933"},{"key":"e_1_3_2_2_37_1","volume-title":"Consistent video saliency using local gradient flow optimization and global refinement. TIP","author":"Wang Wenguan","year":"2015","unstructured":"Wenguan Wang, Jianbing Shen, and Ling Shao. 2015. Consistent video saliency using local gradient flow optimization and global refinement. TIP (2015)."},{"key":"e_1_3_2_2_38_1","volume-title":"Steven CH Hoi, and Haibin Ling","author":"Wang Wenguan","year":"2019","unstructured":"Wenguan Wang, Hongmei Song, Shuyang Zhao, Jianbing Shen, Sanyuan Zhao, Steven CH Hoi, and Haibin Ling. 2019b. Learning unsupervised video object segmentation through visual attention. In CVPR."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"crossref","unstructured":"Jun Wei Shuhui Wang and Qingming Huang. 2020. F3Net: fusion feedback and focus for salient object detection. In AAAI.","DOI":"10.1609\/aaai.v34i07.6916"},{"key":"e_1_3_2_2_40_1","unstructured":"Jiannan Wu Yi Jiang Peize Sun Zehuan Yuan and Ping Luo. 2022. Language as Queries for Referring Video Object Segmentation. In CVPR."},{"key":"e_1_3_2_2_41_1","unstructured":"Jiangtao Xie Fei Long Jiaming Lv Qilong Wang and Peihua Li. 2022. Joint Distribution Matters: Deep Brownian Distance Covariance for Few-Shot Classification. In CVPR."},{"key":"e_1_3_2_2_42_1","volume-title":"Youtube-vos: Sequence-to-sequence video object segmentation. In ECCV.","author":"Xu Ning","year":"2018","unstructured":"Ning Xu, Linjie Yang, Yuchen Fan, Jianchao Yang, Dingcheng Yue, Yuchen Liang, Brian Price, Scott Cohen, and Thomas Huang. 2018. Youtube-vos: Sequence-to-sequence video object segmentation. In ECCV."},{"key":"e_1_3_2_2_43_1","unstructured":"Qinwei Xu Ruipeng Zhang Ya Zhang Yanfeng Wang and Qi Tian. 2021. A fourier-based framework for domain generalization. In CVPR."},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Shu Yang Lu Zhang Jinqing Qi Huchuan Lu Shuo Wang and Xiaoxing Zhang. 2021. Learning Motion-Appearance Co-Attention for Zero-Shot Video Object Segmentation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00159"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Zhao Yang Qiang Wang Luca Bertinetto Weiming Hu Song Bai and Philip HS Torr. 2019. Anchor diffusion for unsupervised video object segmentation. In ICCV.","DOI":"10.1109\/ICCV.2019.00102"},{"key":"e_1_3_2_2_46_1","unstructured":"Hongxu Yin Arash Vahdat Jose M Alvarez Arun Mallya Jan Kautz and Pavlo Molchanov. 2022. A-ViT: Adaptive Tokens for Efficient Vision Transformer. In CVPR."},{"key":"e_1_3_2_2_47_1","unstructured":"Bingyao Yu Wanhua Li Xiu Li Jiwen Lu and Jie Zhou. 2021. Frequency-aware spatiotemporal transformers for video inpainting detection. In ICCV."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"crossref","unstructured":"Kaihua Zhang Zicheng Zhao Dong Liu Qingshan Liu and Bo Liu. 2021b. Deep Transport Network for Unsupervised Video Object Segmentation. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00866"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"crossref","unstructured":"Miao Zhang Jie Liu Yifei Wang Yongri Piao Shunyu Yao Wei Ji Jingjing Li Huchuan Lu and Zhongxuan Luo. 2021a. Dynamic context-sensitive filtering network for video salient object detection. In ICCV.","DOI":"10.1109\/ICCV48922.2021.00158"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"crossref","unstructured":"Xiaoqi Zhao Youwei Pang Lihe Zhang Huchuan Lu and Lei Zhang. 2020. Suppress and balance: A simple gated network for salient object detection. In ECCV.","DOI":"10.1007\/978-3-030-58536-5_3"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","unstructured":"Mingmin Zhen Shiwei Li Lei Zhou Jiaxiang Shang Haoan Feng Tian Fang and Long Quan. 2020. Learning discriminative feature with crf for unsupervised video object segmentation. In ECCV.","DOI":"10.1007\/978-3-030-58583-9_27"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"crossref","unstructured":"Tianfei Zhou Jianwu Li Xueyi Li and Ling Shao. 2021. Target-aware object discovery and association for unsupervised video multi-object segmentation. In CVPR.","DOI":"10.1109\/CVPR46437.2021.00691"},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3225573"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"crossref","unstructured":"Tianfei Zhou Shunzhou Wang Yi Zhou Yazhou Yao Jianwu Li and Ling Shao. 2020. Motion-attentive transition for zero-shot video object segmentation. In AAAI.","DOI":"10.1109\/TIP.2020.3013162"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612017","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612017","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:00:11Z","timestamp":1755820811000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612017"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":54,"alternative-id":["10.1145\/3581783.3612017","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612017","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}