{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:45:16Z","timestamp":1765309516585,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":88,"publisher":"ACM","funder":[{"name":"Ministry of Education, Singapore","award":["MOE-MOET32022-0001"],"award-info":[{"award-number":["MOE-MOET32022-0001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3754863","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:56:44Z","timestamp":1761375404000},"page":"1013-1022","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["LEAF-Mamba: Local Emphatic and Adaptive Fusion State Space Model for RGB-D Salient Object Detection"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-6420-5971","authenticated-orcid":false,"given":"Lanhu","family":"Wu","sequence":"first","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1757-9349","authenticated-orcid":false,"given":"Zilin","family":"Gao","sequence":"additional","affiliation":[{"name":"Dalian University of Technology, Dalian, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3026-6347","authenticated-orcid":false,"given":"Hao","family":"Fei","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9636-388X","authenticated-orcid":false,"given":"Mong-Li","family":"Lee","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4142-8893","authenticated-orcid":false,"given":"Wynne","family":"Hsu","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"1597","article-title":"Frequency-tuned salient region detection","author":"Achanta Radhakrishna","year":"2009","unstructured":"Radhakrishna Achanta, Sheila Hemami, Francisco Estrada, and Sabine Susstrunk. 2009. Frequency-tuned salient region detection. In CVPR. 1597-1604.","journal-title":"CVPR."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2020.2968250"},{"key":"e_1_3_2_1_3_1","first-page":"357","article-title":"CrossViT: Cross-attention multi-scale vision transformer for image classification","author":"Richard Chen Chun-Fu","year":"2021","unstructured":"Chun-Fu Richard Chen, Quanfu Fan, and Rameswar Panda. 2021. CrossViT: Cross-attention multi-scale vision transformer for image classification. In ICCV. 357-366.","journal-title":"ICCV."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3215979"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2024.3358858"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3364022"},{"key":"e_1_3_2_1_7_1","volume-title":"Towards reasoning era: A survey of long chain-of-thought for reasoning large language models. arXiv preprint arXiv:2503.09567","author":"Chen Qiguang","year":"2025","unstructured":"Qiguang Chen, Libo Qin, Jinhao Liu, Dengyun Peng, Jiannan Guan, Peng Wang, Mengkang Hu, Yuhang Zhou, Te Gao, and Wanxiang Che. 2025. Towards reasoning era: A survey of long chain-of-thought for reasoning large language models. arXiv preprint arXiv:2503.09567 (2025)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Lun-Wei Ku, Andre Martins, and Vivek Srikumar (Eds.). 8199-8221","author":"Chen Qiguang","year":"2024","unstructured":"Qiguang Chen, Libo Qin, Jin Zhang, Zhi Chen, Xiao Xu, and Wanxiang Che. 2024a. M^3CoT: A Novel Benchmark for Multi-Domain Multi-step Multi-modal Chain-of-Thought. In Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Lun-Wei Ku, Andre Martins, and Vivek Srikumar (Eds.). 8199-8221."},{"volume-title":"Progressively guided alternate refinement network for RGB-D salient object detection","author":"Chen Shuhan","key":"e_1_3_2_1_9_1","unstructured":"Shuhan Chen and Yun Fu. 2020. Progressively guided alternate refinement network for RGB-D salient object detection. In ECCV. Springer, 520-538."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i22.34538"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3216198"},{"key":"e_1_3_2_1_12_1","first-page":"406","article-title":"Point-aware interaction and cnn-induced refinement network for RGB-D salient object detection","author":"Cong Runmin","year":"2023","unstructured":"Runmin Cong, Hongyu Liu, Chen Zhang, Wei Zhang, Feng Zheng, Ran Song, and Sam Kwong. 2023. Point-aware interaction and cnn-induced refinement network for RGB-D salient object detection. In ACM MM. 406-416.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_13_1","volume-title":"Huy Hoang Nguyen, and Aleksei Tiulpin","author":"Quoc Dang Trung Dinh","year":"2024","unstructured":"Trung Dinh Quoc Dang, Huy Hoang Nguyen, and Aleksei Tiulpin. 2024. LoG-VMamba: Local-Global Vision Mamba for Medical Image Segmentation. In ACCV. 548-565."},{"key":"e_1_3_2_1_14_1","first-page":"10041","article-title":"Transformers are SSMs: generalized models and efficient algorithms through structured state space duality","author":"Dao Tri","year":"2024","unstructured":"Tri Dao and Albert Gu. 2024. Transformers are SSMs: generalized models and efficient algorithms through structured state space duality. In ICML. 10041-10071.","journal-title":"ICML."},{"key":"e_1_3_2_1_15_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2021. An image is worth 16x16 words: Transformers for image recognition at scale. ICLR (2021)."},{"key":"e_1_3_2_1_16_1","first-page":"4548","article-title":"Structure-measure: A new way to evaluate foreground maps","author":"Fan Deng-Ping","year":"2017","unstructured":"Deng-Ping Fan, Ming-Ming Cheng, Yun Liu, Tao Li, and Ali Borji. 2017. Structure-measure: A new way to evaluate foreground maps. In ICCV. 4548-4557.","journal-title":"ICCV."},{"key":"e_1_3_2_1_17_1","volume-title":"Enhanced-alignment measure for binary foreground map evaluation. IJCAI","author":"Fan Deng-Ping","year":"2018","unstructured":"Deng-Ping Fan, Cheng Gong, Yang Cao, Bo Ren, Ming-Ming Cheng, and Ali Borji. 2018. Enhanced-alignment measure for binary foreground map evaluation. IJCAI (2018), 698-704."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2020.2996406"},{"volume-title":"BBS-Net: RGB-D salient object detection with a bifurcated backbone strategy network","author":"Fan Deng-Ping","key":"e_1_3_2_1_19_1","unstructured":"Deng-Ping Fan, Yingjie Zhai, Ali Borji, Jufeng Yang, and Ling Shao. 2020b. BBS-Net: RGB-D salient object detection with a bifurcated backbone strategy network. In ECCV. Springer, 275-292."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the International Conference on Machine Learning, ICML, . 6373-6391","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Wei Ji, Hanwang Zhang, Meishan Zhang, Mong Li Lee, and Wynne Hsu. 2024a. Video-of-Thought: Step-by-Step Video Reasoning from Perception to Cognition. In Proceedings of the International Conference on Machine Learning, ICML, . 6373-6391."},{"key":"e_1_3_2_1_21_1","volume-title":"Editing. Proceedings of the Advances in neural information processing systems.","author":"Fei Hao","year":"2024","unstructured":"Hao Fei, Shengqiong Wu, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024b. VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting, Editing. Proceedings of the Advances in neural information processing systems."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3393452"},{"key":"e_1_3_2_1_23_1","volume-title":"Proceedings of the International Conference on Machine Learning, ICML, .","author":"Fei Hao","year":"2025","unstructured":"Hao Fei, Yuan Zhou, Juncheng Li, Xiangtai Li, Qingshan Xu, Bobo Li, Shengqiong Wu, Yaoting Wang, Junbao Zhou, Jiahao Meng, et al., 2025. On path to multimodal generalist: General-level and general-bench. In Proceedings of the International Conference on Machine Learning, ICML, ."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2024.112022"},{"key":"e_1_3_2_1_25_1","volume-title":"MSFMamba: Multi-Scale Feature Fusion State Space Model for Multi-Source Remote Sensing Image Classification","author":"Gao Feng","year":"2025","unstructured":"Feng Gao, Xuepeng Jin, Xiaowei Zhou, Junyu Dong, and Qian Du. 2025. MSFMamba: Multi-Scale Feature Fusion State Space Model for Multi-Source Remote Sensing Image Classification. IEEE Transactions on Geoscience and Remote Sensing (2025)."},{"key":"e_1_3_2_1_26_1","volume-title":"Mamba: Linear-time sequence modeling with selective state spaces. COLM","author":"Gu Albert","year":"2023","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-time sequence modeling with selective state spaces. COLM (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Efficiently modeling long sequences with structured state spaces. ICLR","author":"Gu Albert","year":"2021","unstructured":"Albert Gu, Karan Goel, and Christopher R\u00e9. 2021. Efficiently modeling long sequences with structured state spaces. ICLR (2021)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02020-y"},{"key":"e_1_3_2_1_29_1","volume-title":"LocalMamba: Visual state space model with windowed selective scan. arXiv preprint arXiv:2403.09338","author":"Huang Tao","year":"2024","unstructured":"Tao Huang, Xiaohuan Pei, Shan You, Fei Wang, Chen Qian, and Chang Xu. 2024. LocalMamba: Visual state space model with windowed selective scan. arXiv preprint arXiv:2403.09338 (2024)."},{"key":"e_1_3_2_1_30_1","first-page":"9471","article-title":"Calibrated RGB-D salient object detection","author":"Ji Wei","year":"2021","unstructured":"Wei Ji, Jingjing Li, Shuang Yu, Miao Zhang, Yongri Piao, Shunyu Yao, Qi Bi, Kai Ma, Yefeng Zheng, Huchuan Lu, et al., 2021. Calibrated RGB-D salient object detection. In CVPR. 9471-9481.","journal-title":"CVPR."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3154931"},{"key":"e_1_3_2_1_32_1","first-page":"1115","article-title":"Depth saliency based on anisotropic center-surround difference","author":"Ju Ran","year":"2014","unstructured":"Ran Ju, Ling Ge, Wenjing Geng, Tongwei Ren, and Gangshan Wu. 2014. Depth saliency based on anisotropic center-surround difference. In ICIP. IEEE, 1115-1119.","journal-title":"ICIP. IEEE"},{"key":"e_1_3_2_1_33_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_34_1","volume-title":"NeurIPS","volume":"25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. NeurIPS, Vol. 25 (2012)."},{"volume-title":"Cross-modal weighting network for RGB-D salient object detection","author":"Li Gongyang","key":"e_1_3_2_1_35_1","unstructured":"Gongyang Li, Zhi Liu, Linwei Ye, Yang Wang, and Haibin Ling. 2020. Cross-modal weighting network for RGB-D salient object detection. In ECCV. Springer, 665-681."},{"volume-title":"VideoMamba: State space model for efficient video understanding","author":"Li Kunchang","key":"e_1_3_2_1_36_1","unstructured":"Kunchang Li, Xinhao Li, Yi Wang, Yinan He, Yali Wang, Limin Wang, and Yu Qiao. 2024. VideoMamba: State space model for efficient video understanding. In ECCV. Springer, 237-255."},{"key":"e_1_3_2_1_37_1","first-page":"2806","article-title":"Saliency detection on light field","author":"Li Nianyi","year":"2014","unstructured":"Nianyi Li, Jinwei Ye, Yu Ji, Haibin Ling, and Jingyi Yu. 2014. Saliency detection on light field. In CVPR. 2806-2813.","journal-title":"CVPR."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.106"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3140168"},{"key":"e_1_3_2_1_40_1","volume-title":"VST: Efficient and stronger visual saliency transformer","author":"Liu Nian","year":"2024","unstructured":"Nian Liu, Ziyang Luo, Ni Zhang, and Junwei Han. 2024a. VST: Efficient and stronger visual saliency transformer. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_41_1","first-page":"103031","article-title":"VMamba: Visual state space model","volume":"37","author":"Liu Yue","year":"2024","unstructured":"Yue Liu, Yunjie Tian, Yuzhong Zhao, Hongtian Yu, Lingxi Xie, Yaowei Wang, Qixiang Ye, Jianbin Jiao, and Yunfan Liu. 2024b. VMamba: Visual state space model. NeurIPS, Vol. 37 (2024), 103031-103063.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_42_1","first-page":"4481","article-title":"TriTransNet: RGB-D salient object detection with a triplet transformer embedding network","author":"Liu Zhengyi","year":"2021","unstructured":"Zhengyi Liu, Yuan Wang, Zhengzheng Tu, Yun Xiao, and Bin Tang. 2021. TriTransNet: RGB-D salient object detection with a triplet transformer embedding network. In ACM MM. 4481-4490.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_43_1","volume-title":"HFMDNet: Hierarchical fusion and multi-level decoder network for RGB-D salient object detection","author":"Luo Yi","year":"2024","unstructured":"Yi Luo, Feng Shao, Zhengxuan Xie, Huizhi Wang, Hangwei Chen, Baoyang Mu, and Qiuping Jiang. 2024. HFMDNet: Hierarchical fusion and multi-level decoder network for RGB-D salient object detection. IEEE Transactions on Instrumentation and Measurement (2024)."},{"key":"e_1_3_2_1_44_1","first-page":"1007","article-title":"Saliency-based discriminant tracking","author":"Mahadevan Vijay","year":"2009","unstructured":"Vijay Mahadevan and Nuno Vasconcelos. 2009. Saliency-based discriminant tracking. In CVPR. IEEE, 1007-1013.","journal-title":"CVPR. IEEE"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3498326"},{"key":"e_1_3_2_1_46_1","first-page":"6894","article-title":"Vim4Path: Self-supervised vision mamba for histopathology images","author":"Nasiri-Sarvi Ali","year":"2024","unstructured":"Ali Nasiri-Sarvi, Vincent Quoc-Huy Trinh, Hassan Rivaz, and Mahdi S Hosseini. 2024. Vim4Path: Self-supervised vision mamba for histopathology images. In CVPR. 6894-6903.","journal-title":"CVPR."},{"key":"e_1_3_2_1_47_1","first-page":"454","article-title":"Leveraging stereopsis for saliency analysis","author":"Niu Yuzhen","year":"2012","unstructured":"Yuzhen Niu, Yujie Geng, Xueqing Li, and Feng Liu. 2012. Leveraging stereopsis for saliency analysis. In CVPR. IEEE, 454-461.","journal-title":"CVPR. IEEE"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3234702"},{"volume-title":"RGBD salient object detection: A benchmark and algorithms","author":"Peng Houwen","key":"e_1_3_2_1_49_1","unstructured":"Houwen Peng, Bing Li, Weihua Xiong, Weiming Hu, and Rongrong Ji. 2014. RGBD salient object detection: A benchmark and algorithms. In ECCV. Springer, 92-109."},{"key":"e_1_3_2_1_50_1","first-page":"7254","article-title":"Depth-induced multi-scale recurrent attention network for saliency detection","author":"Piao Yongri","year":"2019","unstructured":"Yongri Piao, Wei Ji, Jingjing Li, Miao Zhang, and Huchuan Lu. 2019. Depth-induced multi-scale recurrent attention network for saliency detection. In CVPR. 7254-7263.","journal-title":"CVPR."},{"key":"e_1_3_2_1_51_1","first-page":"9060","article-title":"A2Dele: Adaptive and attentive depth distiller for efficient RGB-D salient object detection","author":"Piao Yongri","year":"2020","unstructured":"Yongri Piao, Zhengkun Rong, Miao Zhang, Weisong Ren, and Huchuan Lu. 2020. A2Dele: Adaptive and attentive depth distiller for efficient RGB-D salient object detection. In CVPR. 9060-9069.","journal-title":"CVPR."},{"key":"e_1_3_2_1_52_1","first-page":"7479","article-title":"BasNet: Boundary-aware salient object detection","author":"Qin Xuebin","year":"2019","unstructured":"Xuebin Qin, Zichen Zhang, Chenyang Huang, Chao Gao, Masood Dehghan, and Martin Jagersand. 2019. BasNet: Boundary-aware salient object detection. In CVPR. 7479-7489.","journal-title":"CVPR."},{"key":"e_1_3_2_1_53_1","volume-title":"Simplified state space layers for sequence modeling. ICLR","author":"Smith Jimmy TH","year":"2023","unstructured":"Jimmy TH Smith, Andrew Warrington, and Scott W Linderman. 2023. Simplified state space layers for sequence modeling. ICLR (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3294003"},{"key":"e_1_3_2_1_55_1","volume-title":"Divide-and-Conquer: Confluent Triple-Flow Network for RGB-T Salient Object Detection","author":"Tang Hao","year":"2024","unstructured":"Hao Tang, Zechao Li, Dong Zhang, Shengfeng He, and Jinhui Tang. 2024. Divide-and-Conquer: Confluent Triple-Flow Network for RGB-T Salient Object Detection. IEEE Transactions on Pattern Analysis and Machine Intelligence (2024)."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2021.3087412"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3171688"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2924578"},{"key":"e_1_3_2_1_59_1","volume-title":"NeurIPS","volume":"30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. NeurIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_60_1","volume-title":"Sigma: Siamese Mamba network for multi-modal semantic segmentation. In WACV.","author":"Wan Zifu","year":"2024","unstructured":"Zifu Wan, Pingping Zhang, Yuhao Wang, Silong Yong, Simon Stepputtis, Katia Sycara, and Yaqi Xie. 2024. Sigma: Siamese Mamba network for multi-modal semantic segmentation. In WACV."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jag.2024.104092"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3140606"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-981-13-1702-6_36"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3375505"},{"key":"e_1_3_2_1_65_1","volume-title":"Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv preprint arXiv:2503.12605","author":"Wang Yaoting","year":"2025","unstructured":"Yaoting Wang, Shengqiong Wu, Yuecheng Zhang, Shuicheng Yan, Ziwei Liu, Jiebo Luo, and Hao Fei. 2025. Multimodal chain-of-thought reasoning: A comprehensive survey. arXiv preprint arXiv:2503.12605 (2025)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6916"},{"key":"e_1_3_2_1_67_1","first-page":"3","article-title":"CBAM: Convolutional block attention module","author":"Woo Sanghyun","year":"2018","unstructured":"Sanghyun Woo, Jongchan Park, Joon-Young Lee, and In So Kweon. 2018. CBAM: Convolutional block attention module. In ECCV. 3-19.","journal-title":"ECCV."},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3275308"},{"key":"e_1_3_2_1_69_1","volume-title":"Edge-Guided Bidirectional-Attention Residual Network for Polyp Segmentation. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 249-263","author":"Wu Lanhu","year":"2024","unstructured":"Lanhu Wu, Miao Zhang, Yongri Piao, Zhiwei Li, and Huchuan Lu. 2024c. Edge-Guided Bidirectional-Attention Residual Network for Polyp Segmentation. In Chinese Conference on Pattern Recognition and Computer Vision (PRCV). Springer, 249-263."},{"key":"e_1_3_2_1_70_1","volume-title":"CNN-Transformer rectified collaborative learning for medical image segmentation","author":"Wu Lanhu","year":"2024","unstructured":"Lanhu Wu, Miao Zhang, Yongri Piao, Zhenyan Yao, Weibing Sun, Feng Tian, and Huchuan Lu. 2024d. CNN-Transformer rectified collaborative learning for medical image segmentation. IEEE Transactions on Circuits and Systems for Video Technology (2024)."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01321"},{"key":"e_1_3_2_1_72_1","volume-title":"Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Xiangtai Li, Jiayi Ji, Hanwang Zhang, Tat-Seng Chua, and Shuicheng Yan. 2024a. Towards Semantic Equivalence of Tokenization in Multimodal LLM. arXiv preprint arXiv:2406.05127 (2024)."},{"key":"e_1_3_2_1_73_1","volume-title":"Proceedings of the International Conference on Machine Learning. 53366-53397","author":"Wu Shengqiong","year":"2024","unstructured":"Shengqiong Wu, Hao Fei, Leigang Qu, Wei Ji, and Tat-Seng Chua. 2024b. NExT-GPT: Any-to-Any Multimodal LLM. In Proceedings of the International Conference on Machine Learning. 53366-53397."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3134684"},{"key":"e_1_3_2_1_75_1","first-page":"3907","article-title":"Cascaded partial decoder for fast and accurate salient object detection","author":"Wu Zhe","year":"2019","unstructured":"Zhe Wu, Li Su, and Qingming Huang. 2019. Cascaded partial decoder for fast and accurate salient object detection. In CVPR. 3907-3916.","journal-title":"CVPR."},{"key":"e_1_3_2_1_76_1","first-page":"3455","article-title":"Object segmentation by mining cross-modal semantics","author":"Wu Zongwei","year":"2023","unstructured":"Zongwei Wu, Jingjing Wang, Zhuyun Zhou, Zhaochong An, Qiuping Jiang, C\u00e9dric Demonceaux, Guolei Sun, and Radu Timofte. 2023b. Object segmentation by mining cross-modal semantics. In ACM MM. 3455-3464.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3241196"},{"key":"e_1_3_2_1_78_1","volume-title":"PlainMamba: Improving non-hierarchical mamba in visual recognition. arXiv preprint arXiv:2403.17695","author":"Yang Chenhongyi","year":"2024","unstructured":"Chenhongyi Yang, Zehui Chen, Miguel Espinosa, Linus Ericsson, Zhenyu Wang, Jiaming Liu, and Elliot J Crowley. 2024a. PlainMamba: Improving non-hierarchical mamba in visual recognition. arXiv preprint arXiv:2403.17695 (2024)."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72083-3_28"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3315511"},{"key":"e_1_3_2_1_81_1","volume-title":"Jie Liu, Shuang Xu, Yongri Piao, and Huchuan Lu.","author":"Zhang Miao","year":"2020","unstructured":"Miao Zhang, Sun Xiao Fei, Jie Liu, Shuang Xu, Yongri Piao, and Huchuan Lu. 2020a. Asymmetric two-stream architecture for accurate RGB-D saliency detection. In ECCV. Springer, 374-390."},{"key":"e_1_3_2_1_82_1","first-page":"3472","article-title":"Select, supplement and focus for RGB-D saliency detection","author":"Zhang Miao","year":"2020","unstructured":"Miao Zhang, Weisong Ren, Yongri Piao, Zhengkun Rong, and Huchuan Lu. 2020b. Select, supplement and focus for RGB-D saliency detection. In CVPR. 3472-3481.","journal-title":"CVPR."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3187856"},{"key":"e_1_3_2_1_84_1","first-page":"731","article-title":"Depth quality-inspired feature manipulation for efficient RGB-D salient object detection","author":"Zhang Wenbo","year":"2021","unstructured":"Wenbo Zhang, Ge-Peng Ji, Zhuo Wang, Keren Fu, and Qijun Zhao. 2021. Depth quality-inspired feature manipulation for efficient RGB-D salient object detection. In ACM MM. 731-740.","journal-title":"ACM MM."},{"key":"e_1_3_2_1_85_1","first-page":"3927","article-title":"Contrast prior and fluid pyramid integration for RGBD salient object detection","author":"Zhao Jia-Xing","year":"2019","unstructured":"Jia-Xing Zhao, Yang Cao, Deng-Ping Fan, Ming-Ming Cheng, Xuan-Yi Li, and Le Zhang. 2019. Contrast prior and fluid pyramid integration for RGBD salient object detection. In CVPR. 3927-3936.","journal-title":"CVPR."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2023.3242775"},{"key":"e_1_3_2_1_87_1","volume-title":"ICCV workshop. 3008-3014","author":"Zhu Chunbiao","year":"2017","unstructured":"Chunbiao Zhu and Ge Li. 2017. A three-pathway psychobiological framework of salient object detection using stereoscopic technology. In ICCV workshop. 3008-3014."},{"key":"e_1_3_2_1_88_1","first-page":"62429","article-title":"Vision Mamba","author":"Zhu Lianghui","year":"2024","unstructured":"Lianghui Zhu, Bencheng Liao, Qian Zhang, Xinlong Wang, Wenyu Liu, and Xinggang Wang. 2024. Vision Mamba: Efficient Visual Representation Learning with Bidirectional State Space Model. In ICML. PMLR, 62429-62442.","journal-title":"In ICML. PMLR"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3754863","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:40:25Z","timestamp":1765309225000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3754863"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":88,"alternative-id":["10.1145\/3746027.3754863","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3754863","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}