{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:51:51Z","timestamp":1765309911381,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":47,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62273241"],"award-info":[{"award-number":["62273241"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2024A1515011946"],"award-info":[{"award-number":["2024A1515011946"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Hong Kong RGC Collaborative Research Fund","award":["C5055-24G"],"award-info":[{"award-number":["C5055-24G"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755223","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:38Z","timestamp":1761377198000},"page":"3865-3874","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Hierarchical Spatiotemporal Context Aggregation and Speckle-aware Deformable Convolution for Echocardiography Video Segmentation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2218-591X","authenticated-orcid":false,"given":"Jingxing","family":"Guo","sequence":"first","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-4981-7529","authenticated-orcid":false,"given":"Guilian","family":"Chen","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7011-1729","authenticated-orcid":false,"given":"Yimu","family":"Sun","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0399-9089","authenticated-orcid":false,"given":"Huisi","family":"Wu","sequence":"additional","affiliation":[{"name":"Shenzhen University, Shenzhen, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7059-0929","authenticated-orcid":false,"given":"Jing","family":"Qin","sequence":"additional","affiliation":[{"name":"Hong Kong Polytechnic University, Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Xcit: Cross-covariance image transformers. Advances in neural information processing systems","author":"Ali Alaaeldin","year":"2021","unstructured":"Alaaeldin Ali, Hugo Touvron, Mathilde Caron, Piotr Bojanowski, Matthijs Douze, Armand Joulin, Ivan Laptev, Natalia Neverova, Gabriel Synnaeve, Jakob Verbeek, et al., 2021. Xcit: Cross-covariance image transformers. Advances in neural information processing systems, Vol. 34 (2021), 20014-20027."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3122256"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TUFFC.2022.3169684"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.3389\/fcvm.2020.00025"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00304"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19815-1_37"},{"key":"e_1_3_2_1_7_1","volume-title":"Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509","author":"Child Rewon","year":"2019","unstructured":"Rewon Child, Scott Gray, Alec Radford, and Ilya Sutskever. 2019. Generating long sequences with sparse transformers. arXiv preprint arXiv:1904.10509 (2019)."},{"key":"e_1_3_2_1_8_1","volume-title":"Twins: Revisiting the design of spatial attention in vision transformers. Advances in neural information processing systems","author":"Chu Xiangxiang","year":"2021","unstructured":"Xiangxiang Chu, Zhi Tian, Yuqing Wang, Bo Zhang, Haibing Ren, Xiaolin Wei, Huaxia Xia, and Chunhua Shen. 2021. Twins: Revisiting the design of spatial attention in vision transformers. Advances in neural information processing systems, Vol. 34 (2021), 9355-9366."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00919"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1161\/01.CIR.60.4.760"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01186"},{"key":"e_1_3_2_1_12_1","volume-title":"Axial attention in multidimensional transformers. arXiv preprint arXiv:1912.12180","author":"Ho Jonathan","year":"2019","unstructured":"Jonathan Ho, Nal Kalchbrenner, Dirk Weissenborn, and Tim Salimans. 2019. Axial attention in multidimensional transformers. arXiv preprint arXiv:1912.12180 (2019)."},{"key":"e_1_3_2_1_13_1","volume-title":"Medivista-sam: Zero-shot medical video analysis with spatio-temporal sam adaptation. arXiv preprint arXiv:2309.13539","author":"Kim Sekeun","year":"2023","unstructured":"Sekeun Kim, Kyungsang Kim, Jiang Hu, Cheng Chen, Zhiliang Lyu, Ren Hui, Sunghwan Kim, Zhengliang Liu, Aoxiao Zhong, Xiang Li, et al., 2023. Medivista-sam: Zero-shot medical video analysis with spatio-temporal sam adaptation. arXiv preprint arXiv:2309.13539 (2023)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_15_1","volume-title":"Reformer: The efficient transformer. arXiv preprint arXiv:2001.04451","author":"Kitaev Nikita","year":"2020","unstructured":"Nikita Kitaev, \u0141ukasz Kaiser, and Anselm Levskaya. 2020. Reformer: The efficient transformer. arXiv preprint arXiv:2001.04451 (2020)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TUFFC.2020.3003403"},{"key":"e_1_3_2_1_17_1","volume-title":"Pierre-Marc Jodoin, Thomas Grenier, et al.","author":"Leclerc Sarah","year":"2019","unstructured":"Sarah Leclerc, Erik Smistad, Joao Pedrosa, Andreas \u00d8stvik, Frederic Cervenansky, Florian Espinosa, Torvald Espeland, Erik Andreas Rye Berg, Pierre-Marc Jodoin, Thomas Grenier, et al., 2019. Deep learning for segmentation using an open large-scale dataset in 2D echocardiography. IEEE transactions on medical imaging, Vol. 38, 9 (2019), 2198-2210."},{"key":"e_1_3_2_1_18_1","volume-title":"Videomamba: State space model for efficient video understanding. arXiv preprint arXiv:2403.06977","author":"Li Kunchang","year":"2024","unstructured":"Kunchang Li, Xinhao Li, Yi Wang, Yinan He, Yali Wang, Limin Wang, and Yu Qiao. 2024. Videomamba: State space model for efficient video understanding. arXiv preprint arXiv:2403.06977 (2024)."},{"key":"e_1_3_2_1_19_1","volume-title":"Localvit: Bringing locality to vision transformers. arXiv preprint arXiv:2104.05707","author":"Li Yawei","year":"2021","unstructured":"Yawei Li, Kai Zhang, Jiezhang Cao, Radu Timofte, and Luc Van Gool. 2021. Localvit: Bringing locality to vision transformers. arXiv preprint arXiv:2104.05707 (2021)."},{"key":"e_1_3_2_1_20_1","volume-title":"SAMUS: Adapting segment anything model for clinically-friendly and generalizable ultrasound image segmentation. arXiv preprint arXiv:2309.06824","author":"Lin Xian","year":"2023","unstructured":"Xian Lin, Yangyang Xiang, Li Zhang, Xin Yang, Zengqiang Yan, and Li Yu. 2023. SAMUS: Adapting segment anything model for clinically-friendly and generalizable ultrasound image segmentation. arXiv preprint arXiv:2309.06824 (2023)."},{"key":"e_1_3_2_1_21_1","volume-title":"Deep pyramid local attention neural network for cardiac structure segmentation in two-dimensional echocardiography. Medical image analysis","author":"Liu Fei","year":"2021","unstructured":"Fei Liu, Kun Wang, Dan Liu, Xin Yang, and Jie Tian. 2021. Deep pyramid local attention neural network for cardiac structure segmentation in two-dimensional echocardiography. Medical image analysis, Vol. 67 (2021), 101873."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01170"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00320"},{"key":"e_1_3_2_1_24_1","first-page":"565","article-title":"V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV)","author":"Milletari Fausto","year":"2016","unstructured":"Fausto Milletari, Nassir Navab, and Seyed-Ahmad Ahmadi. 2016. V-net: Fully convolutional neural networks for volumetric medical image segmentation. In 2016 fourth international conference on 3D vision (3DV). Ieee, 565-571.","journal-title":"Ieee"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.ejmp.2019.10.001"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19772-7_1"},{"key":"e_1_3_2_1_27_1","unstructured":"David Ouyang Bryan He Amirata Ghorbani Matt P Lungren Euan A Ashley David H Liang and James Y Zou. 2019. Echonet-dynamic: a large new cardiac motion video data resource for medical machine learning. In NeurIPS ML4H Workshop. 1-11."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-2145-8"},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the IEEE\/CVF winter conference on applications of computer vision. 3531-3539","author":"Shen Zhuoran","year":"2021","unstructured":"Zhuoran Shen, Mingyuan Zhang, Haiyu Zhao, Shuai Yi, and Hongsheng Li. 2021. Efficient attention: Attention with linear complexities. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision. 3531-3539."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59725-2_45"},{"key":"e_1_3_2_1_31_1","volume-title":"Statistics of natural image categories. Network: computation in neural systems","author":"Torralba Antonio","year":"2003","unstructured":"Antonio Torralba and Aude Oliva. 2003. Statistics of natural image categories. Network: computation in neural systems, Vol. 14, 3 (2003), 391."},{"key":"e_1_3_2_1_32_1","volume-title":"Hieu Pham Huy, and Long Tran Quoc","author":"Van Phi Nguyen","year":"2023","unstructured":"Phi Nguyen Van, Hieu Pham Huy, and Long Tran Quoc. 2023. Echocardiography segmentation using neural ode-based diffeomorphic registration field. arXiv preprint arXiv:2306.09687 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_34_1","first-page":"1451","article-title":"Understanding convolution for semantic segmentation. In 2018 IEEE winter conference on applications of computer vision (WACV)","author":"Wang Panqu","year":"2018","unstructured":"Panqu Wang, Pengfei Chen, Ye Yuan, Ding Liu, Zehua Huang, Xiaodi Hou, and Garrison Cottrell. 2018. Understanding convolution for semantic segmentation. In 2018 IEEE winter conference on applications of computer vision (WACV). Ieee, 1451-1460.","journal-title":"Ieee"},{"key":"e_1_3_2_1_35_1","volume-title":"Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768","author":"Wang Sinong","year":"2020","unstructured":"Sinong Wang, Belinda Z Li, Madian Khabsa, Han Fang, and Hao Ma. 2020. Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-59713-9_60"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i3.25381"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.media.2022.102397"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00009"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01276"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17664"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/JBHI.2022.3221429"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00224"},{"key":"e_1_3_2_1_44_1","first-page":"30008","article-title":"Focal attention for long-range interactions in vision transformers","volume":"34","author":"Yang Jianwei","year":"2021","unstructured":"Jianwei Yang, Chunyuan Li, Pengchuan Zhang, Xiyang Dai, Bin Xiao, Lu Yuan, and Jianfeng Gao. 2021. Focal attention for long-range interactions in vision transformers. Advances in Neural Information Processing Systems, Vol. 34 (2021), 30008-30022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_45_1","unstructured":"Fisher Yu Vladlen Koltun and Thomas Funkhouser. 2017. Dilated Residual Networks. In Computer Vision and Pattern Recognition (CVPR)."},{"key":"e_1_3_2_1_46_1","volume-title":"Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al.","author":"Zaheer Manzil","year":"2020","unstructured":"Manzil Zaheer, Guru Guruganesh, Kumar Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, et al., 2020. Big bird: Transformers for longer sequences. Advances in neural information processing systems, Vol. 33 (2020), 17283-17297."},{"key":"e_1_3_2_1_47_1","volume-title":"Real-time echocardiography image analysis and quantification of cardiac indices. Medical image analysis","author":"Zamzmi Ghada","year":"2022","unstructured":"Ghada Zamzmi, Sivaramakrishnan Rajaraman, Li-Yueh Hsu, Vandana Sachdev, and Sameer Antani. 2022. Real-time echocardiography image analysis and quantification of cardiac indices. Medical image analysis, Vol. 80 (2022), 102438."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755223","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:49:22Z","timestamp":1765309762000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755223"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":47,"alternative-id":["10.1145\/3746027.3755223","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755223","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}