{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,16]],"date-time":"2026-05-16T01:30:54Z","timestamp":1778895054790,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","funder":[{"name":"Marine Conservation Enhancement Fund","award":["MCEF20107 and MCEF22112"],"award-info":[{"award-number":["MCEF20107 and MCEF22112"]}]},{"name":"internal grant from HKUST","award":["R9429"],"award-info":[{"award-number":["R9429"]}]},{"name":"HKUST Marine Robotics and Blue Economy Technology Grant"},{"name":"Sustainable Smart Campus as a Living Lab"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3758198","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T05:44:48Z","timestamp":1761371088000},"page":"12621-12628","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["MSC: A Marine Wildlife Dataset for Video Understanding with Grounded Segmentation and Clip-Level Captions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6242-2191","authenticated-orcid":false,"given":"Quang-Trung","family":"Truong","sequence":"first","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4444-5502","authenticated-orcid":false,"given":"Yuk-Kwan","family":"Wong","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4000-2217","authenticated-orcid":false,"given":"Vo","family":"Hoang Kim Tuyen Dang","sequence":"additional","affiliation":[{"name":"Ho Chi Minh University of Science, Ho Chi Minh, Vietnam"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5313-8589","authenticated-orcid":false,"given":"Rinaldi","family":"Gotama","sequence":"additional","affiliation":[{"name":"Indo Ocean Foundation, Bali, Indonesia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2285-2066","authenticated-orcid":false,"given":"Duc Thanh","family":"Nguyen","sequence":"additional","affiliation":[{"name":"Deakin University, Burwood, Australia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7974-0607","authenticated-orcid":false,"given":"Sai-Kit","family":"Yeung","sequence":"additional","affiliation":[{"name":"Hong Kong University of Science and Technology, Hong Kong SAR, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Gemini. https:\/\/gemini.google.com\/."},{"key":"e_1_3_2_1_2_1","unstructured":"GPT-4.1. https:\/\/openai.com\/index\/gpt-4--1\/."},{"key":"e_1_3_2_1_3_1","unstructured":"Hailuo. https:\/\/hailuoai.video."},{"key":"e_1_3_2_1_4_1","unstructured":"Kling 1.5. https:\/\/klingai.com\/."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings, Part V 14","author":"Anderson Peter","year":"2016","unstructured":"Peter Anderson, Basura Fernando, Mark Johnson, and Stephen Gould. 2016. Spice: Semantic propositional image caption evaluation. In Computer Vision--ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, October 11--14, 2016, Proceedings, Part V 14. Springer, 382--398."},{"key":"e_1_3_2_1_6_1","volume-title":"ViCaS: A Dataset for Combining Holistic and Pixel-level Video Understanding using Captions with Grounded Segmentation. arXiv preprint arXiv:2412.09754","author":"Athar Ali","year":"2024","unstructured":"Ali Athar, Xueqing Deng, and Liang-Chieh Chen. 2024. ViCaS: A Dataset for Combining Holistic and Pixel-level Video Understanding using Captions with Grounded Segmentation. arXiv preprint arXiv:2412.09754 (2024)."},{"key":"e_1_3_2_1_7_1","volume-title":"Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966","author":"Bai Jinze","year":"2023","unstructured":"Jinze Bai, Shuai Bai, Shusheng Yang, Shijie Wang, Sinan Tan, Peng Wang, Junyang Lin, Chang Zhou, and Jingren Zhou. 2023. Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond. arXiv preprint arXiv:2308.12966 (2023)."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72","author":"Banerjee Satanjeev","year":"2005","unstructured":"Satanjeev Banerjee and Alon Lavie. 2005. METEOR: An automatic metric for MT evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 65--72."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"Otto Brookes Maksim Kukushkin Majid Mirmehdi Colleen Stephens Paula Dieguez Thurston C Hicks Sorrel Jones Kevin Lee Maureen S McCarthy Amelia Meier et al. 2025. The PanAf-FGBG Dataset: Understanding the Impact of Backgrounds in Wildlife Behaviour Recognition. arXiv preprint arXiv:2502.21201 (2025).","DOI":"10.1109\/CVPR52734.2025.00511"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00254"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00624"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01266"},{"key":"e_1_3_2_1_13_1","first-page":"48955","article-title":"Miradata: A large-scale video dataset with long durations and structured captions","volume":"37","author":"Ju Xuan","year":"2024","unstructured":"Xuan Ju, Yiming Gao, Zhaoyang Zhang, Ziyang Yuan, Xintao Wang, Ailing Zeng, Yu Xiong, Qiang Xu, and Ying Shan. 2024. Miradata: A large-scale video dataset with long durations and structured captions. Advances in Neural Information Processing Systems 37 (2024), 48955--48970.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_14_1","volume-title":"Computer Vision--ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, December 2--6","author":"Khoreva Anna","year":"2018","unstructured":"Anna Khoreva, Anna Rohrbach, and Bernt Schiele. 2019. Video object segmentation with language referring expressions. In Computer Vision--ACCV 2018: 14th Asian Conference on Computer Vision, Perth, Australia, December 2--6, 2018, Revised Selected Papers, Part IV 14. Springer, 123--141."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_16_1","volume-title":"William Yang Wang, et al.","author":"Li Linjie","year":"2021","unstructured":"Linjie Li, Jie Lei, Zhe Gan, Licheng Yu, Yen-Chun Chen, Rohit Pillai, Yu Cheng, Luowei Zhou, Xin Eric Wang, William Yang Wang, et al. 2021. Value: A multi-task benchmark for video-and-language understanding evaluation. arXiv preprint arXiv:2106.04632 (2021)."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00126"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the 41st International Conference on Machine Learning. 29545--29559","author":"Lian Shijie","year":"2024","unstructured":"Shijie Lian, Ziyi Zhang, Hua Li, Wenjie Li, Laurence Tianruo Yang, Sam Kwong, and Runmin Cong. 2024. Diving into underwater: Segment anything model guided underwater salient instance segmentation and a large-scale dataset. In Proceedings of the 41st International Conference on Machine Learning. 29545--29559."},{"key":"e_1_3_2_1_19_1","volume-title":"Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81.","author":"Lin Chin-Yew","year":"2004","unstructured":"Chin-Yew Lin. 2004. Rouge: A package for automatic evaluation of summaries. In Text summarization branches out. 74--81."},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of the the European Conference on Computer Vision. 740--755","author":"Lin Tsung-Yi","unstructured":"Tsung-Yi Lin, Michael Maire, Serge J. Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. 2014. Microsoft COCO: Common Objects in Context. In Proceedings of the the European Conference on Computer Vision. 740--755."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"e_1_3_2_1_22_1","volume-title":"HOIGen-1M: A Large-scale Dataset for Human-Object Interaction Video Generation. arXiv preprint arXiv:2503.23715","author":"Liu Kun","year":"2025","unstructured":"Kun Liu, Qi Liu, Xinchen Liu, Jie Li, Yongdong Zhang, Jiebo Luo, Xiaodong He, and Wu Liu. 2025. HOIGen-1M: A Large-scale Dataset for Human-Object Interaction Video Generation. arXiv preprint arXiv:2503.23715 (2025)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Shilong Liu Zhaoyang Zeng Tianhe Ren Feng Li Hao Zhang Jie Yang Chunyuan Li Jianwei Yang Hang Su Jun Zhu et al. 2024. Grounding dino: Marrying dino with grounded pre-training for open-set object detection. In arXiv preprint arXiv:2303.05499. 38--55.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"e_1_3_2_1_24_1","volume-title":"Latte: Latent diffusion transformer for video generation. Transactions on Machine Learning Research","author":"Ma Xin","year":"2025","unstructured":"Xin Ma, Yaohui Wang, Xinyuan Chen, Gengyun Jia, Ziwei Liu, Yuan-Fang Li, Cunjian Chen, and Yu Qiao. 2025. Latte: Latent diffusion transformer for video generation. Transactions on Machine Learning Research (2025)."},{"key":"e_1_3_2_1_25_1","volume-title":"Mridul Khurana, Zhenyang Feng, Bahadir Altintas, Yasin Bakis, Elizabeth G Campolongo, et al.","author":"Mehrab Kazi Sajeed","year":"2024","unstructured":"Kazi Sajeed Mehrab, M Maruf, Arka Daw, Abhilash Neog, Harish Babu Manogaran, Mridul Khurana, Zhenyang Feng, Bahadir Altintas, Yasin Bakis, Elizabeth G Campolongo, et al. 2024. Fish-vista: A multi-purpose dataset for understanding & identification of traits from images. arXiv preprint arXiv:2407.08027 (2024)."},{"key":"e_1_3_2_1_26_1","volume-title":"Fahad Shahbaz Khan, and Salman Khan","author":"Munasinghe Shehan","year":"2024","unstructured":"Shehan Munasinghe, Hanan Gani, Wenqi Zhu, Jiale Cao, Eric Xing, Fahad Shahbaz Khan, and Salman Khan. 2024. VideoGLaMM: A Large Multimodal Model for Pixel-Level Visual Grounding in Videos. arXiv preprint arXiv:2411.04923 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"BASKET: A Large-Scale Video Dataset for Fine-Grained Skill Estimation. arXiv preprint arXiv:2503.20781","author":"Pan Yulu","year":"2025","unstructured":"Yulu Pan, Ce Zhang, and Gedas Bertasius. 2025. BASKET: A Large-Scale Video Dataset for Fine-Grained Skill Estimation. arXiv preprint arXiv:2503.20781 (2025)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318","author":"Papineni Kishore","year":"2002","unstructured":"Kishore Papineni, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting of the Association for Computational Linguistics. 311--318."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01236"},{"key":"e_1_3_2_1_30_1","unstructured":"Nikhila Ravi Valentin Gabeur Yuan-Ting Hu Ronghang Hu Chaitanya Ryali Tengyu Ma Haitham Khedr Roman R\u00e4dle Chloe Rolland Laura Gustafson et al. 2024. Sam 2: Segment anything in images and videos. arXiv preprint arXiv:2408.00714 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Results of the 2024 Video Browser Showdown. arXiv preprint arXiv:2502.15683","author":"Rossetto Luca","year":"2024","unstructured":"Luca Rossetto, Klaus Schoeffmann, Cathal Gurrin, Jakub Lokoc, and Werner Bailer. 2024. Results of the 2024 Video Browser Showdown. arXiv preprint arXiv:2502.15683 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"How2: a large-scale dataset for multimodal language understanding. arXiv preprint arXiv:1811.00347","author":"Sanabria Ramon","year":"2018","unstructured":"Ramon Sanabria, Ozan Caglayan, Shruti Palaskar, Desmond Elliott, Lo\u00efc Barrault, Lucia Specia, and Florian Metze. 2018. How2: a large-scale dataset for multimodal language understanding. arXiv preprint arXiv:1811.00347 (2018)."},{"key":"e_1_3_2_1_33_1","volume-title":"Proceedings, Part XV 16","author":"Seo Seonguk","year":"2020","unstructured":"Seonguk Seo, Joon-Young Lee, and Bohyung Han. 2020. Urvos: Unified referring video object segmentation network with a large-scale benchmark. In Computer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part XV 16. Springer, 208--223."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01836"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-27077-2_42"},{"key":"e_1_3_2_1_36_1","volume-title":"The IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Tuia Devis","year":"2025","unstructured":"Devis Tuia. 2025. MammAlps: A multi-view video behavior monitoring dataset of wild mammals in the Swiss Alps. In The IEEE\/CVF Conference on Computer Vision and Pattern Recognition 2025."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Lucia Vadicamo Rahel Arnold Werner Bailer Fabio Carrara Cathal Gurrin Nico Hezel Xinghan Li Jakub Lokoc Sebastian Lubos Zhixin Ma et al. 2024. Evaluating performance and trends in interactive video retrieval: Insights from the 12th vbs competition. IEEE Access (2024).","DOI":"10.1109\/ACCESS.2024.3405638"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Qiuheng Wang Yukai Shi Jiarong Ou Rui Chen Ke Lin Jiahao Wang Boyuan Jiang Haotian Yang Mingwu Zheng Xin Tao et al. 2024. Koala-36m: A large-scale video dataset improving consistency between fine-grained conditions and video content. arXiv preprint arXiv:2410.08260 (2024).","DOI":"10.1109\/CVPR52734.2025.00789"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"crossref","unstructured":"Yuk-Kwan Wong Ziqiang Zheng Mingzhe Zhang David Suggett and Sai-Kit Yeung. 2024. CoralSCOP-LAT: Labeling and Analyzing Tool for Coral Reef Images with Dense Mask. arXiv:2410.20436 [cs.CV] https:\/\/arxiv.org\/abs\/2410.20436","DOI":"10.1016\/j.ecoinf.2025.103402"},{"key":"e_1_3_2_1_41_1","volume-title":"A bilingual, openworld video text dataset and end-to-end video text spotter with transformer. arXiv preprint arXiv:2112.04888","author":"Wu Weijia","year":"2021","unstructured":"Weijia Wu, Yuanqiang Cai, Debing Zhang, Sibo Wang, Zhuang Li, Jiahong Li, Yejun Tang, and Hong Zhou. 2021. A bilingual, openworld video text dataset and end-to-end video text spotter with transformer. arXiv preprint arXiv:2112.04888 (2021)."},{"key":"e_1_3_2_1_42_1","volume-title":"Chunhua Shen, and Mike Zheng Shou.","author":"Wu Weijia","year":"2024","unstructured":"Weijia Wu, Mingyu Liu, Zeyu Zhu, Xi Xia, Haoen Feng, Wen Wang, Kevin Qinghong Lin, Chunhua Shen, and Mike Zheng Shou. 2024. MovieBench: A Hierarchical Movie Level Dataset for Long Video Generation. arXiv preprint arXiv:2411.15262 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"See Kiong Ng, and Jiashi Feng","author":"Xu Lin","year":"2024","unstructured":"Lin Xu, Yilin Zhao, Daquan Zhou, Zhijie Lin, See Kiong Ng, and Jiashi Feng. 2024. Pllava: Parameter-free llava extension from images to videos for video dense captioning. arXiv preprint arXiv:2404.16994 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"European Conference on Computer Vision. Springer, 239--257","author":"Zheng Ziqiang","year":"2024","unstructured":"Ziqiang Zheng, Yiwei Chen, Huimin Zeng, Tuan-Anh Vu, Binh-Son Hua, and Sai-Kit Yeung. 2024. Marineinst: A foundation model for marine image analysis with instance visual description. In European Conference on Computer Vision. Springer, 239--257."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the European Conference on Computer Vision. Springer.","author":"Zheng Ziqiang","year":"2024","unstructured":"Ziqiang Zheng, Yiwe Chen, Huimin Zeng, Tuan-Anh Vu, Binh-Son Hua, and Sai-Kit Yeung. 2024. MarineInst: A Foundation Model for Marine Image Analysis with Instance Visual Description. In Proceedings of the European Conference on Computer Vision. Springer."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02661"}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3758198","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T19:13:53Z","timestamp":1765307633000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3758198"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":46,"alternative-id":["10.1145\/3746027.3758198","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3758198","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}