{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T01:10:24Z","timestamp":1755825024702,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"name":"The Hong Kong University of Science and Technology (Guangzhou)"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,6,30]]},"DOI":"10.1145\/3731715.3733461","type":"proceedings-article","created":{"date-parts":[[2025,6,25]],"date-time":"2025-06-25T18:29:43Z","timestamp":1750876183000},"page":"1452-1460","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["ViFusion: In-Network Tensor Fusion for Scalable Video Feature Indexing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-4185-6021","authenticated-orcid":false,"given":"Yisu","family":"Wang","sequence":"first","affiliation":[{"name":"Information Hub, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-2996-9703","authenticated-orcid":false,"given":"Yixiang","family":"Zhu","sequence":"additional","affiliation":[{"name":"Information Hub, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9591-4031","authenticated-orcid":false,"given":"Xinjiao","family":"Li","sequence":"additional","affiliation":[{"name":"Information Hub, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0003-8626","authenticated-orcid":false,"given":"Yulong","family":"Zhang","sequence":"additional","affiliation":[{"name":"Information Hub, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-0834-6723","authenticated-orcid":false,"given":"Ruilong","family":"Wu","sequence":"additional","affiliation":[{"name":"Information Hub, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9021-9916","authenticated-orcid":false,"given":"Dirk","family":"Kutscher","sequence":"additional","affiliation":[{"name":"Information Hub, The Hong Kong University of Science and Technology (Guangzhou), Guangzhou, Guangdong, China"}]}],"member":"320","published-online":{"date-parts":[[2025,6,30]]},"reference":[{"unstructured":"Barefoot Tofino Software Behavior Model. 2024. https:\/\/www.barefootnetworks.com\/products\/brief-p4-studio\/.","key":"e_1_3_2_1_1_1"},{"key":"e_1_3_2_1_2_1","volume-title":"TensorFlow: A system for large-scale machine learning. arXiv:1605.08695","author":"Abadi M.","year":"2016","unstructured":"Abadi, M., Barham, P., Chen, J., et al. TensorFlow: A system for large-scale machine learning. arXiv:1605.08695, 2016."},{"key":"e_1_3_2_1_3_1","volume-title":"ViViT: A Video Vision Transformer. arXiv:2103.15691","author":"Arnab A.","year":"2021","unstructured":"Arnab, A., Dehghani, M., Heigold, G., et al. ViViT: A Video Vision Transformer. arXiv:2103.15691, 2021."},{"key":"e_1_3_2_1_4_1","volume-title":"MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding. arXiv:2404.03413","author":"Ataallah K.","year":"2024","unstructured":"Ataallah, K., Shen, X., Abdelrahman, E., et al. MiniGPT4-Video: Advancing Multimodal LLMs for Video Understanding. arXiv:2404.03413, 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"Goldfish: Vision-Language Understanding of Arbitrarily Long Videos. arXiv:2407.12679","author":"Ataallah K.","year":"2024","unstructured":"Ataallah, K., Shen, X., Abdelrahman, E., et al. Goldfish: Vision-Language Understanding of Arbitrarily Long Videos. arXiv:2407.12679, 2024."},{"key":"e_1_3_2_1_6_1","volume-title":"NSDI'17","author":"Crankshaw D.","year":"2017","unstructured":"Crankshaw, D., Wang, X., Zhou, G., et al. Clipper: A low-latency online prediction serving system. In NSDI'17, USENIX, 2017."},{"key":"e_1_3_2_1_7_1","volume-title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948","author":"DeepSeek AI.","year":"2025","unstructured":"DeepSeek-AI. DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning. arXiv:2501.12948, 2025."},{"key":"e_1_3_2_1_8_1","volume-title":"I've Got 99 Problems But FLOPS Ain't One. arXiv:2407.12819","author":"Gherghescu A. M.","year":"2024","unstructured":"Gherghescu, A. M., B\u0102doiu, V-A., Agache, A., et al. I've Got 99 Problems But FLOPS Ain't One. arXiv:2407.12819, 2024."},{"key":"e_1_3_2_1_9_1","volume-title":"Cephalo: Harnessing Heterogeneous GPU Clusters for Training Transformer Models. arXiv:2411.01075","author":"Guo R. B.","year":"2024","unstructured":"Guo, R. B., Anand, U., Chen, A., et al. Cephalo: Harnessing Heterogeneous GPU Clusters for Training Transformer Models. arXiv:2411.01075, 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video Understanding. arXiv:2404.05726","author":"He B.","year":"2024","unstructured":"He, B., Li, H., Jang, Y. K., et al. MA-LMM: Memory-Augmented Large Multimodal Model for Long-Term Video Understanding. arXiv:2404.05726, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"ASM-Loc: Action-aware Segment Modeling. arXiv:2203.15187","author":"He B.","year":"2022","unstructured":"He, B., Yang, X., Kang, L., et al. ASM-Loc: Action-aware Segment Modeling. arXiv:2203.15187, 2022."},{"key":"e_1_3_2_1_12_1","volume-title":"Gloo: Collective Communication Library","author":"Facebook Incubator","year":"2023","unstructured":"Facebook Incubator. Gloo: Collective Communication Library. 2023. https:\/\/github.com\/facebookincubator\/gloo."},{"key":"e_1_3_2_1_13_1","volume-title":"OSDI'20","author":"Jiang Y.","year":"2020","unstructured":"Jiang, Y., Zhu, Y., Lan, C., et al. A Unified Architecture for Accelerating Distributed DNN Training. In OSDI'20, USENIX, 2020."},{"key":"e_1_3_2_1_14_1","volume-title":"A Survey on In-Network Computing","author":"Kianpisheh S.","year":"2023","unstructured":"Kianpisheh, S. and Taleb, T. A Survey on In-Network Computing. IEEE Communications Surveys Tutorials, 2023."},{"key":"e_1_3_2_1_15_1","volume-title":"Efficient Memory Management for LLM Serving with PagedAttention. arXiv:2309.06180","author":"Kwon W.","year":"2023","unstructured":"Kwon, W., Li, Z., Zhuang, S., et al. Efficient Memory Management for LLM Serving with PagedAttention. arXiv:2309.06180, 2023."},{"key":"e_1_3_2_1_16_1","volume-title":"ATP: In-network Aggregation for Multi-tenant Learning. In NSDI'21","author":"Lao C.","year":"2021","unstructured":"Lao, C., Le, Y., Mahajan, K., et al. ATP: In-network Aggregation for Multi-tenant Learning. In NSDI'21, USENIX, 2021."},{"key":"e_1_3_2_1_17_1","volume-title":"Visual Instruction Tuning. arXiv:2304.08485","author":"Liu H.","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q., et al. Visual Instruction Tuning. arXiv:2304.08485, 2023."},{"key":"e_1_3_2_1_18_1","volume-title":"CacheGen: KV Cache Compression for LLM Serving. arXiv:2310.07240","author":"Liu Y.","year":"2024","unstructured":"Liu, Y., Li, H., Cheng, Y., et al. CacheGen: KV Cache Compression for LLM Serving. arXiv:2310.07240, 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"CLIP4Clip: An Empirical Study of CLIP for Video Clip Retrieval. arXiv:2104.08860","author":"Luo H.","year":"2021","unstructured":"Luo, H., Ji, L., Zhong, M., et al. CLIP4Clip: An Empirical Study of CLIP for Video Clip Retrieval. arXiv:2104.08860, 2021."},{"unstructured":"NVIDIA. NCCL: NVIDIA Collective Communications Library. https:\/\/developer.nvidia.com\/nccl.","key":"e_1_3_2_1_20_1"},{"unstructured":"OpenAI. GPT-4 Technical Report. arXiv:2303.08774 2024.","key":"e_1_3_2_1_21_1"},{"unstructured":"OpenAI. Sora: Creating video from text. https:\/\/openai.com\/sora.","key":"e_1_3_2_1_22_1"},{"key":"e_1_3_2_1_23_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke A.","year":"2019","unstructured":"Paszke, A., Gross, S., Massa, F., et al. PyTorch: An Imperative Style, High-Performance DL Library. In NeurIPS 32, 2019."},{"key":"e_1_3_2_1_24_1","first-page":"4195","author":"Peebles W.","year":"2023","unstructured":"Peebles, W., and Xie, S. Scalable Diffusion Models with Transformers. In ICCV 2023, 4195--4205.","journal-title":"Scalable Diffusion Models with Transformers. In"},{"key":"e_1_3_2_1_25_1","volume-title":"Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020","author":"Radford A.","year":"2021","unstructured":"Radford, A., Kim, J. W., Hallacy, C., et al. Learning Transferable Visual Models From Natural Language Supervision. arXiv:2103.00020, 2021."},{"key":"e_1_3_2_1_26_1","volume-title":"DeepSpeed-MoE: Advancing Mixture-of-Experts Inference. arXiv:2201.05596","author":"Rajbhandari S.","year":"2022","unstructured":"Rajbhandari, S., Li, C., Yao, Z., et al. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference. arXiv:2201.05596, 2022."},{"key":"e_1_3_2_1_27_1","volume-title":"Scaling Distributed ML with In-Network Aggregation. In NSDI'21","author":"Sapio A.","year":"2021","unstructured":"Sapio, A., Canini, M., Ho, C-Y., et al. Scaling Distributed ML with In-Network Aggregation. In NSDI'21, USENIX, 2021."},{"key":"e_1_3_2_1_28_1","volume-title":"Horovod: Fast and Easy Distributed DL. arXiv:1802.05799","author":"Sergeev A.","year":"2018","unstructured":"Sergeev, A., and Del Balso, M. Horovod: Fast and Easy Distributed DL. arXiv:1802.05799, 2018."},{"key":"e_1_3_2_1_29_1","volume-title":"Generative Modeling by Estimating Gradients. arXiv:1907.05600","author":"Song Y.","year":"2020","unstructured":"Song, Y., and Ermon, S. Generative Modeling by Estimating Gradients. arXiv:1907.05600, 2020."},{"key":"e_1_3_2_1_30_1","volume-title":"IEEE SPL 31","author":"Sun Y.","year":"2024","unstructured":"Sun, Y., Xu, Y., Xie, Z., et al. GPTSee: Enhancing Moment Retrieval. IEEE SPL 31, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"The Evolution of Multimodal Model Architectures. arXiv:2405.17927","author":"Wadekar S. N.","year":"2024","unstructured":"Wadekar, S. N., Chaurasia, A., Chadha, A., et al. The Evolution of Multimodal Model Architectures. arXiv:2405.17927, 2024."},{"key":"e_1_3_2_1_32_1","volume-title":"RAT - Resilient Allreduce Tree. In APNet '20","author":"Wan X.","year":"2020","unstructured":"Wan, X., Zhang, H., Wang, H., et al. RAT - Resilient Allreduce Tree. In APNet '20, ACM, 2020."},{"volume-title":"Milvus: A Vector Data Management System. In SIGMOD '21","author":"Wang J.","unstructured":"Wang, J., Yi, X., Guo, R., et al. Milvus: A Vector Data Management System. In SIGMOD '21, ACM, 2614--2627.","key":"e_1_3_2_1_33_1"},{"key":"e_1_3_2_1_34_1","volume-title":"Improving Interpretable Embeddings for Ad-hoc Video Search. arXiv:2404.06173","author":"Wu J.","year":"2024","unstructured":"Wu, J., Ngo, C-W., Chan, W-K. Improving Interpretable Embeddings for Ad-hoc Video Search. arXiv:2404.06173, 2024."},{"key":"e_1_3_2_1_35_1","volume-title":"MSR-VTT: A Large Video Description Dataset. In CVPR","author":"Xu J.","year":"2016","unstructured":"Xu, J., Mei, T., Yao, T., et al. MSR-VTT: A Large Video Description Dataset. In CVPR 2016, 5288--5296."},{"volume-title":"SIGCOMM '23","author":"Xu W.","unstructured":"Xu, W., Zhang, Z., Feng, Y., et al. ClickINC: In-network Computing as a Service. In SIGCOMM '23, ACM, 798--815.","key":"e_1_3_2_1_36_1"},{"volume-title":"SIGCOMM '22","author":"Yang M.","unstructured":"Yang, M., Baban, A., Kugel, V., et al. Using Trio: Juniper's Programmable Chipset. In SIGCOMM '22, ACM, 633--648.","key":"e_1_3_2_1_37_1"},{"key":"e_1_3_2_1_38_1","volume-title":"The Dawn of LMMs: Explorations with GPT-4V. arXiv:2309.17421","author":"Yang Z.","year":"2023","unstructured":"Yang, Z., Li, L., Lin, K., et al. The Dawn of LMMs: Explorations with GPT-4V. arXiv:2309.17421, 2023."},{"volume-title":"Enhanced Batch Query Architecture. In CIKM '24","author":"Zhang Q.","unstructured":"Zhang, Q., Teng, Z., Wu, D., et al. Enhanced Batch Query Architecture. In CIKM '24, ACM, 5078--5085.","key":"e_1_3_2_1_39_1"},{"volume-title":"NetRPC: Enabling In-Network Computation in RPCs. In NSDI '23","author":"Zhao B.","unstructured":"Zhao, B., Wu, W., and Xu, W. NetRPC: Enabling In-Network Computation in RPCs. In NSDI '23, USENIX, 199--217.","key":"e_1_3_2_1_40_1"},{"doi-asserted-by":"publisher","key":"e_1_3_2_1_41_1","DOI":"10.1145\/3687230.3687232"},{"key":"e_1_3_2_1_42_1","volume-title":"DistServe: Disaggregating LLM Serving. arXiv:2401.09670","author":"Zhong Y.","year":"2024","unstructured":"Zhong, Y., Liu, S., Chen, J., et al. DistServe: Disaggregating LLM Serving. arXiv:2401.09670, 2024."},{"key":"e_1_3_2_1_43_1","volume-title":"MiniGPT-4: Enhancing Vision-Language Understanding. arXiv:2304.10592","author":"Zhu D.","year":"2023","unstructured":"Zhu, D., Chen, J., Shen, X., et al. MiniGPT-4: Enhancing Vision-Language Understanding. arXiv:2304.10592, 2023."}],"event":{"sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"acronym":"ICMR '25","name":"ICMR '25: International Conference on Multimedia Retrieval","location":"Chicago IL USA"},"container-title":["Proceedings of the 2025 International Conference on Multimedia Retrieval"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731715.3733461","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T04:12:26Z","timestamp":1755749546000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731715.3733461"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,30]]},"references-count":43,"alternative-id":["10.1145\/3731715.3733461","10.1145\/3731715"],"URL":"https:\/\/doi.org\/10.1145\/3731715.3733461","relation":{},"subject":[],"published":{"date-parts":[[2025,6,30]]},"assertion":[{"value":"2025-06-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}