{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T05:57:35Z","timestamp":1781157455764,"version":"3.54.1"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T00:00:00Z","timestamp":1778630400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,13]],"date-time":"2026-05-13T00:00:00Z","timestamp":1778630400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,5,13]]},"DOI":"10.1109\/fccm68464.2026.00026","type":"proceedings-article","created":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T19:59:45Z","timestamp":1781121585000},"page":"100-108","source":"Crossref","is-referenced-by-count":0,"title":["ReCoVLM: A Reconfigurable FPGA\u2013GPU Co-Design for Edge Vision-Language Inference"],"prefix":"10.1109","author":[{"given":"Jingyu","family":"Wang","sequence":"first","affiliation":[{"name":"Tiangong University,Tianjin,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yongjiang","family":"Xue","sequence":"additional","affiliation":[{"name":"Tiangong University,Tianjin,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kailai","family":"Zhuang","sequence":"additional","affiliation":[{"name":"Tiangong University,Tianjin,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mingze","family":"Sun","sequence":"additional","affiliation":[{"name":"Tiangong University,Tianjin,China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Qingzeng","family":"Song","sequence":"additional","affiliation":[{"name":"Tiangong University,Tianjin,China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. International Conference on Machine Learning (ICML)","author":"Radford"},{"key":"ref2","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. International Conference on Machine Learning (ICML)","author":"Li"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2025.3579032"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.32388\/ob1z2a"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW66978.2025.00173"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/PerComWorkshops65533.2025.00078"},{"key":"ref8","article-title":"SpotVLM: Cloud-edge collaborative real-time VLM based on context transfer","author":"Qian","year":"2025"},{"key":"ref9","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. International Conference on Machine Learning (ICML)","author":"Li"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1516"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2026.3654458"},{"key":"ref12","article-title":"Token merging: Your ViT but faster","author":"Bolya","year":"2022"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/WACV61041.2025.00906"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73004-7_2"},{"key":"ref15","article-title":"Multi-stage vision token dropping: Towards efficient multimodal large language model","author":"Liu","year":"2024"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/FCCM62733.2025.00046"},{"key":"ref17","article-title":"TerEffic: Highly efficient ternary LLM inference on FPGA","author":"Yin","year":"2025"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ISCAS46773.2023.10181988"},{"key":"ref19","article-title":"An efficient FPGA-based accelerator for Swin Transformer","author":"Liu","year":"2023"},{"key":"ref20","first-page":"1135","article-title":"A length adaptive algorithm-hardware co-design of Transformer on FPGA through sparse attention and dynamic pipelining","volume-title":"Proc. ACM\/IEEE Design Automation Conference (DAC)","author":"Peng"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3626202.3637562"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TCSI.2025.3546256"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC62836.2024.10938498"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3656177"},{"key":"ref25","article-title":"LLaVA-OneVision: Easy visual task transfer","author":"Li","year":"2024"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCAD66269.2025.11241002"},{"key":"ref27","article-title":"Qserve: W4A8KV4 quantization and system co-design for efficient LLM serving","volume-title":"Proc. Conference on Machine Learning and Systems (MLSys)","volume":"7","author":"Lin"},{"key":"ref28","first-page":"87","article-title":"AWQ: Activation-aware weight quantization for on-device LLM compression and acceleration","volume-title":"Proc. Conference on Machine Learning and Systems (MLSys)","volume":"6","author":"Lin"},{"key":"ref29","article-title":"StreamingVLM: Real-time understanding for infinite video streams","author":"Xu","year":"2025"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00686"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72658-3_13"},{"key":"ref33","article-title":"MME: A comprehensive evaluation benchmark for multimodal large language models","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS)","author":"Fu"},{"key":"ref34","first-page":"2507","article-title":"Learn to explain: Multimodal reasoning via thought chains for science question answering","volume-title":"Proc. Advances in Neural Information Processing Systems (NeurIPS)","author":"Lu"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00851"},{"key":"ref36","article-title":"MM-Vet: Evaluating large multimodal models for integrated capabilities","author":"Yu","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01843"},{"key":"ref38","article-title":"SparseVLM: Visual token sparsification for efficient vision-language model inference","author":"Zhang","year":"2024"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72643-9_13"}],"event":{"name":"2026 IEEE 34th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)","location":"Atlanta, GA, USA","start":{"date-parts":[[2026,5,13]]},"end":{"date-parts":[[2026,5,16]]}},"container-title":["2026 IEEE 34th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11552597\/11552602\/11552648.pdf?arnumber=11552648","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T05:04:53Z","timestamp":1781154293000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11552648\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,13]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/fccm68464.2026.00026","relation":{},"subject":[],"published":{"date-parts":[[2026,5,13]]}}}