{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:20:08Z","timestamp":1775578808036,"version":"3.50.1"},"reference-count":79,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62032022"],"award-info":[{"award-number":["62032022"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61929104"],"award-info":[{"award-number":["61929104"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62027827"],"award-info":[{"award-number":["62027827"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61972375"],"award-info":[{"award-number":["61972375"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Scientific Research Program of Beijing Municipal Education Commission","award":["KZ201911417048"],"award-info":[{"award-number":["KZ201911417048"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/tmm.2023.3330047","type":"journal-article","created":{"date-parts":[[2023,11,3]],"date-time":"2023-11-03T18:08:57Z","timestamp":1699034937000},"page":"2795-2808","source":"Crossref","is-referenced-by-count":32,"title":["FoodSAM: Any Food Segmentation"],"prefix":"10.1109","volume":"27","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3624-5926","authenticated-orcid":false,"given":"Xing","family":"Lan","sequence":"first","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9202-0947","authenticated-orcid":false,"given":"Jiayi","family":"Lyu","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9258-521X","authenticated-orcid":false,"given":"Hanyu","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-3714-1650","authenticated-orcid":false,"given":"Kun","family":"Dong","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-1732-8627","authenticated-orcid":false,"given":"Zehai","family":"Niu","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5926-8721","authenticated-orcid":false,"given":"Yi","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9460-802X","authenticated-orcid":false,"given":"Jian","family":"Xue","sequence":"additional","affiliation":[{"name":"University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Huggingface\u2019s transformers: State-of-the-art natural language processing","volume":"abs\/1910.03771","author":"Wolf","year":"2019","journal-title":"CoRR"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-5004"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-020-1647-3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/j.lindif.2023.102274"},{"key":"ref5","article-title":"Emergent abilities of large language models","volume":"2022","author":"Wei","year":"2022","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref6","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wei","year":"2022"},{"key":"ref7","article-title":"Segment anything","volume":"abs\/2304.02643","author":"Kirillov","year":"2023","journal-title":"CoRR"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3600095"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1111\/1541-4337.12492"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/j.crfs.2021.03.009"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475201"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IWSSIP48289.2020.9145130"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2023.3238524"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00742"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-68821-9_51"},{"key":"ref16","article-title":"Sparks of artificial general intelligence: Early experiments with GPT-4","volume":"abs\/2303.12712","author":"Bubeck","year":"2023","journal-title":"CoRR"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/s11023-020-09548-1"},{"key":"ref18","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Brown","year":"2020"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403237"},{"key":"ref20","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref21","article-title":"Supervision exists everywhere: A data efficient contrastive language-image pre-training paradigm","volume-title":"Proc. 10th Int. Conf. Learn Representations","author":"Li","year":"2022"},{"key":"ref22","article-title":"On the opportunities and risks of foundation models","volume":"abs\/2108.07258","author":"Bommasani","year":"2021","journal-title":"CoRR"},{"key":"ref23","article-title":"Florence: A new foundation model for computer vision","volume":"abs\/2111.11432","author":"Yuan","year":"2021","journal-title":"CoRR"},{"key":"ref24","article-title":"Llama: Open and efficient foundation language models","volume":"abs\/2302.13971","author":"Touvron","year":"2023","journal-title":"CoRR"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1016\/0031-3203(93)90135-J"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1016\/S0031-3203(00)00149-7"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2009.03.008"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1006\/gmip.1998.0480"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995323"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1016\/S0734-189X(85)90153-7"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3059968"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-017-0141-z"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/WACV.2018.00163"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2019.11.118"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00326"},{"key":"ref36","article-title":"OCNet: Object context network for scene parsing","volume":"abs\/1809.00916","author":"Yuan","year":"2018","journal-title":"CoRR"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00681"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s13735-020-00195-x"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00925"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00963"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00902"},{"key":"ref43","article-title":"Panoptic segmentation: A review","volume":"abs\/2111.10250","author":"Elharrouss","year":"2021","journal-title":"CoRR"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00695"},{"key":"ref46","article-title":"Gpt understands, too","volume":"abs\/2103.10385","author":"Liu","year":"2021","journal-title":"CoRR"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i7.20729"},{"issue":"5","key":"ref48","article-title":"Applications of knowledge graphs for food science and industry","volume-title":"Patterns","volume":"3","author":"Min","year":"2022"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3237871"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3414031"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW54120.2021.00264"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00069"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/SEAI55746.2022.9832133"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3512527.3531426"},{"key":"ref55","article-title":"Transferring knowledge for food image segmentation using transformers and convolutions","volume":"abs\/2306.09203","author":"Sinha","year":"2023","journal-title":"CoRR"},{"key":"ref56","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","volume-title":"Proc. 9th Int. Conf. Learn. Representations","author":"Dosovitskiy","year":"2021"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref58","first-page":"5998","article-title":"Attention is all you need","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Vaswani","year":"2017"},{"key":"ref59","article-title":"Recognize anything: A strong image tagging model","volume":"abs\/2306.03514","author":"Zhang","year":"2023","journal-title":"CoRR"},{"key":"ref60","article-title":"Segment everything everywhere all at once","volume":"abs\/2304.06718","author":"Zou","year":"2023","journal-title":"CoRR"},{"key":"ref61","article-title":"Semantic segment anything","author":"Chen","year":"2023"},{"key":"ref62","article-title":"Fast segment anything","volume":"abs\/2306.12156","author":"Zhao","year":"2023","journal-title":"CoRR"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/1015706.1015720"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3347448.3357162"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1802.02611"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CCIS53392.2021.9754670"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/s00530-023-01088-9"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.3390\/s21227504"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1016\/j.compeleceng.2022.108380"},{"key":"ref71","article-title":"What a MESS: Multi-domain evaluation of zero-shot semantic segmentation","volume":"abs\/2306.15521","author":"Blumenstiel","year":"2023","journal-title":"CoRR"},{"key":"ref72","first-page":"736","article-title":"A simple baseline for open-vocabulary semantic segmentation with pre-trained vision-language model","volume-title":"Proc. Comput. Vis. ECCV 17th Eur. Conf.","volume":"13689","author":"Xu","year":"2022"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01129"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01451"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00288"},{"key":"ref76","article-title":"A simple framework for open-vocabulary segmentation and detection","volume":"abs\/2303.08131","author":"Zhang","year":"2023","journal-title":"CoRR"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"ref78","article-title":"CAT-SEG: Cost aggregation for open-vocabulary semantic segmentation","volume":"abs\/2303.11797","author":"Cho","year":"2023","journal-title":"CoRR"},{"key":"ref79","article-title":"Grounding DINO: Marrying DINO with grounded pre-training for open-set object detection","volume":"abs\/2303.05499","author":"Liu","year":"2023","journal-title":"CoRR"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/10844992\/10306316.pdf?arnumber=10306316","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T04:31:13Z","timestamp":1748320273000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10306316\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":79,"URL":"https:\/\/doi.org\/10.1109\/tmm.2023.3330047","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}