{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T08:40:25Z","timestamp":1737103225244,"version":"3.33.0"},"reference-count":26,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100020950","name":"National Science and Technology Council","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100020950","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100009950","name":"Ministry of Education","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100009950","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10825835","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"479-486","source":"Crossref","is-referenced-by-count":0,"title":["Visual Lifelog Retrieval through Captioning-Enhanced Interpretation"],"prefix":"10.1109","author":[{"given":"Yu-Fei","family":"Shih","sequence":"first","affiliation":[{"name":"National Taiwan University,Department of Computer Science and Information Engineering,Taipei,Taiwan"}]},{"given":"An-Zi","family":"Yen","sequence":"additional","affiliation":[{"name":"National Yang Ming Chiao Tung University,Department of Computer Science,Hsinchu,Taiwan"}]},{"given":"Hen-Hsen","family":"Huang","sequence":"additional","affiliation":[{"name":"Institute of Information Science Academia Sinica,Taipei,Taiwan"}]},{"given":"Hsin-Hsi","family":"Chen","sequence":"additional","affiliation":[{"name":"AI Research Center (AINTU) National Taiwan University,Department of Computer Science and Information Engineering,Taipei,Taiwan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.4108\/icst.pervasivehealth.2013.252128"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i16.17678"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-14344-x"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3459637.3482022"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593101"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593105"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3512729.3533006"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3512729.3533012"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3592573.3593098"},{"key":"ref10","first-page":"14","article-title":"Overview of the ntcir-14 lifelog-3 task","volume-title":"Proceedings of the 14th NTCIR conference","author":"Gurrin"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462874"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3712059"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01727"},{"article-title":"Gpt-4 technical report","year":"2023","author":"Achiam","key":"ref14"},{"article-title":"ImageCLEF 2019: Multimedia retrieval in medicine, lifelogging, security and nature","volume-title":"Experimental IR Meets Multilinguality, Multimodality, and Interaction","author":"lonescu","key":"ref15"},{"article-title":"Learning transferable visual models from natural language supervision","year":"2021","author":"Radford","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01100"},{"article-title":"Data filtering networks","year":"2023","author":"Fang","key":"ref18"},{"article-title":"What matters when building vision-language models?","year":"2024","author":"Laurencon","key":"ref19"},{"article-title":"Llava-next: Improved reasoning, ocr, and world knowledge","year":"2024","author":"Liu","key":"ref20"},{"article-title":"Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model","year":"2024","author":"Dong","key":"ref21"},{"article-title":"Video-llava: Learning united visual representation by alignment before projection","year":"2023","author":"Lin","key":"ref22"},{"article-title":"World model on million-length video and language with blockwise ringattention","year":"2024","author":"Liu","key":"ref23"},{"article-title":"Dinov2: Learning robust visual features without supervision","year":"2024","author":"Oquab","key":"ref24"},{"article-title":"Towards general text embeddings with multi-stage contrastive learning","year":"2023","author":"Li","key":"ref25"},{"article-title":"Bge m3-embedding: Multi-lingual, multi-functionality, multi-granularity text embeddings through self-knowledge distillation","year":"2024","author":"Chen","key":"ref26"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","start":{"date-parts":[[2024,12,15]]},"location":"Washington, DC, USA","end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10825835.pdf?arnumber=10825835","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T08:07:26Z","timestamp":1737101246000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10825835\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":26,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10825835","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}