{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,1,18]],"date-time":"2025-01-18T05:06:52Z","timestamp":1737176812434,"version":"3.33.0"},"reference-count":47,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,15]],"date-time":"2024-12-15T00:00:00Z","timestamp":1734220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,12,15]]},"DOI":"10.1109\/bigdata62323.2024.10825297","type":"proceedings-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T18:31:23Z","timestamp":1737052283000},"page":"2549-2557","source":"Crossref","is-referenced-by-count":0,"title":["Beyond Essentials: Nuanced and Diverse Text-to-video Retrieval"],"prefix":"10.1109","author":[{"given":"Yuchen","family":"Yang","sequence":"first","affiliation":[{"name":"EPFL,Laboratory of Experimental Museology,Lausanne,Switzerland"}]}],"member":"263","reference":[{"doi-asserted-by":"publisher","key":"ref1","DOI":"10.1007\/s13735-023-00267-8"},{"doi-asserted-by":"publisher","key":"ref2","DOI":"10.1007\/s10462-021-10104-1"},{"doi-asserted-by":"publisher","key":"ref3","DOI":"10.1109\/CVPR.2016.571"},{"year":"2022","author":"Li","article-title":"Taking an emotional look at video paragraph captioning","key":"ref4"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.1109\/CVPR46437.2021.00365"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1016\/j.neucom.2022.07.028"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1109\/CVPR52688.2022.00495"},{"doi-asserted-by":"publisher","key":"ref8","DOI":"10.1007\/978-3-030-58548-8_13"},{"doi-asserted-by":"publisher","key":"ref9","DOI":"10.1109\/CVPRW53098.2021.00374"},{"doi-asserted-by":"publisher","key":"ref10","DOI":"10.1007\/978-3-031-73013-9_23"},{"year":"2023","author":"Chen","article-title":"Valor: Vision-audio-language omni-perception pretraining model and dataset","key":"ref11"},{"doi-asserted-by":"publisher","key":"ref12","DOI":"10.1145\/3627167"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1109\/TPAMI.2021.3059295"},{"doi-asserted-by":"publisher","key":"ref14","DOI":"10.1109\/TCSVT.2022.3150959"},{"doi-asserted-by":"publisher","key":"ref15","DOI":"10.1109\/CVPR46437.2021.00504"},{"year":"2021","author":"Fang","article-title":"Clip2video: Mastering video-text retrieval via image clip","key":"ref16"},{"doi-asserted-by":"publisher","key":"ref17","DOI":"10.1109\/CVPR52729.2023.01031"},{"doi-asserted-by":"publisher","key":"ref18","DOI":"10.3390\/jimaging7050076"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1109\/ICCV.2019.00272"},{"doi-asserted-by":"publisher","key":"ref20","DOI":"10.1145\/3206025.3206064"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.1007\/978-3-030-01234-2_29"},{"year":"2020","author":"Dosovitskiy","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"ref22"},{"key":"ref23","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"year":"2018","author":"Miech","article-title":"Learning a text-video embedding from incomplete and heterogeneous data","key":"ref24"},{"year":"2019","author":"Liu","article-title":"Use what you have: Video retrieval using representations from collaborative experts","key":"ref25"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1109\/CVPR52688.2022.01939"},{"doi-asserted-by":"publisher","key":"ref27","DOI":"10.1007\/978-3-031-19830-4_24"},{"doi-asserted-by":"publisher","key":"ref28","DOI":"10.1109\/ICCV51070.2023.01107"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1016\/j.cviu.2022.103581"},{"key":"ref30","first-page":"72 842","article-title":"Vast: A vision-audio-subtitle-text omni-modality foundation model and dataset","volume":"36","author":"Chen","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"doi-asserted-by":"publisher","key":"ref31","DOI":"10.1007\/BF01238023"},{"volume-title":"Story and Discourse - Narrative Structure in Fiction and Film.","year":"1978","author":"Chatman","key":"ref32"},{"year":"2023","author":"Wang","article-title":"Internvid: A large-scale video-text dataset for multi-modal understanding and generation","key":"ref33"},{"key":"ref34","first-page":"28 492","article-title":"Robust speech recognition via large-scale weak supervision","volume-title":"International conference on machine learning","author":"Radford"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1177\/0963947013511723"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.1075\/ssol.2.1.03gre"},{"key":"ref37","doi-asserted-by":"crossref","DOI":"10.1038\/s42256-020-00280-0","article-title":"Estimation of continuous valence and arousal levels from faces in naturalistic conditions","volume-title":"Nature Machine Intelligence","author":"Toisoul","year":"2021"},{"issue":"3","key":"ref38","first-page":"6","article-title":"Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality","volume":"2","author":"Chiang","year":"2023"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.48550\/arXiv.2102.05095"},{"doi-asserted-by":"publisher","key":"ref40","DOI":"10.21437\/Interspeech.2021-698"},{"year":"2018","author":"Oord","article-title":"Representation learning with contrastive predictive coding","key":"ref41"},{"year":"2024","author":"Cheng","article-title":"Videollama 2: Advancing spatial-temporal modeling and audio understanding in video-llms","key":"ref42"},{"doi-asserted-by":"publisher","key":"ref43","DOI":"10.1109\/CVPR.2019.00432"},{"key":"ref44","first-page":"10 347","article-title":"Training data-efficient image transformers & distillation through attention","volume-title":"International conference on machine learning","author":"Touvron"},{"key":"ref45","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems"},{"year":"2014","author":"Kingma","article-title":"Adam: A method for stochastic optimization","key":"ref46"},{"doi-asserted-by":"publisher","key":"ref47","DOI":"10.1145\/3404835.3463257"}],"event":{"name":"2024 IEEE International Conference on Big Data (BigData)","start":{"date-parts":[[2024,12,15]]},"location":"Washington, DC, USA","end":{"date-parts":[[2024,12,18]]}},"container-title":["2024 IEEE International Conference on Big Data (BigData)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10824975\/10824942\/10825297.pdf?arnumber=10825297","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,17]],"date-time":"2025-01-17T07:46:09Z","timestamp":1737099969000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10825297\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,15]]},"references-count":47,"URL":"https:\/\/doi.org\/10.1109\/bigdata62323.2024.10825297","relation":{},"subject":[],"published":{"date-parts":[[2024,12,15]]}}}