{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T10:23:15Z","timestamp":1777890195001,"version":"3.51.4"},"reference-count":69,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.02115","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"22783-22793","source":"Crossref","is-referenced-by-count":0,"title":["Multi-Modal Multi-Task Unified Embedding Model (M3T-UEM): A Task-Adaptive Representation Learning Framework"],"prefix":"10.1109","author":[{"given":"Rohan","family":"Sharma","sequence":"first","affiliation":[{"name":"Amazon"}]},{"given":"Changyou","family":"Chen","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Feng-Ju","family":"Chang","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Seongjun","family":"Yun","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Xiaohu","family":"Xie","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Rui","family":"Meng","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Dehong","family":"Xu","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Alejandro","family":"Mottini","sequence":"additional","affiliation":[{"name":"Amazon"}]},{"given":"Qingjun","family":"Cui","sequence":"additional","affiliation":[{"name":"Amazon"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Towards zeroshot cross-lingual image retrieval","author":"Aggarwal","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1723"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01600"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2454"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2454"},{"key":"ref7","first-page":"1597","article-title":"A simple framework for contrastive learning of visual representations","volume-title":"International conference on machine learning","author":"Chen","year":"2020"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.925"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.02283"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.202"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01617"},{"key":"ref12","first-page":"215","article-title":"An analysis of single-layer networks in unsupervised feature learning","volume-title":"Proceedings of the Fourteenth International Conference on Artificial Intelligence and Statistics","author":"Coates"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3637528.3671474"},{"key":"ref14","article-title":"Dreamsim: Learning new dimensions of human visual similarity using synthetic data","author":"Fu","year":"2023","journal-title":"arXiv preprint arXiv"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0486"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.163"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01282"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref19","article-title":"Deepspeed-fastgen: High-throughput text generation for 11 ms via mii and deepspeed-inference","author":"Holmes","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1355"},{"key":"ref21","article-title":"Lora: Low-rank adaptation of large language models","author":"Hu","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01108"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v40i7.37427"},{"key":"ref24","volume-title":"Openclip, 2021. If you use this software, please cite it as below","author":"Ilharco"},{"key":"ref25","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","volume-title":"International conference on machine learning","author":"Jia","year":"2021"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00740"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.findings-emnlp.181"},{"key":"ref28","article-title":"Vlm2vec: Training vision-language models for massive multimodal embedding tasks","author":"Jiang","journal-title":"2024. 2, 3, 8"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.215"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00915"},{"key":"ref31","article-title":"Nvembed: Improved techniques for training llms as generalist embedding models","author":"Lee","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0675"},{"key":"ref33","first-page":"12888","article-title":"Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"International conference on machine learning","author":"Li","year":"2022"},{"key":"ref34","article-title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"In-ternational conference on machine learning","author":"Li","year":"2023"},{"key":"ref35","article-title":"MM-EMBED: UNIVERSAL MULTIMODAL RETRIEVAL WITH MULTIMODAL LLMS","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Lin","year":"2025"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.542"},{"key":"ref38","first-page":"36","article-title":"Visual instruction tuning","author":"Liu","year":"2024","journal-title":"Advances in neural information processing systems"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.297"},{"key":"ref40","first-page":"2125","article-title":"Image retrieval on real-life images with pretrained vision-and-language models","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","author":"Liu","year":"2021"},{"key":"ref41","article-title":"Unified-io: A unified model for vision, language, and multi-modal tasks","volume-title":"The Eleventh International Conference on Learning Representations","author":"Lu","year":"2022"},{"key":"ref42","article-title":"Multi-modal generative embedding model","author":"Ma","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref43","volume-title":"dsprites: Disentanglement testing sprites dataset","author":"Matthey","year":"2017"},{"key":"ref44","article-title":"Generative representational instruction tuning","author":"Muennighoff","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.2118\/18761-MS"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.2307\/3318671"},{"key":"ref47","article-title":"Representation learning with contrastive predictive coding","author":"van den Oord","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref48","article-title":"A probability contrastive learning framework for 3d molecular representation learning","volume-title":"The Thirtyeighth Annual Conference on Neural Information Processing Systems","author":"Qin","year":"2024"},{"key":"ref49","article-title":"Unified text-to-image generation and retrieval","author":"Qu","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref50","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford","year":"2021"},{"key":"ref51","article-title":"Contrastive learning with hard negative samples","author":"Robinson","year":"2020","journal-title":"arXiv preprint arXiv"},{"key":"ref52","volume-title":"Advancements in modern recommender systems: Industrial applications in social media, e-commerce, entertainment, and beyond","author":"Sankalp","year":"2024"},{"key":"ref53","article-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021","journal-title":"arXiv preprint arXiv"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref55","article-title":"Auc-cl: A batchsize-robust framework for self-supervised contrastive representation learning","volume-title":"The Twelfth International Conference on Learning Representations","author":"Sharma","year":"2023"},{"key":"ref56","first-page":"3109431116","article-title":"Flexgen: High-throughput generative inference of large language models with a single gpu","volume-title":"International Conference on Machine Learning","author":"Sheng","year":"2023"},{"key":"ref57","first-page":"36","article-title":"Cwcl: Cross-modal transfer with continuously weighted contrastive loss","author":"Sharma Srinivasa","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00517"},{"key":"ref59","article-title":"Text embeddings by weakly-supervised contrastive pretraining","author":"Wang","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73021-4_23"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01115"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"ref63","article-title":"Clip-vip: Adapting pretrained image-text model to video-language representation alignment","author":"Xue","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref65","article-title":"Coca: Contrastive captioners are image-text foundation models","author":"Yu","year":"2022","journal-title":"arXiv preprint arXiv"},{"key":"ref66","first-page":"12310","article-title":"Barlow twins: Self-supervised learning via redundancy reduction","volume-title":"International conference on machine learning","author":"Zbontar","year":"2021"},{"key":"ref67","volume-title":"The visual task adaptation benchmark","author":"Zhai","year":"2019"},{"key":"ref68","article-title":"Magiclens: Self-supervised image retrieval with open-ended instructions","author":"Zhang","year":"2024","journal-title":"arXiv preprint arXiv"},{"key":"ref69","article-title":"Minigpt-4: Enhancing vision-language understanding with advanced large language models","author":"Zhu","year":"2023","journal-title":"arXiv preprint arXiv"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444542.pdf?arnumber=11444542","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T05:30:05Z","timestamp":1777613405000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444542\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":69,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.02115","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}