{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T05:09:30Z","timestamp":1738818570907,"version":"3.37.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,9,18]],"date-time":"2024-09-18T00:00:00Z","timestamp":1726617600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,9,18]],"date-time":"2024-09-18T00:00:00Z","timestamp":1726617600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,9,18]]},"DOI":"10.1109\/cbmi62980.2024.10859218","type":"proceedings-article","created":{"date-parts":[[2025,2,4]],"date-time":"2025-02-04T18:30:52Z","timestamp":1738693852000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["A Comparison of Late-Fusion Training Strategies for Quad-Modal Joint Embeddings"],"prefix":"10.1109","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8784-2519","authenticated-orcid":false,"given":"Domenic Luca","family":"F\u00fcrer","sequence":"first","affiliation":[{"name":"University of Zurich,Department of Informatics,Zurich,Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0128-4602","authenticated-orcid":false,"given":"Abraham","family":"Bernstein","sequence":"additional","affiliation":[{"name":"University of Zurich,Department of Informatics,Zurich,Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5389-9465","authenticated-orcid":false,"given":"Luca","family":"Rossetto","sequence":"additional","affiliation":[{"name":"University of Zurich,Department of Informatics,Zurich,Switzerland"}]}],"member":"263","reference":[{"key":"ref1","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18\u201324 July 2021, Virtual Event, ser. Proceedings of Machine Learning Research","volume":"139","author":"Ramesh"},{"key":"ref2","first-page":"10674","article-title":"Highresolution image synthesis with latent diffusion models","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2022, New Orleans, LA, USA, June 18\u201324, 2022","author":"Rombach"},{"key":"ref3","first-page":"21450","article-title":"Audioldm: Text-to-audio generation with latent diffusion models","volume-title":"International Conference on Machine Learning, ICML 2023, 23\u201329 July 2023, Honolulu, Hawaii, USA, ser. Proceedings of Machine Learning Research","volume":"202","author":"Liu"},{"article-title":"Make-a-video: Text-to-video generation without text-video data","volume-title":"The Eleventh International Conference on Learning Representations, ICLR 2023, Kigali, Rwanda, May 1\u20135, 2023. OpenReview.net","author":"Singer","key":"ref4"},{"issue":"6","key":"ref5","doi-asserted-by":"crossref","first-page":"96","DOI":"10.1109\/MSP.2017.2738401","article-title":"Deep multimodal learning: A survey on recent advances and trends","volume":"34","author":"Ramachandram","year":"2017","journal-title":"IEEE Signal Process. Mag."},{"issue":"3","key":"ref6","doi-asserted-by":"crossref","first-page":"583","DOI":"10.1109\/TCSVT.2015.2400779","article-title":"Semi-supervised cross-media feature learning with unified patch graph regularization","volume":"26","author":"Peng","year":"2016","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-021-02166-7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.541"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/tcsvt.2017.2705068"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350974"},{"key":"ref11","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18\u201324 July 2021, Virtual Event, ser. Proceedings of Machine Learning Research","volume":"139","author":"Radford"},{"key":"ref12","first-page":"13286","article-title":"MMTM: multimodal transfer module for CNN fusion","volume-title":"2020 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, CVPR 2020, Seattle, WA, USA, June 13\u201319, 2020. Computer Vision Foundation","author":"Joze","year":"2020"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/1101149.1101236"},{"key":"ref14","first-page":"7181","article-title":"Look, imagine and match: Improving textual-visual cross-modal retrieval with generative models","volume-title":"2018 IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2018, Salt Lake City, UT, USA, June 18\u201322, 2018. Computer Vision Foundation","author":"Gu"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2016.2598339"},{"key":"ref16","first-page":"12888","article-title":"BLIP: bootstrapping languageimage pre-training for unified vision-language understanding and generation","volume-title":"International Conference on Machine Learning, ICML 2022, 17\u201323 July 2022, Baltimore, Maryland, USA, ser. Proceedings of Machine Learning Research","volume":"162","author":"Li"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/icip.2019.8802922"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2018.2832602"},{"key":"ref19","first-page":"6787","article-title":"Videoclip: Contrastive pretraining for zero-shot video-text understanding","volume-title":"Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, EMNLP 2021, Virtual Event \/ Punta Cana, Dominican Republic, 7\u201311 November, 2021. Association for Computational Linguistics","author":"Xu"},{"key":"ref20","doi-asserted-by":"crossref","first-page":"100741","DOI":"10.1016\/j.websem.2022.100741","article-title":"Towards the web of embeddings: Integrating multiple knowledge graph embedding spaces with fedcoder","volume":"75","author":"Baumgartner","year":"2023","journal-title":"J. Web Semant."},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350906"},{"key":"ref22","article-title":"Hybrid contrastive learning of tri-modal representation for multimodal sentiment analysis","volume-title":"CoRR","volume":"2109.01797","author":"Mai","year":"2021"},{"key":"ref23","article-title":"Onellm: One framework to align all modalities with language","volume-title":"CoRR","volume":"2312.03700","author":"Han","year":"2023"},{"key":"ref24","article-title":"UnIVAL: Unified model for image, video, audio and language tasks","volume-title":"Transactions on Machine Learning Research","author":"Shukor","year":"2023"},{"key":"ref25","doi-asserted-by":"crossref","DOI":"10.18653\/v1\/2024.acl-long.521","article-title":"Anygpt: Unified multimodal llm with discrete sequence modeling","volume-title":"CoRR","author":"Zhan","year":"2024"},{"key":"ref26","first-page":"5998","article-title":"Attention is all you need","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4\u20139, 2017, Long Beach, CA, USA","author":"Vaswani","year":"2017"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/tip.2018.2852503"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/219717.219748"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-27077-2_37"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.90"},{"key":"ref31","first-page":"248","article-title":"Imagenet: A large-scale hierarchical image database","volume-title":"2009 IEEE Computer Society Conference on Computer Vision and Pattern Recognition (CVPR 2009), 20\u201325 June 2009, Miami, Florida, USA. IEEE Computer Society","author":"Deng"},{"key":"ref32","first-page":"16020","article-title":"Movinets: Mobile video networks for efficient video recognition","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition, CVPR 2021, virtual, June 19\u201325, 2021. Computer Vision Foundation","author":"Kondratyuk"},{"key":"ref33","article-title":"A short note on the kinetics-700 human action dataset","volume-title":"CoRR","volume":"1907.06987","author":"Carreira","year":"2019"},{"key":"ref34","first-page":"131","article-title":"CNN architectures for large-scale audio classification","volume-title":"2017 IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2017, New Orleans, LA, USA, March 5\u20139, 2017","author":"Hershey"},{"key":"ref35","article-title":"Youtube-8m: A large-scale video classification benchmark","volume-title":"CoRR","volume":"1609.08675","author":"Abu-El-Haija","year":"2016"},{"key":"ref36","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 27, 2019","volume":"1","author":"Devlin"},{"key":"ref37","article-title":"Bridging nonlinearities and stochastic regularizers with gaussian error linear units","volume-title":"CoRR","volume":"1606.08415","author":"Hendrycks","year":"2016"},{"key":"ref38","first-page":"1708","article-title":"Frozen in time: A joint video and image encoder for end-to-end retrieval","volume-title":"2021 IEEE\/CVF International Conference on Computer Vision, ICCV 2021, Montreal, QC, Canada, October 10\u201317, 2021. IEEE, 2021","author":"Bain"},{"key":"ref39","first-page":"4203","article-title":"Attention-based multimodal fusion for video description","volume-title":"IEEE International Conference on Computer Vision, ICCV 2017, Venice, Italy, October 22\u201329, 2017","author":"Hori"},{"key":"ref40","article-title":"Representation learning with contrastive predictive coding","volume-title":"CoRR","volume":"1807.03748","author":"Van den Oord","year":"2018"}],"event":{"name":"2024 International Conference on Content-Based Multimedia Indexing (CBMI)","start":{"date-parts":[[2024,9,18]]},"location":"Reykjavik, Iceland","end":{"date-parts":[[2024,9,20]]}},"container-title":["2024 International Conference on Content-Based Multimedia Indexing (CBMI)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10858870\/10858871\/10859218.pdf?arnumber=10859218","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,5]],"date-time":"2025-02-05T06:04:30Z","timestamp":1738735470000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10859218\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,18]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/cbmi62980.2024.10859218","relation":{},"subject":[],"published":{"date-parts":[[2024,9,18]]}}}