{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,27]],"date-time":"2025-06-27T12:44:11Z","timestamp":1751028251163,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":18,"publisher":"ACM","license":[{"start":{"date-parts":[[2016,10,1]],"date-time":"2016-10-01T00:00:00Z","timestamp":1475280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Center for Brains, Minds and Machines (CBMM)","award":["CCF-1231216"],"award-info":[{"award-number":["CCF-1231216"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,10]]},"DOI":"10.1145\/2964284.2967212","type":"proceedings-article","created":{"date-parts":[[2016,9,29]],"date-time":"2016-09-29T19:17:32Z","timestamp":1475176652000},"page":"207-211","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":28,"title":["Joint Image-Text Representation by Gaussian Visual-Semantic Embedding"],"prefix":"10.1145","author":[{"given":"Zhou","family":"Ren","sequence":"first","affiliation":[{"name":"University of California, Los Angeles, Los Angeles, CA, USA"}]},{"given":"Hailin","family":"Jin","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, CA, USA"}]},{"given":"Zhe","family":"Lin","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, CA, USA"}]},{"given":"Chen","family":"Fang","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, CA, USA"}]},{"given":"Alan","family":"Yuille","sequence":"additional","affiliation":[{"name":"Adobe Research, San Jose, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2016,10]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_2_1","volume-title":"NIPS","author":"Frome A.","year":"2013","unstructured":"A. Frome , G. Corrado , J. Shlens , S. Bengio , J. Dean , M. Ranzato , and T. Mikolov . Devise: A deep visual-semantic embedding model . In NIPS , 2013 . A. Frome, G. Corrado, J. Shlens, S. Bengio, J. Dean, M. Ranzato, and T. Mikolov. Devise: A deep visual-semantic embedding model. In NIPS, 2013."},{"key":"e_1_3_2_1_3_1","volume-title":"ICLR","author":"Gong Y.","year":"2014","unstructured":"Y. Gong , Y. Jia , T. K. Leung , A. Toshev , and S. Ioffe . Deep convolutional ranking for multilabel image annotation . In ICLR , 2014 . Y. Gong, Y. Jia, T. K. Leung, A. Toshev, and S. Ioffe. Deep convolutional ranking for multilabel image annotation. In ICLR, 2014."},{"key":"e_1_3_2_1_4_1","volume-title":"Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093","author":"Jia Y.","year":"2014","unstructured":"Y. Jia , E. Shelhamer , J. Donahue , S. Karayev , J. Long , R. Girshick , S. Guadarrama , and T. Darrell . Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093 , 2014 . Y. Jia, E. Shelhamer, J. Donahue, S. Karayev, J. Long, R. Girshick, S. Guadarrama, and T. Darrell. Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093, 2014."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_6_1","volume-title":"TACL","author":"Kiros R.","year":"2015","unstructured":"R. Kiros , R. Salakhutdinov , and R. Zemel . Unifying visual-semantic embeddings with multimodal neural language models . In TACL , 2015 . R. Kiros, R. Salakhutdinov, and R. Zemel. Unifying visual-semantic embeddings with multimodal neural language models. In TACL, 2015."},{"key":"e_1_3_2_1_7_1","volume-title":"TACL","author":"Kiros R.","year":"2015","unstructured":"R. Kiros , R. Salakhutdinov , and R. S. Zemel . Unifying visual-semantic embeddings with multimodal neural language models . In TACL , 2015 . R. Kiros, R. Salakhutdinov, and R. S. Zemel. Unifying visual-semantic embeddings with multimodal neural language models. In TACL, 2015."},{"key":"e_1_3_2_1_8_1","volume-title":"ICLR","author":"Mao J.","year":"2015","unstructured":"J. Mao , W. Xu , Y. Yang , J. Wang , Z. Huang , and A. L. Yuille . Deep captioning with multimodal recurrent neural networks (m-rnn) . In ICLR , 2015 . J. Mao, W. Xu, Y. Yang, J. Wang, Z. Huang, and A. L. Yuille. Deep captioning with multimodal recurrent neural networks (m-rnn). In ICLR, 2015."},{"key":"e_1_3_2_1_9_1","volume-title":"NIPS","author":"Mikolov T.","year":"2013","unstructured":"T. Mikolov , I. Sutskever , K. Chen , G. Corrado , and J. Dean . Distributed representations of words and phrases and their compositionality . In NIPS , 2013 . T. Mikolov, I. Sutskever, K. Chen, G. Corrado, and J. Dean. Distributed representations of words and phrases and their compositionality. In NIPS, 2013."},{"key":"e_1_3_2_1_10_1","volume-title":"ICLR","author":"Norouzi M.","year":"2014","unstructured":"M. Norouzi , T. Mikolov , S. Bengio , Y. Singer , J. Shlens , A. Frome , G. Corrado , and J. Dean . Zero-shot learning by convex combination of semantic embeddings . In ICLR , 2014 . M. Norouzi, T. Mikolov, S. Bengio, Y. Singer, J. Shlens, A. Frome, G. Corrado, and J. Dean. Zero-shot learning by convex combination of semantic embeddings. In ICLR, 2014."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"e_1_3_2_1_12_1","volume-title":"Multi-instance visual-semantic embedding. In arXiv preprint arXiv:1512.06963","author":"Ren Z.","year":"2015","unstructured":"Z. Ren , H. Jin , Z. Lin , C. Fang , and A. Yuille . Multi-instance visual-semantic embedding. In arXiv preprint arXiv:1512.06963 , 2015 . Z. Ren, H. Jin, Z. Lin, C. Fang, and A. Yuille. Multi-instance visual-semantic embedding. In arXiv preprint arXiv:1512.06963, 2015."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.287"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1006\/jvci.1999.0413"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_16_1","volume-title":"ICLR","author":"Vilnis L.","year":"2015","unstructured":"L. Vilnis and A. McCallum . Word representations via gaussian embedding . In ICLR , 2015 . L. Vilnis and A. McCallum. Word representations via gaussian embedding. In ICLR, 2015."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_18_1","volume-title":"NIPS","author":"Zhou B.","year":"2014","unstructured":"B. Zhou , A. Lapedriza , J. Xiao , A. Torralba , and A. Oliva . Learning deep features for scene recognition using places database . In NIPS , 2014 . B. Zhou, A. Lapedriza, J. Xiao, A. Torralba, and A. Oliva. Learning deep features for scene recognition using places database. In NIPS, 2014."}],"event":{"name":"MM '16: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Amsterdam The Netherlands","acronym":"MM '16"},"container-title":["Proceedings of the 24th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2964284.2967212","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2964284.2967212","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T03:39:55Z","timestamp":1750217995000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2964284.2967212"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10]]},"references-count":18,"alternative-id":["10.1145\/2964284.2967212","10.1145\/2964284"],"URL":"https:\/\/doi.org\/10.1145\/2964284.2967212","relation":{},"subject":[],"published":{"date-parts":[[2016,10]]},"assertion":[{"value":"2016-10-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}