{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,5]],"date-time":"2025-10-05T19:57:00Z","timestamp":1759694220276,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,10,23]],"date-time":"2017-10-23T00:00:00Z","timestamp":1508716800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Shenzhen Key Laboratory for Intelligent Multimedia and Virtual Reality","award":["ZDSYS201703031405467"],"award-info":[{"award-number":["ZDSYS201703031405467"]}]},{"name":"Guangdong Science and Technology Project","award":["2014B010117007"],"award-info":[{"award-number":["2014B010117007"]}]},{"name":"Shenzhen Peacock Plan","award":["20130408183003656"],"award-info":[{"award-number":["20130408183003656"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,10,23]]},"DOI":"10.1145\/3123266.3123369","type":"proceedings-article","created":{"date-parts":[[2017,10,20]],"date-time":"2017-10-20T13:04:26Z","timestamp":1508504666000},"page":"1698-1706","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":14,"title":["Cross-media Retrieval by Learning Rich Semantic Embeddings of Multimedia"],"prefix":"10.1145","author":[{"given":"Mengdi","family":"Fan","sequence":"first","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"given":"Wenmin","family":"Wang","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"given":"Peilei","family":"Dong","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"given":"Liang","family":"Han","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"given":"Ronggang","family":"Wang","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]},{"given":"Ge","family":"Li","sequence":"additional","affiliation":[{"name":"Peking University, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2017,10,23]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578728"},{"key":"e_1_3_2_1_2_1","unstructured":"D. M. Blei A. Y. Ng and M. I. Jordan. 2003. Latent dirichlet allocation. Journal of Machine Learning Research (2003).   D. M. Blei A. Y. Ng and M. I. Jordan. 2003. Latent dirichlet allocation. Journal of Machine Learning Research (2003)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Y. Chen L. Wang W. Wang and Z. Zhang. 2012. Continuum regression for cross-modal multimedia retrieval. In ICIP.  Y. Chen L. Wang W. Wang and Z. Zhang. 2012. Continuum regression for cross-modal multimedia retrieval. In ICIP.","DOI":"10.1109\/ICIP.2012.6467268"},{"key":"e_1_3_2_1_4_1","volume-title":"Extracting visual patterns from deep learning representations. Eprint Arxiv","author":"Dario G.","year":"2015","unstructured":"G. Dario , B. Javier , C. Ulises , and others. 2015. Extracting visual patterns from deep learning representations. Eprint Arxiv ( 2015 ). G. Dario, B. Javier, C. Ulises, and others. 2015. Extracting visual patterns from deep learning representations. Eprint Arxiv (2015)."},{"key":"e_1_3_2_1_5_1","unstructured":"J. Dong X. Li and C. Snoek. 2016. Word2VisualVec: Cross- media retrieval by visual feature prediction. Eprint Arxiv (2016).  J. Dong X. Li and C. Snoek. 2016. Word2VisualVec: Cross- media retrieval by visual feature prediction. Eprint Arxiv (2016)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2647868.2654902"},{"key":"e_1_3_2_1_7_1","volume-title":"Devise: A deep visual-semantic embedding model. In NIPS.","author":"Frome A.","year":"2013","unstructured":"A. Frome , G. Corrado , J. Shlens , and others. 2013 . Devise: A deep visual-semantic embedding model. In NIPS. A. Frome, G. Corrado, J. Shlens, and others. 2013. Devise: A deep visual-semantic embedding model. In NIPS."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2671188.2749403"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"crossref","unstructured":"S. Hwang and K. Grauman. 2010. Accounting for the relative importance of objects in image retrieval. In BMVC.  S. Hwang and K. Grauman. 2010. Accounting for the relative importance of objects in image retrieval. In BMVC.","DOI":"10.5244\/C.24.58"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2011.190"},{"key":"e_1_3_2_1_11_1","unstructured":"D. W. Jacobs H. Daume A. Kumar and A. Sharma. 2012. Generalized multiview analysis: A discriminative latent space. In CVPR.  D. W. Jacobs H. Daume A. Kumar and A. Sharma. 2012. Generalized multiview analysis: A discriminative latent space. In CVPR."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"A. Karpathy and L. Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR.  A. Karpathy and L. Fei-Fei. 2015. Deep visual-semantic alignments for generating image descriptions. In CVPR.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"e_1_3_2_1_13_1","unstructured":"A. Karpathy A. Joulin and L. Fei-Fei. 2014. Deep fragment embeddings for bidirectional image sentence mapping. In NIPS.   A. Karpathy A. Joulin and L. Fei-Fei. 2014. Deep fragment embeddings for bidirectional image sentence mapping. In NIPS."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"Benjamin Klein Guy Lev Gil Sadeh and Lior Wolf. 2015. Associating neural word embeddings with deep image representations using Fisher Vectors. In CVPR.  Benjamin Klein Guy Lev Gil Sadeh and Lior Wolf. 2015. Associating neural word embeddings with deep image representations using Fisher Vectors. In CVPR.","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2911451.2911527"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"T. Lin M. Maire S. Belongie and others. 2014. Microsoft CO-CO: Common objects in context. In ECCV.  T. Lin M. Maire S. Belongie and others. 2014. Microsoft CO-CO: Common objects in context. In ECCV.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.301"},{"key":"e_1_3_2_1_18_1","volume-title":"Zero-shot learning by convex combination of semantic embeddings. Eprint Arxiv","author":"Norouzi M.","year":"2014","unstructured":"M. Norouzi , T. Mikolov , S. Bengio , and others. 2014. Zero-shot learning by convex combination of semantic embeddings. Eprint Arxiv ( 2014 ). M. Norouzi, T. Mikolov, S. Bengio, and others. 2014. Zero-shot learning by convex combination of semantic embeddings. Eprint Arxiv (2014)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"J. Palmer P. Verghese and M. Pavel. 2000. The psychophysics of visual search. Vision Research (2000).  J. Palmer P. Verghese and M. Pavel. 2000. The psychophysics of visual search. Vision Research (2000).","DOI":"10.1016\/S0042-6989(99)00244-8"},{"key":"e_1_3_2_1_20_1","volume":"201","author":"Peng Y.","unstructured":"Y. Peng , X. Huang , and J. Qi. 201 6. Cross-media shared repre- sentation by hierarchical learning with multiple deep networks. In IJCAI. Y. Peng, X. Huang, and J. Qi. 2016. Cross-media shared repre- sentation by hierarchical learning with multiple deep networks. In IJCAI.","journal-title":"J. Qi."},{"key":"e_1_3_2_1_21_1","volume-title":"On the role of correlation and abstraction in cross-modal multimedia retrieval","author":"Pereira J.","year":"2013","unstructured":"J. Pereira , E. Coviello , G. Doyle , and others. 2013. On the role of correlation and abstraction in cross-modal multimedia retrieval . IEEE Transactions on Software Engineering( 2013 ). J. Pereira, E. Coviello, G. Doyle, and others. 2013. On the role of correlation and abstraction in cross-modal multimedia retrieval. IEEE Transactions on Software Engineering(2013)."},{"key":"e_1_3_2_1_22_1","volume":"201","author":"Rashtchian C.","unstructured":"C. Rashtchian , P. Young , M. Hodosh , and J. Hockenmaier. 201 0. Collecting image annotations using Amazon's Mechanical Turk. In NAACL Workshop. C. Rashtchian, P. Young, M. Hodosh, and J. Hockenmaier. 2010. Collecting image annotations using Amazon's Mechanical Turk. In NAACL Workshop.","journal-title":"J. Hockenmaier."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/1873951.1873987"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2014.131"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_26_1","unstructured":"N. Srivastava and R. Salakhutdinov. 2012. Multimodal learning with deep boltzmann machines. NIPS (2012).   N. Srivastava and R. Salakhutdinov. 2012. Multimodal learning with deep boltzmann machines. NIPS (2012)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.298"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"C. Szegedy V. Vanhoucke S. Ioffe and others. 2015. Rethinking the inception architecture for computer vision. Eprint Arxiv(2015).  C. Szegedy V. Vanhoucke S. Ioffe and others. 2015. Rethinking the inception architecture for computer vision. Eprint Arxiv(2015).","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"O. Vinyals A. Toshev S. Bengio and others. 2015. Show and tell: A neural image caption generator. In CVPR.  O. Vinyals A. Toshev S. Bengio and others. 2015. Show and tell: A neural image caption generator. In CVPR.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICTAI.2015.45"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2671188.2749341"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2505311"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.261"},{"key":"e_1_3_2_1_35_1","volume-title":"Learning Deep Structure-Preserving Image-Text Embeddings. CVPR","author":"Wang Liwei","year":"2015","unstructured":"Liwei Wang , Yin Li , and Svetlana Lazebnik . 2015. Learning Deep Structure-Preserving Image-Text Embeddings. CVPR ( 2015 ). Liwei Wang, Yin Li, and Svetlana Lazebnik. 2015. Learning Deep Structure-Preserving Image-Text Embeddings. CVPR (2015)."},{"key":"e_1_3_2_1_36_1","unstructured":"X. Wang W. Ma and X. Li. 2004. Data-driven approach for bridging the cognitive gap in image retrieval. In ICME.  X. Wang W. Ma and X. Li. 2004. Data-driven approach for bridging the cognitive gap in image retrieval. In ICME."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Y. Wei Y. Zhao C. Lu and S. Wei. 2016. Cross-modal retrieval with CNN visual features: A new baseline. IEEE Transactions on Cybernetics(2016).  Y. Wei Y. Zhao C. Lu and S. Wei. 2016. Cross-modal retrieval with CNN visual features: A new baseline. IEEE Transactions on Cybernetics(2016).","DOI":"10.1109\/TCYB.2016.2519449"}],"event":{"name":"MM '17: ACM Multimedia Conference","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Mountain View California USA","acronym":"MM '17"},"container-title":["Proceedings of the 25th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3123266.3123369","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3123266.3123369","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:14:03Z","timestamp":1750212843000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3123266.3123369"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,10,23]]},"references-count":37,"alternative-id":["10.1145\/3123266.3123369","10.1145\/3123266"],"URL":"https:\/\/doi.org\/10.1145\/3123266.3123369","relation":{},"subject":[],"published":{"date-parts":[[2017,10,23]]},"assertion":[{"value":"2017-10-23","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}