{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T05:57:52Z","timestamp":1780379872656,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,10,1]],"date-time":"2017-10-01T00:00:00Z","timestamp":1506816000000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"DARPA","award":["FA8750-13-2-0026"],"award-info":[{"award-number":["FA8750-13-2-0026"]}]},{"name":"Google Faculty Research Award"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2016,10]]},"DOI":"10.1145\/2964284.2984066","type":"proceedings-article","created":{"date-parts":[[2016,9,29]],"date-time":"2016-09-29T15:17:32Z","timestamp":1475162252000},"page":"1092-1096","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":100,"title":["Multimodal Video Description"],"prefix":"10.1145","author":[{"given":"Vasili","family":"Ramanishka","sequence":"first","affiliation":[{"name":"University of Massachusetts Lowell, Lowell, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Abir","family":"Das","sequence":"additional","affiliation":[{"name":"University of Massachusetts Lowell, Lowell, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dong Huk","family":"Park","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Subhashini","family":"Venugopalan","sequence":"additional","affiliation":[{"name":"University of Texas at Austin, Austin, TX, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lisa Anne","family":"Hendricks","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[{"name":"University of California, Berkeley, Berkeley, CA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kate","family":"Saenko","sequence":"additional","affiliation":[{"name":"University of Massachusetts Lowell, Lowell, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2016,10]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems. arXiv preprint arXiv:1603.04467","author":"Abadi M.","year":"2016","unstructured":"M. Abadi, A. Agarwal, P. Barham, E. Brevdo, Z. Chen, C. Citro, G. S. Corrado, A. Davis, J. Dean, M. Devin, et al. TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems. arXiv preprint arXiv:1603.04467, 2016."},{"key":"e_1_3_2_1_2_1","volume-title":"International Conference on Learning Representations","author":"Bahdanau D.","year":"2015","unstructured":"D. Bahdanau, K. Cho, and Y. Bengio. Neural Machine Translation by Jointly Learning to Align and Translate. In International Conference on Learning Representations, 2015."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.5555\/1626355.1626389"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSPCS.2008.4813723"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.340"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.5555\/1888089.1888092"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1371\/journal.pone.0144610"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.337"},{"key":"e_1_3_2_1_9_1","volume-title":"Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition","author":"He K.","year":"2016","unstructured":"K. He, X. Zhang, S. Ren, and J. Sun. Deep Residual Learning for Image Recognition. In IEEE Conference on Computer Vision and Pattern Recognition, 2016."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MSP.2012.2205597"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.223"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.5555\/2891460.2891535"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.5555\/2018936.2018962"},{"key":"e_1_3_2_1_16_1","volume-title":"Rouge: A Package for Automatic Evaluation of Summaries. In Association for Computational Linguistics Workshop","volume":"8","author":"Lin C.-Y.","year":"2004","unstructured":"C.-Y. Lin. Rouge: A Package for Automatic Evaluation of Summaries. In Association for Computational Linguistics Workshop, volume 8, 2004."},{"key":"e_1_3_2_1_17_1","volume-title":"Mel Frequency Cepstral Coefficients for Music Modeling. In International Symposium on Music Information Retrieval","author":"Logan B.","year":"2000","unstructured":"B. Logan. Mel Frequency Cepstral Coefficients for Music Modeling. In International Symposium on Music Information Retrieval, 2000."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"e_1_3_2_1_19_1","volume-title":"Glove: Global Vectors for Word Representation. In Conference on Empirical Methods in Natural Language Processing","author":"Pennington J.","year":"2014","unstructured":"J. Pennington, R. Socher, and C. D. Manning. Glove: Global Vectors for Word Representation. In Conference on Empirical Methods in Natural Language Processing, 2014."},{"key":"e_1_3_2_1_20_1","volume-title":"The Long-Short Story of Movie Description. In German Conference on Pattern Recognition","author":"Rohrbach A.","year":"2015","unstructured":"A. Rohrbach, M. Rohrbach, and B. Schiele. The Long-Short Story of Movie Description. In German Conference on Pattern Recognition, 2015."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969033.2969173"},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Computational Linguistics","author":"Thomason J.","year":"2014","unstructured":"J. Thomason, S. Venugopalan, S. Guadarrama, K. Saenko, and R. J. Mooney. Integrating Language and Vision to Generate Natural Language Descriptions of Videos in the Wild. In International Conference on Computational Linguistics, 2014."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.510"},{"key":"e_1_3_2_1_24_1","first-page":"2579","article-title":"Visualizing Data using t-SNE","volume":"9","author":"van der Maaten L.","year":"2008","unstructured":"L. van der Maaten and G. Hinton. Visualizing Data using t-SNE. Journal of Machine Learning Research, 9:2579--2605, 2008.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_25_1","volume-title":"Cider: Consensus-based Image Description Evaluation. In IEEE Conference on Computer Vision and Pattern Recognition","author":"Vedantam R.","year":"2015","unstructured":"R. Vedantam, L. C. Zitnick, and D. Parikh. Cider: Consensus-based Image Description Evaluation. In IEEE Conference on Computer Vision and Pattern Recognition, 2015."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.515"},{"key":"e_1_3_2_1_27_1","volume-title":"MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In IEEE Conference on Computer Vision and Pattern Recognition","author":"Xu J.","year":"2016","unstructured":"J. Xu, T. Mei, T. Yao, and Y. Rui. MSR-VTT: A Large Video Description Dataset for Bridging Video and Language. In IEEE Conference on Computer Vision and Pattern Recognition, 2016."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2010.2050411"}],"event":{"name":"MM '16: ACM Multimedia Conference","location":"Amsterdam The Netherlands","acronym":"MM '16","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 24th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2964284.2984066","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2964284.2984066","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2964284.2984066","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,18]],"date-time":"2025-11-18T09:28:24Z","timestamp":1763458104000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2964284.2984066"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,10]]},"references-count":28,"alternative-id":["10.1145\/2964284.2984066","10.1145\/2964284"],"URL":"https:\/\/doi.org\/10.1145\/2964284.2984066","relation":{},"subject":[],"published":{"date-parts":[[2016,10]]},"assertion":[{"value":"2016-10-01","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}