{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T16:04:46Z","timestamp":1779984286526,"version":"3.53.1"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2009,10,19]],"date-time":"2009-10-19T00:00:00Z","timestamp":1255910400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2009,10,19]]},"DOI":"10.1145\/1631272.1631277","type":"proceedings-article","created":{"date-parts":[[2009,10,20]],"date-time":"2009-10-20T08:43:40Z","timestamp":1256028220000},"page":"5-14","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":44,"title":["Short-term audio-visual atoms for generic video concept classification"],"prefix":"10.1145","author":[{"given":"Wei","family":"Jiang","sequence":"first","affiliation":[{"name":"Columbia University, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Courtenay","family":"Cotton","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shih-Fu","family":"Chang","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dan","family":"Ellis","sequence":"additional","affiliation":[{"name":"Columbia University, New York, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Alexander","family":"Loui","sequence":"additional","affiliation":[{"name":"Eastman Kodak, Rochester, NY, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2009,10,19]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"CogSys","author":"Anemueller J.","year":"2008","unstructured":"J. Anemueller and et al. Biologically motivated audio-visual cue integration for object categorization. In CogSys, 2008."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2007.383344"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2003.1206512"},{"key":"e_1_3_2_1_4_1","unstructured":"S. Birchfield. KLT: An Implementation of the Kanade-Lucas-Tomasi Feature Tracker. http:\/\/vision.stanford.edu\/birch."},{"key":"e_1_3_2_1_5_1","volume-title":"NIST TRECVID workshop","author":"Chang S.F.","year":"2005","unstructured":"S.F. Chang and et al. Columbia university TRECVID-2005 video search and high-level feature extraction. In NIST TRECVID workshop, Gaithersburg, MD, 2005."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/1290082.1290118"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.5555\/1005332.1016789"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2006.886263"},{"key":"e_1_3_2_1_9_1","first-page":"1","volume-title":"Proc. ICASSP","author":"Chu S.","year":"2008","unstructured":"S. Chu and et al. Environmental sound recognition using MP-based features. in Proc. ICASSP, pages 1--4, 2008."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.177"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/957013.957124"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.946985"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1214\/aos\/1016218223"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2005.239"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2005.199"},{"key":"e_1_3_2_1_16_1","volume-title":"NIPS","author":"Hershey J.","year":"1999","unstructured":"J. Hershey and J. Movellan. Audio-vision: Using audio-visual synchrony to locate sounds. In NIPS, 1999."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1155\/2007\/64506"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2003.1233903"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.5555\/645310.648998"},{"key":"e_1_3_2_1_20_1","unstructured":"R. Gribonval and S. Krstulovic. MPTK the matching pursuit toolkit. http:\/\/mptk.irisa.fr\/"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/1290082.1290117"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000029664.99615.94"},{"key":"e_1_3_2_1_23_1","first-page":"121","volume-title":"Proc. Imaging understanding workshop","author":"Lucas B.D.","year":"1981","unstructured":"B.D. Lucas and T. Kanade. An iterative image registration technique with an application to stereo vision. In Proc. Imaging understanding workshop, pages 121--130, 1981."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/78.258082"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/302528.302753"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-88693-8_39"},{"key":"e_1_3_2_1_27_1","volume-title":"TREC Video Retrieval Evaluation (TRECVID). 2001 --","author":"NIST.","year":"2008","unstructured":"NIST. TREC Video Retrieval Evaluation (TRECVID). 2001 -- 2008. http:\/\/www-nlpir.nist.gov\/projects\/trecvid\/"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2007.366659"},{"key":"e_1_3_2_1_29_1","unstructured":"F. Petitcolas. MPEG for MATLAB. http:\/\/www.petitcolas.net\/fabien\/software\/mpeg"},{"key":"e_1_3_2_1_30_1","first-page":"593","volume-title":"Proc. CVPR","author":"Shi J.","year":"1994","unstructured":"J. Shi and C. Tomasi. Good features to track. In Proc. CVPR, pages 593--600, 1994."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.868677"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1023\/B:VISI.0000004830.93820.78"},{"key":"e_1_3_2_1_33_1","volume-title":"Statistical learning theory","author":"Vapnik V.","year":"1998","unstructured":"V. Vapnik. Statistical learning theory. Wiley-Interscience, New York, 1998."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/11744078_9"},{"key":"e_1_3_2_1_35_1","first-page":"2391","volume-title":"Proc. ICIP","author":"Wu Y.","year":"2004","unstructured":"Y. Wu and et al. Multimodal information fusion for video concept detection. in Proc. ICIP, pages 2391--2394, 2004."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2006.250"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/1291233.1291416"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2008.08.006"},{"key":"e_1_3_2_1_39_1","unstructured":"J.C. Niebles and et al.. Extracting moving people from"}],"event":{"name":"MM09: ACM Multimedia Conference","location":"Beijing China","acronym":"MM09","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 17th ACM international conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1631272.1631277","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/1631272.1631277","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,28]],"date-time":"2026-05-28T15:25:55Z","timestamp":1779981955000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/1631272.1631277"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2009,10,19]]},"references-count":39,"alternative-id":["10.1145\/1631272.1631277","10.1145\/1631272"],"URL":"https:\/\/doi.org\/10.1145\/1631272.1631277","relation":{},"subject":[],"published":{"date-parts":[[2009,10,19]]},"assertion":[{"value":"2009-10-19","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}