{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,26]],"date-time":"2025-03-26T16:54:36Z","timestamp":1743008076568,"version":"3.40.3"},"publisher-location":"Cham","reference-count":28,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030317256"},{"type":"electronic","value":"9783030317263"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-31726-3_39","type":"book-chapter","created":{"date-parts":[[2019,10,31]],"date-time":"2019-10-31T00:05:31Z","timestamp":1572480331000},"page":"454-465","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["Deep Voice-Visual Cross-Modal Retrieval with Deep Feature Similarity Learning"],"prefix":"10.1007","author":[{"given":"Yaxiong","family":"Chen","sequence":"first","affiliation":[]},{"given":"Xiaoqiang","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Yachuang","family":"Feng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,10,31]]},"reference":[{"key":"39_CR1","doi-asserted-by":"crossref","unstructured":"Arandjelovi, R., Zisserman, A.: Look, listen and learn. In: Proceedings of ICCV, pp. 609\u2013617 (2017)","DOI":"10.1109\/ICCV.2017.73"},{"issue":"9","key":"39_CR2","doi-asserted-by":"publisher","first-page":"2542","DOI":"10.1109\/TCYB.2017.2742705","volume":"48","author":"G Cao","year":"2018","unstructured":"Cao, G., Iosifidis, A., Chen, K., Gabbouj, M.: Generalized multi-view embedding for visual recognition and cross-modal retrieval. IEEE Trans. Cybern. 48(9), 2542\u20132555 (2018)","journal-title":"IEEE Trans. Cybern."},{"key":"39_CR3","doi-asserted-by":"crossref","unstructured":"Gong, Y., Lazebnik, S.: Iterative quantization: a procrustean approach to learning binary codes. In: Proceedings of CVPR, pp. 817\u2013824 (2011)","DOI":"10.1109\/CVPR.2011.5995432"},{"key":"39_CR4","doi-asserted-by":"crossref","unstructured":"Gu, J., Cai, J., Joty, S., Niu, L., Wang, G.: Look, imagine and match: improving textual-visual cross-modal retrieval with generative models. In: Proceedings of the IEEE International Conference on Computer Vision and Pattern Recognition, pp. 7181\u20137189 (2018)","DOI":"10.1109\/CVPR.2018.00750"},{"key":"39_CR5","unstructured":"Harwath, D.: Unsupervised learning of spoken language with visual context. In: Proceedings of Advances in Neural Information Processing Systems, pp. 1858\u20131866 (2016)"},{"key":"39_CR6","doi-asserted-by":"crossref","unstructured":"Harwath, D., Glass, J.R.: Learning word-like units from joint audio-visual analysis. In: Proceedings of Annual Meeting of the Association for Computational Linguistics, pp. 506\u2013517 (2017)","DOI":"10.18653\/v1\/P17-1047"},{"key":"39_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"659","DOI":"10.1007\/978-3-030-01231-1_40","volume-title":"Computer Vision \u2013 ECCV 2018","author":"D Harwath","year":"2018","unstructured":"Harwath, D., Recasens, A., Sur\u00eds, D., Chuang, G., Torralba, A., Glass, J.: Jointly discovering visual objects and spoken words from raw sensory input. In: Ferrari, V., Hebert, M., Sminchisescu, C., Weiss, Y. (eds.) ECCV 2018. LNCS, vol. 11210, pp. 659\u2013677. Springer, Cham (2018). https:\/\/doi.org\/10.1007\/978-3-030-01231-1_40"},{"key":"39_CR8","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Artif. Intell. Res. 47, 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"39_CR9","unstructured":"Huang, F., Zhang, X., Xu, J., Zhao, Z., Li, Z.: Multimodal learning of social image representation by exploiting social relations. IEEE Trans. Cybern. (2019)"},{"key":"39_CR10","doi-asserted-by":"crossref","unstructured":"Jiang, Q.Y., Li, W.J.: Deep cross-modal hashing. In: Proceedings of CVPR, pp. 3270\u20133278 (2017)","DOI":"10.1109\/CVPR.2017.348"},{"key":"39_CR11","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization (2015)"},{"key":"39_CR12","doi-asserted-by":"crossref","unstructured":"Li, D., Dimitrova, N., Li, M., Sethi, I.K.: Multimedia content processing through cross-modal association. In: Proceedings of the ACM International Conference on Multimedia, pp. 604\u2013611 (2003)","DOI":"10.1145\/957013.957143"},{"issue":"6","key":"39_CR13","doi-asserted-by":"publisher","first-page":"1220","DOI":"10.1109\/TMM.2016.2646219","volume":"19","author":"Z Liang","year":"2017","unstructured":"Liang, Z., Ma, B., Li, G., Huang, Q., Qi, T.: Cross-modal retrieval using multi-ordered discriminative structured subspace learning. IEEE Trans. Multimed. 19(6), 1220\u20131233 (2017)","journal-title":"IEEE Trans. Multimed."},{"issue":"1","key":"39_CR14","doi-asserted-by":"publisher","first-page":"128","DOI":"10.1109\/TMM.2017.2723841","volume":"20","author":"Z Liang","year":"2018","unstructured":"Liang, Z., Ma, B., Li, G., Huang, Q., Qi, T.: Generalized semi-supervised and structured subspace learning for cross-modal retrieval. IEEE Trans. Multimed. 20(1), 128\u2013141 (2018)","journal-title":"IEEE Trans. Multimed."},{"key":"39_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"39_CR16","doi-asserted-by":"crossref","unstructured":"Liu, H., Wang, R., Shan, S., Chen, X.: Deep supervised hashing for fast image retrieval. In: Proceedings of CVPR, pp. 2064\u20132072 (2016)","DOI":"10.1109\/CVPR.2016.227"},{"issue":"1","key":"39_CR17","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1109\/TIP.2017.2755766","volume":"27","author":"X Lu","year":"2018","unstructured":"Lu, X., Chen, Y., Li, X.: Hierarchical recurrent neural hashing for image retrieval with hierarchical convolutional features. IEEE Trans. Image Process. 27(1), 106\u2013120 (2018)","journal-title":"IEEE Trans. Image Process."},{"issue":"1","key":"39_CR18","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1109\/TIP.2018.2863040","volume":"28","author":"D Mandal","year":"2018","unstructured":"Mandal, D., Chaudhury, K.N., Biswas, S.: Generalized semantic preserving hashing for cross-modal retrieval. IEEE Trans. Image Process. 28(1), 102\u2013112 (2018)","journal-title":"IEEE Trans. Image Process."},{"key":"39_CR19","doi-asserted-by":"crossref","unstructured":"Mao, G., Yuan, Y., Lu, X.: Deep cross-modal retrieval for remote sensing image and audio. In: Proceedings of IAPR Workshop on Pattern Recognition in Remote Sensing, pp. 1\u20137 (2018)","DOI":"10.1109\/PRRS.2018.8486338"},{"issue":"12","key":"39_CR20","doi-asserted-by":"publisher","first-page":"4049","DOI":"10.1109\/TCYB.2016.2595620","volume":"47","author":"M Mao","year":"2017","unstructured":"Mao, M., Lu, J., Zhang, G., Zhang, J.: Multirelational social recommendations via multigraph ranking. IEEE Trans. Cybern. 47(12), 4049\u20134061 (2017)","journal-title":"IEEE Trans. Cybern."},{"issue":"1","key":"39_CR21","doi-asserted-by":"publisher","first-page":"52","DOI":"10.1109\/MSP.2018.2868887","volume":"36","author":"M Mueller","year":"2019","unstructured":"Mueller, M., Arzt, A., Balke, S., Dorfer, M., Widmer, G.: Cross-modal music retrieval and applications: an overview of key methodologies. IEEE Signal Process. Mag. 36(1), 52\u201362 (2019)","journal-title":"IEEE Signal Process. Mag."},{"key":"39_CR22","doi-asserted-by":"crossref","unstructured":"Nagrani, A., Albanie, S., Zisserman, A.: Seeing voices and hearing faces: cross-modal biometric matching. In: Proceedings of CVPR (2018)","DOI":"10.1109\/CVPR.2018.00879"},{"key":"39_CR23","unstructured":"Nair, V., Hinton, G.E.: Rectified linear units improve restricted Boltzmann machines. In: Proceedings of ICML, pp. 807\u2013814 (2010)"},{"key":"39_CR24","doi-asserted-by":"crossref","unstructured":"Owens, A., Isola, P., McDermott, J., Torralba, A., Adelson, E.H., Freeman, W.T.: Visually indicated sounds. In: Proceedings of CVPR, pp. 2405\u20132413 (2016)","DOI":"10.1109\/CVPR.2016.264"},{"key":"39_CR25","doi-asserted-by":"publisher","first-page":"22081","DOI":"10.1109\/ACCESS.2017.2761539","volume":"5","author":"A Torfi","year":"2017","unstructured":"Torfi, A., Iranmanesh, S.M., Nasrabadi, N., Dawson, J.: 3D convolutional neural networks for cross audio-visual matching recognition. IEEE Access 5, 22081\u201322091 (2017)","journal-title":"IEEE Access"},{"issue":"2","key":"39_CR26","first-page":"449","volume":"47","author":"Y Wei","year":"2016","unstructured":"Wei, Y., et al.: Cross-modal retrieval with CNN visual features: a new baseline. IEEE Trans. Cybern. 47(2), 449\u2013460 (2016)","journal-title":"IEEE Trans. Cybern."},{"issue":"4","key":"39_CR27","doi-asserted-by":"publisher","first-page":"723","DOI":"10.1109\/TPAMI.2011.170","volume":"34","author":"Y Yang","year":"2012","unstructured":"Yang, Y., Nie, F., Xu, D., Luo, J., Zhuang, Y., Pan, Y.: A multimedia retrieval framework based on semi-supervised ranking and relevance feedback. IEEE Trans. Pattern Anal. Mach. Intell. 34(4), 723\u2013742 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"39_CR28","doi-asserted-by":"crossref","unstructured":"Zhang, H., Zhuang, Y., Wu, F.: Cross-modal correlation learning for clustering on image-audio dataset. In: Proceedings of the ACM International Conference on Multimedia, pp. 273\u2013276 (2007)","DOI":"10.1145\/1291233.1291290"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition and Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-31726-3_39","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:18:42Z","timestamp":1730333922000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-31726-3_39"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030317256","9783030317263"],"references-count":28,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-31726-3_39","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"31 October 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"PRCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Chinese Conference on Pattern Recognition and Computer Vision  (PRCV)","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Xi'an","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2019","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 November 2019","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 November 2019","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ccprcv2019","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.prcv2019.com\/en\/index.html","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"412","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"165","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"40% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}