{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T09:47:50Z","timestamp":1782985670084,"version":"3.54.5"},"publisher-location":"Singapore","reference-count":37,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819981441","type":"print"},{"value":"9789819981458","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,11,27]],"date-time":"2023-11-27T00:00:00Z","timestamp":1701043200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,11,27]],"date-time":"2023-11-27T00:00:00Z","timestamp":1701043200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-981-99-8145-8_44","type":"book-chapter","created":{"date-parts":[[2023,11,26]],"date-time":"2023-11-26T23:02:30Z","timestamp":1701039750000},"page":"579-594","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Self-supervised Multimodal Representation Learning for Product Identification and Retrieval"],"prefix":"10.1007","author":[{"given":"Yiquan","family":"Jiang","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kengte","family":"Liao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shoude","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hongming","family":"Qiao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kefeng","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chengwei","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yinqi","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2023,11,27]]},"reference":[{"issue":"1","key":"44_CR1","doi-asserted-by":"publisher","first-page":"50","DOI":"10.14778\/3421424.3421431","volume":"14","author":"Y Li","year":"2020","unstructured":"Li, Y., Li, J., Suhara, Y., et al.: Deep entity matching with pre-trained language models. Proc. VLDB Endowment 14(1), 50\u201360 (2020)","journal-title":"Proc. VLDB Endowment"},{"key":"44_CR2","doi-asserted-by":"crossref","unstructured":"Shah, K., Kopru, S., Ruvini, J.D.: Neural network based extreme classification and similarity models for product matching. In: Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, vol. 3 (Industry Papers), pp. 8\u201315 (2018)","DOI":"10.18653\/v1\/N18-3002"},{"key":"44_CR3","doi-asserted-by":"publisher","first-page":"136","DOI":"10.1007\/s10791-019-09360-1","volume":"23","author":"J Li","year":"2020","unstructured":"Li, J., Dou, Z., Zhu, Y., et al.: Deep cross-platform product matching in e-commerce. Inf. Retrieval J. 23, 136\u2013158 (2020)","journal-title":"Inf. Retrieval J."},{"issue":"5","key":"44_CR4","doi-asserted-by":"publisher","first-page":"707","DOI":"10.3233\/SW-180300","volume":"9","author":"P Ristoski","year":"2018","unstructured":"Ristoski, P., Petrovski, P., Mika, P., et al.: A machine learning approach for product matching and categorization. Seman. Web 9(5), 707\u2013728 (2018)","journal-title":"Seman. Web"},{"key":"44_CR5","doi-asserted-by":"crossref","unstructured":"Das, N., Joshi, A., Yenigalla, P., Agrwal, G.: MAPS: multimodal attention for product similarity. In: Proceedings of the IEEE\/CVF Winter Conference on Applications of Computer Vision, pp. 3338\u20133346 (2022)","DOI":"10.1109\/WACV51458.2022.00304"},{"key":"44_CR6","doi-asserted-by":"crossref","unstructured":"Petrovski, P., Bryl, V., Bizer, C.: Integrating product data from websites offering microdata markup. In: Proceedings of the 23rd International Conference on World Wide Web, pp. 1299\u20131304 (2014)","DOI":"10.1145\/2567948.2579704"},{"key":"44_CR7","doi-asserted-by":"crossref","unstructured":"K\u00f6pcke, H., Thor, A., Thomas, S., Rahm, E.: Tailoring entity resolution for matching product offers. In: Proceedings of the 15th International Conference on Extending Database Technology, pp. 545\u2013550 (2012)","DOI":"10.1145\/2247596.2247662"},{"key":"44_CR8","doi-asserted-by":"crossref","unstructured":"Melli, G.: Shallow semantic parsing of product offering titles (for better automatic hyperlink insertion) In: Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1670\u20131678 (2014)","DOI":"10.1145\/2623330.2623343"},{"key":"44_CR9","doi-asserted-by":"publisher","unstructured":"Ristoski, P., Mika, P.: Enriching product ads with metadata from HTML annotations. In: Sack, H., Blomqvist, E., d'Aquin, M., Ghidini, C., Ponzetto, S., Lange, C. (eds.) The Semantic Web. Latest Advances and New Domains. ESWC 2016. LSCS, vol. 9678, pp. 151-167. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-34129-3_10","DOI":"10.1007\/978-3-319-34129-3_10"},{"issue":"8","key":"44_CR10","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., Wu, J., Child, R., et al.: Language models are unsupervised multitask learners. OpenAI blog 1(8), 9 (2019)","journal-title":"OpenAI blog"},{"key":"44_CR11","unstructured":"Devlin, J., Chang, M.W., Lee, K.: BERT: pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)"},{"key":"44_CR12","unstructured":"Brunner, U., Stockinger, K.: Entity matching with transformer architectures-a step forward in data integration. In: 23rd International Conference on Extending Database Technology, Copenhagen, 30 March-2 April 2020. OpenProceedings (2020)"},{"key":"44_CR13","doi-asserted-by":"publisher","first-page":"8410","DOI":"10.1109\/TIP.2021.3115658","volume":"30","author":"J Dong","year":"2021","unstructured":"Dong, J., Ma, Z., Mao, X., et al.: Fine-grained fashion similarity prediction by attribute-specific embedding learning. IEEE Trans. Image Process. 30, 8410\u20138425 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"44_CR14","doi-asserted-by":"publisher","first-page":"188","DOI":"10.1016\/j.inffus.2020.06.001","volume":"64","author":"A Zadeh","year":"2020","unstructured":"Zadeh, A., Liang, P.P., Morency, L.P.: Foundations of multimodal co-learning. Inf. Fusion 64, 188\u2013193 (2020)","journal-title":"Inf. Fusion"},{"key":"44_CR15","doi-asserted-by":"crossref","unstructured":"Zadeh, A., Liang, P.P., Mazumder, N., Poria, S., Cambria, E., Morency, L.P.: Memory fusion network for multi-view sequential learning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32, no. 1 (2018)","DOI":"10.1609\/aaai.v32i1.12021"},{"key":"44_CR16","doi-asserted-by":"crossref","unstructured":"Liang, P.P., Liu, Z., Zadeh, A., Morency, L.P.: Multimodal language analysis with recurrent multistage fusion. arXiv preprint arXiv:1808.03920 (2018)","DOI":"10.18653\/v1\/D18-1014"},{"key":"44_CR17","doi-asserted-by":"crossref","unstructured":"Lazaridou, A., Pham, N.T., Baroni, M.: Combining language and vision with a multimodal skip-gram model. arXiv preprint arXiv:1501.02598 (2015)","DOI":"10.3115\/v1\/N15-1016"},{"key":"44_CR18","doi-asserted-by":"crossref","unstructured":"Zablocki, E., Piwowarski. B., Soulier. L., Gallinari, P.: Learning multi-modal word representation grounded in visual context. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 32, no. 1 (2018)","DOI":"10.1609\/aaai.v32i1.11939"},{"key":"44_CR19","unstructured":"Lu, J., Batra, D., Parikh, D.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"44_CR20","unstructured":"Su, W., et al.: Vl-BERT: pre-training of generic visual-linguistic representations. arXiv preprint arXiv:1908.08530 (2019)"},{"key":"44_CR21","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"104","DOI":"10.1007\/978-3-030-58577-8_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Y-C Chen","year":"2020","unstructured":"Chen, Y.-C., et al.: UNITER: UNiversal Image-TExt representation learning. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12375, pp. 104\u2013120. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58577-8_7"},{"key":"44_CR22","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"44_CR23","unstructured":"Jia, C., et al.: Scaling up visual and vision-language representation learning with noisy text supervision. In: International Conference on Machine Learning, pp. 4904\u20134916. PMLR (2021)"},{"key":"44_CR24","first-page":"24206","volume":"34","author":"H Akbari","year":"2021","unstructured":"Akbari, H., Yuan, L., Qian, R., et al.: VATT: transformers for multimodal self-supervised learning from raw video, audio and text. Adv. Neural Inf. Process. Syst. 34, 24206\u201324221 (2021)","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"44_CR25","doi-asserted-by":"crossref","unstructured":"Singh, A., et al.: FLAVA: a foundational language and vision alignment model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15638\u201315650 (2022)","DOI":"10.1109\/CVPR52688.2022.01519"},{"key":"44_CR26","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"44_CR27","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"44_CR28","unstructured":"Lin, Z., et al.: A structured self-attentive sentence embedding. arXiv preprint arXiv:1703.03130 (2017)"},{"key":"44_CR29","doi-asserted-by":"crossref","unstructured":"Collell, G., Zhang, T., Moens, M.F.: Imagined visual representations as multimodal embeddings. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 31, no. 1 (2017)","DOI":"10.1609\/aaai.v31i1.11155"},{"key":"44_CR30","doi-asserted-by":"crossref","unstructured":"Cao, Z., Qin, T., Liu, T.Y., Tsai, M.F., Li, H.: Learning to rank: from pairwise approach to listwise approach. In: Proceedings of the 24th International Conference on Machine Learning, pp. 129\u2013136 (2007)","DOI":"10.1145\/1273496.1273513"},{"key":"44_CR31","doi-asserted-by":"crossref","unstructured":"Gan, Z., Pu, Y., Henao, R., Li, C., He, X., Carin, L.: Learning generic sentence representations using convolutional neural networks. In: Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pp. 2390-2400 (2017)","DOI":"10.18653\/v1\/D17-1254"},{"key":"44_CR32","unstructured":"Mikolov, T., Grave, E., Bojanowski, P., Puhrsch, C., Joulin, A.: Advances in pre-training distributed word representations. In: Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018) (2018)"},{"key":"44_CR33","unstructured":"Yang, A., et al.: Chinese CLIP: contrastive vision-language pretraining in Chinese. arXiv preprint arXiv:2211.01335 (2022)"},{"key":"44_CR34","doi-asserted-by":"crossref","unstructured":"Cui, Y., Che, W., Liu, T., Qin, B., Wang, S., Hu, G.: Revisiting pre-trained models for Chinese natural language processing. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 657\u2013668 (2020)","DOI":"10.18653\/v1\/2020.findings-emnlp.58"},{"issue":"11","key":"44_CR35","doi-asserted-by":"publisher","first-page":"2673","DOI":"10.1109\/78.650093","volume":"45","author":"M Schuster","year":"1997","unstructured":"Schuster, M., Paliwal, K.K.: Bidirectional recurrent neural networks. IEEE Trans. Sig. Process. 45(11), 2673\u20132681 (1997)","journal-title":"IEEE Trans. Sig. Process."},{"key":"44_CR36","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781 (2013)"},{"key":"44_CR37","unstructured":"Chung, J., Gulcehre, C., Cho, K., Bengio, Y.: Empirical evaluation of gated recurrent neural networks on sequence modeling. arXiv preprint arXiv:1412.3555 (2014)"}],"container-title":["Communications in Computer and Information Science","Neural Information Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-99-8145-8_44","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,13]],"date-time":"2024-03-13T19:02:09Z","timestamp":1710356529000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-99-8145-8_44"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,11,27]]},"ISBN":["9789819981441","9789819981458"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-981-99-8145-8_44","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"value":"1865-0929","type":"print"},{"value":"1865-0937","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,11,27]]},"assertion":[{"value":"27 November 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICONIP","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Neural Information Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Changsha","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 November 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 November 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"iconip2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/iconip2023.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"EasyChair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1274","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"650","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"51% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.14","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.46","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}