{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,26]],"date-time":"2025-05-26T19:25:49Z","timestamp":1748287549325,"version":"3.40.3"},"publisher-location":"Cham","reference-count":42,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031533105"},{"type":"electronic","value":"9783031533112"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-53311-2_30","type":"book-chapter","created":{"date-parts":[[2024,1,27]],"date-time":"2024-01-27T21:37:36Z","timestamp":1706391456000},"page":"410-424","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Multi-modal Video Topic Segmentation with\u00a0Dual-Contrastive Domain Adaptation"],"prefix":"10.1007","author":[{"given":"Linzi","family":"Xing","sequence":"first","affiliation":[]},{"given":"Quan","family":"Tran","sequence":"additional","affiliation":[]},{"given":"Fabian","family":"Caba","sequence":"additional","affiliation":[]},{"given":"Franck","family":"Dernoncourt","sequence":"additional","affiliation":[]},{"given":"Seunghyun","family":"Yoon","sequence":"additional","affiliation":[]},{"given":"Zhaowen","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Trung","family":"Bui","sequence":"additional","affiliation":[]},{"given":"Giuseppe","family":"Carenini","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,28]]},"reference":[{"key":"30_CR1","doi-asserted-by":"crossref","unstructured":"Alam, T., Khan, A., Alam, F.: Punctuation restoration using transformer models for high-and low-resource languages. In: Proceedings of the Sixth Workshop on Noisy User-Generated Text (W-NUT 2020), pp. 132\u2013142 (2020)","DOI":"10.18653\/v1\/2020.wnut-1.18"},{"key":"30_CR2","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: A deep siamese network for scene detection in broadcast videos. In: Proceedings of ACM MM 2015, pp. 1199\u20131202 (2015)","DOI":"10.1145\/2733373.2806316"},{"issue":"1","key":"30_CR3","doi-asserted-by":"publisher","first-page":"177","DOI":"10.1023\/A:1007506220214","volume":"34","author":"D Beeferman","year":"1999","unstructured":"Beeferman, D., Berger, A., Lafferty, J.: Statistical models for text segmentation. Mach. Learn. 34(1), 177\u2013210 (1999). https:\/\/doi.org\/10.1023\/A:1007506220214","journal-title":"Mach. Learn."},{"key":"30_CR4","unstructured":"Cao, X., Chen, Z., Le, C., Meng, L.: Multi-modal video chapter generation. ArXiv abs\/2209.12694 (2022)"},{"key":"30_CR5","doi-asserted-by":"crossref","unstructured":"Chen, D., Wang, D., Darrell, T., Ebrahimi, S.: Contrastive test-time adaptation. In: Proceedings of CVPR 2022, pp. 295\u2013305 (2022)","DOI":"10.1109\/CVPR52688.2022.00039"},{"key":"30_CR6","doi-asserted-by":"crossref","unstructured":"Chen, S., Nie, X., Fan, D.D., Zhang, D., Bhat, V., Hamid, R.: Shot contrastive self-supervised learning for scene boundary detection. In: Proceedings of CVPR 2021, pp. 9791\u20139800 (2021)","DOI":"10.1109\/CVPR46437.2021.00967"},{"key":"30_CR7","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL 2019, pp. 4171\u20134186. Association for Computational Linguistics (2019)"},{"key":"30_CR8","doi-asserted-by":"crossref","unstructured":"Eisenstein, J., Barzilay, R.: Bayesian unsupervised topic segmentation. In: Proceedings of EMNLP 2008, pp. 334\u2013343 (2008)","DOI":"10.3115\/1613715.1613760"},{"key":"30_CR9","doi-asserted-by":"crossref","unstructured":"Fraser, C., Kim, J., Shin, H., Brandt, J., Dontcheva, M.: Temporal segmentation of creative live streams. In: Proceedings of CHI 2020, pp. 1\u201312 (2020)","DOI":"10.1145\/3313831.3376437"},{"key":"30_CR10","doi-asserted-by":"crossref","unstructured":"Georgescul, M., Clark, A., Armstrong, S.: An analysis of quantitative aspects in the evaluation of thematic segmentation algorithms. In: Proceedings of SIGdial 2006, pp. 144\u2013151 (2006)","DOI":"10.3115\/1654595.1654622"},{"key":"30_CR11","doi-asserted-by":"crossref","unstructured":"Glava\u0161, G., Nanni, F., Ponzetto, S.P.: Unsupervised text segmentation using semantic relatedness graphs. In: Proceedings of the Fifth Joint Conference on Lexical and Computational Semantics, pp. 125\u2013130 (2016)","DOI":"10.18653\/v1\/S16-2016"},{"key":"30_CR12","unstructured":"Glavas, G., Somasundaran, S.: Two-level transformer and auxiliary coherence modeling for improved text segmentation. In: Proceeding of AAAI-2020, pp. 2306\u20132315 (2020)"},{"key":"30_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"505","DOI":"10.1007\/978-3-319-10584-0_33","volume-title":"Computer Vision \u2013 ECCV 2014","author":"M Gygli","year":"2014","unstructured":"Gygli, M., Grabner, H., Riemenschneider, H., Van Gool, L.: Creating summaries from user videos. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8695, pp. 505\u2013520. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10584-0_33"},{"key":"30_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of CVPR 2016, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"1","key":"30_CR15","first-page":"33","volume":"23","author":"MA Hearst","year":"1997","unstructured":"Hearst, M.A.: Text tiling: segmenting text into multi-paragraph subtopic passages. Comput. Linguist. 23(1), 33\u201364 (1997)","journal-title":"Comput. Linguist."},{"key":"30_CR16","doi-asserted-by":"crossref","unstructured":"Jadon, S., Jasim, M.: Unsupervised video summarization framework using keyframe extraction and video skimming. In: Proceedings of ICCCA 2020, pp. 140\u2013145 (2020)","DOI":"10.1109\/ICCCA49541.2020.9250764"},{"key":"30_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"95","DOI":"10.1007\/978-3-642-23017-2_7","volume-title":"Semantic Multimedia","author":"N James","year":"2011","unstructured":"James, N., Todorov, K., Hudelot, C.: Combining visual and textual modalities for multimedia ontology matching. In: Declerck, T., Granitzer, M., Grzegorzek, M., Romanelli, M., R\u00fcger, S., Sintek, M. (eds.) SAMT 2010. LNCS, vol. 6725, pp. 95\u2013110. Springer, Heidelberg (2011). https:\/\/doi.org\/10.1007\/978-3-642-23017-2_7"},{"key":"30_CR18","doi-asserted-by":"crossref","unstructured":"Jayaraman, D., Grauman, K.: Slow and steady feature analysis: higher order temporal coherence in video. In: Proceeding of CVPR 2016, pp. 3852\u20133861 (2016)","DOI":"10.1109\/CVPR.2016.418"},{"key":"30_CR19","doi-asserted-by":"crossref","unstructured":"Kang, G., Jiang, L., Yang, Y., Hauptmann, A.G.: Contrastive adaptation network for unsupervised domain adaptation. In: Proceedings of CVPR 2019, pp. 4893\u20134902 (2019)","DOI":"10.1109\/CVPR.2019.00503"},{"key":"30_CR20","doi-asserted-by":"crossref","unstructured":"Kim, D., et al.: Learning cross-modal contrastive features for video domain adaptation. In: Proceedings of ICCV 2021, pp. 13598\u201313607 (2021)","DOI":"10.1109\/ICCV48922.2021.01336"},{"key":"30_CR21","doi-asserted-by":"crossref","unstructured":"Koshorek, O., Cohen, A., Mor, N., Rotman, M., Berant, J.: Text segmentation as a supervised learning task. In: Proceedings of NAACL 2018, pp. 469\u2013473 (2018)","DOI":"10.18653\/v1\/N18-2075"},{"key":"30_CR22","unstructured":"Kumar, A., Mittal, T., Manocha, D.: MCQA: multimodal co-attention based network for question answering. CoRR abs\/2004.12238 (2020)"},{"key":"30_CR23","doi-asserted-by":"crossref","unstructured":"Li, J., Sun, A., Joty, S.: SegBot: a generic neural text segmentation model with pointer network. In: Proceedings of IJCAI-2018, pp. 4166\u20134172 (2018)","DOI":"10.24963\/ijcai.2018\/579"},{"key":"30_CR24","doi-asserted-by":"crossref","unstructured":"Lo, K., Jin, Y., Tan, W., Liu, M., Du, L., Buntine, W.: Transformer over pre-trained transformer for neural text segmentation with enhanced topic coherence. In: EMNLP 2021 (Findings), pp. 3334\u20133340 (2021)","DOI":"10.18653\/v1\/2021.findings-emnlp.283"},{"key":"30_CR25","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: Proceedings of NeurIPS 2016, pp. 289\u2013297 (2016)"},{"key":"30_CR26","doi-asserted-by":"crossref","unstructured":"Lukasik, M., Dadachev, B., Papineni, K., Sim\u00f5es, G.: Text segmentation by cross segment attention. In: Proceedings of EMNLP 2020, pp. 4707\u20134716 (2020)","DOI":"10.18653\/v1\/2020.emnlp-main.380"},{"key":"30_CR27","unstructured":"van den Oord, A., Li, Y., Vinyals, O.: Representation learning with contrastive predictive coding. ArXiv abs\/1807.03748 (2018)"},{"key":"30_CR28","doi-asserted-by":"publisher","unstructured":"Pevzner, L., Hearst, M.A.: A critique and improvement of an evaluation metric for text segmentation. Comput. Linguist. 28(1), 19\u201336 (2002). https:\/\/doi.org\/10.1162\/089120102317341756, https:\/\/aclanthology.org\/J02-1002","DOI":"10.1162\/089120102317341756"},{"key":"30_CR29","doi-asserted-by":"crossref","unstructured":"Qiu, J., et al.: Semantics-consistent cross-domain summarization via optimal transport alignment. ArXiv abs\/2210.04722 (2022)","DOI":"10.18653\/v1\/2023.findings-acl.101"},{"key":"30_CR30","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. ArXiv abs\/2212.04356 (2022)"},{"key":"30_CR31","doi-asserted-by":"crossref","unstructured":"Rao, A., et al.: A local-to-global approach to multi-modal movie scene segmentation. In: Proceedings of CVPR 2020, pp. 10146\u201310155 (2020)","DOI":"10.1109\/CVPR42600.2020.01016"},{"key":"30_CR32","doi-asserted-by":"crossref","unstructured":"Rasheed, Z., Shah, M.: Scene detection in Hollywood movies and TV shows. In: Proceedings of CVPR 2003, p. II-343 (2003)","DOI":"10.1109\/CVPR.2003.1211489"},{"key":"30_CR33","unstructured":"Rui, Y., Huang, T.S., Mehrotra, S.: Exploring video structure beyond the shots. In: Proceedings of the IEEE International Conference on Multimedia Computing and Systems, pp. 237\u2013240 (1998)"},{"key":"30_CR34","doi-asserted-by":"crossref","unstructured":"Song, Y., Vallmitjana, J., Stent, A., Jaimes, A.: TVSum: summarizing web videos using titles. In: Proceeding of CVPR 2015, pp. 5179\u20135187 (2015)","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"30_CR35","unstructured":"Sou\u010dek, T., Loko\u010d, J.: TransNet V2: an effective deep network architecture for fast shot transition detection. ArXiv abs\/2008.04838 (2020)"},{"key":"30_CR36","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"30_CR37","unstructured":"Wang, Z., Zhong, Y., Miao, Y., Ma, L., Specia, L.: Contrastive video-language learning with fine-grained frame sampling. In: Proceedings of AACL-IJCNLP 2022, pp. 694\u2013705 (2022)"},{"key":"30_CR38","doi-asserted-by":"crossref","unstructured":"Xiao, S., et al.: Boundary proposal network for two-stage natural language video localization. In: Proceedings of AAAI-2021 (2021)","DOI":"10.1609\/aaai.v35i4.16406"},{"key":"30_CR39","doi-asserted-by":"crossref","unstructured":"Xing, L., Carenini, G.: Improving unsupervised dialogue topic segmentation with utterance-pair coherence scoring. In: Proceedings of SIGdial 2021, pp. 167\u2013177 (2021)","DOI":"10.18653\/v1\/2021.sigdial-1.18"},{"key":"30_CR40","unstructured":"Xing, L., Hackinen, B., Carenini, G., Trebbi, F.: Improving context modeling in neural topic segmentation. In: Proceedings of AACL-IJCNLP 2020, pp. 626\u2013636 (2020)"},{"key":"30_CR41","unstructured":"Xing, L., Huber, P., Carenini, G.: Improving topic segmentation by injecting discourse dependencies. In: Proceedings of the 3rd Workshop on Computational Approaches to Discourse, pp. 7\u201318 (2022)"},{"key":"30_CR42","unstructured":"Zhu, W., Pang, B., Thapliyal, A.V., Wang, W.Y., Soricut, R.: End-to-end dense video captioning as sequence generation. In: Proceedings of COLING 2022, pp. 5651\u20135665 (2022)"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-53311-2_30","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T09:55:52Z","timestamp":1731146152000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-53311-2_30"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031533105","9783031533112"],"references-count":42,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-53311-2_30","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 February 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ConfTool Pro","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"112","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}