2015
P. Koutras, A. Zlatintsi, E. Iosif, A. Katsamanis, P. Maragos, A. Potamianos
Predicting Audio-Visual Salient Events Based on Visual, Audio and Text Modalities for Movie Summarization
Conference: Proc. {IEEE} Int'l Conf. on Image Processing (ICIP), Quebec City, Canada, pp. 4361--4365, 2015, ISSN: 1522-4880, DOI: 10.1109/ICIP.2015.7351630.

@conference{KZI+15,
  title = {Predicting Audio-Visual Salient Events Based on Visual, Audio and Text Modalities for Movie Summarization},
  author = {P. Koutras and A. Zlatintsi and E. Iosif and A. Katsamanis and P. Maragos and A. Potamianos},
  url = {http://robotics.ntua.gr/wp-content/uploads/publications/KZIKMP_MovieSum2_ICIP-2015.pdf},
  doi = {10.1109/ICIP.2015.7351630},
  issn = {1522-4880},
  year = {2015},
  date = {2015-09-01},
  booktitle = {Proc. {IEEE} Int'l Conf. on Image Processing (ICIP)},
  pages = {4361--4365},
  address = {Quebec City, Canada},
  abstract = {In this paper, we present a new and improved synergistic approach to the problem of audio-visual salient event detection and movie summarization based on visual, audio and text modalities. Spatio-temporal visual saliency is estimated through a perceptually inspired frontend based on 3D (space, time) Gabor filters, and frame-wise features are extracted from the saliency volumes. For auditory salient event detection we extract features based on the Teager-Kaiser Energy Operator, while text analysis incorporates part-of-speech tagging and affective modeling of single words on the movie subtitles. For the evaluation of the proposed system, we employ an elementary non-parametric classification technique, namely KNN. Detection results are reported on the MovSum database, using objective evaluations against ground truth denoting the perceptually salient events, as well as human evaluations of the movie summaries. Our evaluation verifies the appropriateness of the proposed methods compared to our baseline system. Finally, our newly proposed summarization algorithm produces summaries that consist of salient and meaningful events, also improving the comprehension of the semantics.},
  keywords = {},
  pubstate = {published},
  tppubtype = {conference}
}
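The audio front-end in this line of work rests on the Teager-Kaiser Energy Operator (TKEO), Psi[x(n)] = x(n)^2 - x(n-1) x(n+1), which tracks the instantaneous energy of amplitude- and frequency-modulated signals. A minimal NumPy sketch of a frame-wise TKEO feature follows; the single-band processing, the frame/hop sizes, and the border handling are illustrative assumptions (the papers apply the operator across a multiband filterbank):

    import numpy as np

    def teager_kaiser(x):
        # Discrete TKEO: psi[n] = x[n]^2 - x[n-1] * x[n+1]
        x = np.asarray(x, dtype=float)
        psi = np.empty_like(x)
        psi[1:-1] = x[1:-1] ** 2 - x[:-2] * x[2:]
        psi[0], psi[-1] = psi[1], psi[-2]  # pad the borders by replication
        return np.abs(psi)              # rectify; raw TKEO can go negative

    def audio_saliency(x, frame=1024, hop=512):
        # Mean Teager energy per frame as a crude audio-saliency curve
        # (assumes len(x) >= frame).
        psi = teager_kaiser(x)
        n = 1 + (len(psi) - frame) // hop
        return np.array([psi[i * hop : i * hop + frame].mean() for i in range(n)])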
2013
Georgios Evangelopoulos, Athanasia Zlatintsi, Alexandros Potamianos, Petros Maragos, Konstantinos Rapantzikos, Georgios Skoumas, Yannis Avrithis
Multimodal saliency and fusion for movie summarization based on aural, visual, and textual attention
Journal Article: IEEE Transactions on Multimedia, 15 (7), pp. 1553--1568, 2013, ISSN: 1520-9210, DOI: 10.1109/TMM.2013.2267205.

@article{141,
  title = {Multimodal saliency and fusion for movie summarization based on aural, visual, and textual attention},
  author = {Georgios Evangelopoulos and Athanasia Zlatintsi and Alexandros Potamianos and Petros Maragos and Konstantinos Rapantzikos and Georgios Skoumas and Yannis Avrithis},
  url = {http://robotics.ntua.gr/wp-content/uploads/publications/EZPMRSA_MultimodalSaliencyFusionMovieSumAVTattention_ieeetMM13.pdf},
  doi = {10.1109/TMM.2013.2267205},
  issn = {1520-9210},
  year = {2013},
  date = {2013-01-01},
  journal = {IEEE Transactions on Multimedia},
  volume = {15},
  number = {7},
  pages = {1553--1568},
  abstract = {Multimodal streams of sensory information are naturally parsed and integrated by humans using signal-level feature extraction and higher-level cognitive processes. Detection of attention-invoking audiovisual segments is formulated in this work on the basis of saliency models for the audio, visual, and textual information conveyed in a video stream. Aural or auditory saliency is assessed by cues that quantify multifrequency waveform modulations, extracted through nonlinear operators and energy tracking. Visual saliency is measured through a spatiotemporal attention model driven by intensity, color, and orientation. Textual or linguistic saliency is extracted from part-of-speech tagging on the subtitle information available with most movie distributions. The individual saliency streams, obtained from modality-dependent cues, are integrated in a multimodal saliency curve, modeling the time-varying perceptual importance of the composite video stream and signifying prevailing sensory events. The multimodal saliency representation forms the basis of a generic, bottom-up video summarization algorithm. Different fusion schemes are evaluated on a movie database of multimodal saliency annotations with comparative results provided across modalities. The produced summaries, based on low-level features and content-independent fusion and selection, are of subjectively high aesthetic and informative quality.},
  keywords = {},
  pubstate = {published},
  tppubtype = {article}
}
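The fusion experiments in this article combine the normalized aural, visual, and textual saliency curves into one multimodal curve. A minimal sketch of weighted linear fusion, one of the simplest schemes of the kind evaluated, is given below; the min-max normalization and the equal weights are illustrative assumptions, not the article's tuned values:

    import numpy as np

    def minmax(s):
        # Rescale a saliency curve to [0, 1].
        s = np.asarray(s, dtype=float)
        return (s - s.min()) / (s.max() - s.min() + 1e-12)

    def fuse_linear(audio, visual, text, w=(1/3, 1/3, 1/3)):
        # Weighted linear fusion of the three normalized modality curves
        # into a single multimodal saliency curve.
        a, v, t = minmax(audio), minmax(visual), minmax(text)
        return w[0] * a + w[1] * v + w[2] * t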
2009
G. Evangelopoulos, A. Zlatintsi, G. Skoumas, K. Rapantzikos, A. Potamianos, P. Maragos, Y. Avrithis
Video Event Detection and Summarization Using Audio, Visual and Text Saliency
Conference: Proc. {IEEE} Int'l Conf. Acous., Speech, and Signal Processing, Taipei, Taiwan, 2009.

@conference{EZS+09,
  title = {Video Event Detection and Summarization Using Audio, Visual and Text Saliency},
  author = {G. Evangelopoulos and A. Zlatintsi and G. Skoumas and K. Rapantzikos and A. Potamianos and P. Maragos and Y. Avrithis},
  url = {http://robotics.ntua.gr/wp-content/publications/EvangelopoulosZlatintsiEtAl_VideoEventDetectionSummarizationUsingAVTSaliency_ICASSP09.pdf},
  year = {2009},
  date = {2009-04-01},
  booktitle = {Proc. {IEEE} Int'l Conf. Acous., Speech, and Signal Processing},
  address = {Taipei, Taiwan},
  abstract = {Detection of perceptually important video events is formulated here on the basis of saliency models for the audio, visual and textual information conveyed in a video stream. Audio saliency is assessed by cues that quantify multifrequency waveform modulations, extracted through nonlinear operators and energy tracking. Visual saliency is measured through a spatiotemporal attention model driven by intensity, color and motion. Text saliency is extracted from part-of-speech tagging on the subtitle information available with most movie distributions. The various modality curves are integrated in a single attention curve, where the presence of an event may be signified in one or multiple domains. This multimodal saliency curve is the basis of a bottom-up video summarization algorithm that refines results from unimodal or audiovisual-based skimming. The algorithm performs favorably for video summarization in terms of informativeness and enjoyability.},
  keywords = {},
  pubstate = {published},
  tppubtype = {conference}
}
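Text saliency here comes from part-of-speech tagging of the subtitles, with content words treated as more salient than function words. A minimal sketch using NLTK's off-the-shelf tagger follows; the tag-to-saliency mapping (nouns, verbs, adjectives and adverbs score, everything else does not) is an illustrative assumption, not the authors' exact weighting:

    import nltk  # first run: nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')

    # Penn Treebank tag prefixes assumed salient: nouns, verbs, adjectives, adverbs.
    SALIENT_TAGS = ("NN", "VB", "JJ", "RB")

    def text_saliency(subtitle_line):
        # Score a subtitle line by its fraction of content-word tokens, in [0, 1].
        tokens = nltk.word_tokenize(subtitle_line)
        if not tokens:
            return 0.0
        tags = [tag for _, tag in nltk.pos_tag(tokens)]
        hits = sum(tag.startswith(SALIENT_TAGS) for tag in tags)
        return hits / len(tags)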
2008
G. Evangelopoulos, K. Rapantzikos, A. Potamianos, P. Maragos, A. Zlatintsi, Y. Avrithis
Movie Summarization based on Audiovisual Saliency Detection
Conference: Proc. {IEEE} Int'l Conf. on Image Processing (ICIP), San Diego, CA, USA, pp. 2528--2531, 2008, ISSN: 1522-4880, DOI: 10.1109/ICIP.2008.4712308.

@conference{ERP+08,
  title = {Movie Summarization based on Audiovisual Saliency Detection},
  author = {G. Evangelopoulos and K. Rapantzikos and A. Potamianos and P. Maragos and A. Zlatintsi and Y. Avrithis},
  url = {http://robotics.ntua.gr/wp-content/uploads/publications/ERPMZA_MovieSummarizAVSaliency_ICIP2008.pdf},
  doi = {10.1109/ICIP.2008.4712308},
  issn = {1522-4880},
  year = {2008},
  date = {2008-10-01},
  booktitle = {Proc. {IEEE} Int'l Conf. on Image Processing (ICIP)},
  pages = {2528--2531},
  address = {San Diego, CA, USA},
  abstract = {Based on perceptual and computational attention modeling studies, we formulate measures of saliency for an audiovisual stream. Audio saliency is captured by signal modulations and related multi-frequency band features, extracted through nonlinear operators and energy tracking. Visual saliency is measured by means of a spatiotemporal attention model driven by various feature cues (intensity, color, motion). Audio and video curves are integrated in a single attention curve, where events may be enhanced, suppressed or vanish. The presence of salient events is signified on this audiovisual curve by geometrical features such as local extrema, sharp transition points and level sets. An audiovisual saliency-based movie summarization algorithm is proposed and evaluated. The algorithm is shown to perform very well in terms of summary informativeness and enjoyability for movie clips of various genres.},
  keywords = {},
  pubstate = {published},
  tppubtype = {conference}
}
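The summarization algorithm selects the most salient stretches of the audiovisual attention curve under a target skim duration. A minimal sketch of greedy top-segment selection follows; the fixed segment length and the purely greedy choice are illustrative assumptions, whereas the papers locate events through the curve's geometry (local extrema, sharp transition points, level sets) before selection:

    import numpy as np

    def summarize(saliency, n_keep, seg_len=25):
        # Split the frame-level saliency curve into fixed-length segments,
        # score each by its mean saliency, keep the n_keep best, and return
        # them in temporal order as (start_frame, end_frame) pairs.
        n_segs = len(saliency) // seg_len
        scores = [np.mean(saliency[i * seg_len : (i + 1) * seg_len]) for i in range(n_segs)]
        best = sorted(np.argsort(scores)[-n_keep:])
        return [(i * seg_len, (i + 1) * seg_len) for i in best]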
Copyright Notice:
Some material presented is available for download to ensure timely dissemination of scholarly and technical work. Copyright and all rights therein are retained by authors or by other copyright holders. All persons copying this information are expected to adhere to the terms and constraints invoked by each author’s copyright. In most cases, these works may not be reposted without the explicit permission of the copyright holder.
Work already published by the IEEE remains under IEEE copyright. Personal use of such material is permitted. However, permission to reprint or republish this material for advertising or promotional purposes, to create new collective works for resale or redistribution to servers or lists, or to reuse any copyrighted component of these works in other works must be obtained from the IEEE.