2009 |
Dimitrios Dimitriadis, Alexandros Potamianos, Petros Maragos A comparison of the squared energy and teager-kaiser operators for short-term energy estimation in additive noise Journal Article IEEE Transactions on Signal Processing, 57 (7), pp. 2569–2581, 2009, ISSN: 1053587X. Abstract | BibTeX | Links: [PDF]
@comment{Journal article. The citation key 132 is kept unchanged so existing citations still resolve. pubstate and tppubtype are carried over as found; NOTE(review): tppubtype is not a standard BibTeX field, presumably tool metadata - confirm before removing.}
@article{132,
  author    = {Dimitriadis, Dimitrios and Potamianos, Alexandros and Maragos, Petros},
  title     = {A comparison of the squared energy and {Teager-Kaiser} operators for short-term energy estimation in additive noise},
  journal   = {{IEEE} Transactions on Signal Processing},
  volume    = {57},
  number    = {7},
  pages     = {2569--2581},
  year      = {2009},
  issn      = {1053-587X},
  doi       = {10.1109/TSP.2009.2019299},
  url       = {http://robotics.ntua.gr/wp-content/uploads/publications/DimitriadisPotamianosMaragos_ComparisonSquaredAmpl-TKOper-EnergyEstimation_ieeetSP2008.pdf},
  abstract  = {Time-frequency distributions that evaluate the signal's energy content both in the time and frequency domains are indispensable signal processing tools, especially, for nonstationary signals. Various short-time energy computation schemes are used in practice, including the mean squared amplitude and Teager-Kaiser energy approaches. Herein, we focus primarily on the short- and medium-term properties of these two energy estimation schemes, as well as, on their performance in the presence of additive noise. To facilitate this analysis and generalize the approach, we use a harmonic noise model to approximate the noise component. The error analysis is conducted both in the continuous- and discrete-time domains, deriving similar conclusions. The estimation errors are measured in terms of normalized deviations from the expected signal energy and are shown to greatly depend on both the signals' spectral content and the analysis window length. When medium- and long-term analysis windows are employed, the Teager-Kaiser energy operator is proven superior to the common squared energy operator, provided that the spectral content of the noise is more lowpass than the corresponding signal content, and vice versa. However, for shorter window lengths, the Teager-Kaiser operator always outperforms the squared energy operator. The theoretical results are experimentally verified for synthetic signals. Finally, the performance of the proposed energy operators is evaluated for short-term analysis of noisy speech signals and the implications for speech processing applications are outlined.},
  pubstate  = {published},
  tppubtype = {article},
}
Time-frequency distributions that evaluate the signal's energy content both in the time and frequency domains are indispensable signal processing tools, especially, for nonstationary signals. Various short-time energy computation schemes are used in practice, including the mean squared amplitude and Teager-Kaiser energy approaches. Herein, we focus primarily on the short- and medium-term properties of these two energy estimation schemes, as well as, on their performance in the presence of additive noise. To facilitate this analysis and generalize the approach, we use a harmonic noise model to approximate the noise component. The error analysis is conducted both in the continuous- and discrete-time domains, deriving similar conclusions. The estimation errors are measured in terms of normalized deviations from the expected signal energy and are shown to greatly depend on both the signals' spectral content and the analysis window length. When medium- and long-term analysis windows are employed, the Teager-Kaiser energy operator is proven superior to the common squared energy operator, provided that the spectral content of the noise is more lowpass than the corresponding signal content, and vice versa. 
However, for shorter window lengths, the Teager-Kaiser operator always outperforms the squared energy operator. The theoretical results are experimentally verified for synthetic signals. Finally, the performance of the proposed energy operators is evaluated for short-term analysis of noisy speech signals and the implications for speech processing applications are outlined. |
Vassilis Pitsikalis, Petros Maragos Analysis and classification of speech signals by generalized fractal dimension features Journal Article Speech Communication, 51 (12), pp. 1206–1223, 2009, ISSN: 01676393. Abstract | BibTeX | Links: [PDF]
@comment{Journal article. The citation key 136 is kept unchanged so existing citations still resolve. pubstate and tppubtype are carried over as found; NOTE(review): tppubtype is not a standard BibTeX field, presumably tool metadata - confirm before removing.}
@article{136,
  author    = {Pitsikalis, Vassilis and Maragos, Petros},
  title     = {Analysis and classification of speech signals by generalized fractal dimension features},
  journal   = {Speech Communication},
  volume    = {51},
  number    = {12},
  pages     = {1206--1223},
  year      = {2009},
  issn      = {0167-6393},
  doi       = {10.1016/j.specom.2009.06.005},
  url       = {http://robotics.ntua.gr/wp-content/uploads/publications/PitsikalisMaragos_AnalysisClassificationfSpeechFractalDimFeat_SpeechCommunication09.pdf},
  abstract  = {We explore nonlinear signal processing methods inspired by dynamical systems and fractal theory in order to analyze and characterize speech sounds. A speech signal is at first embedded in a multidimensional phase-space and further employed for the estimation of measurements related to the fractal dimensions. Our goals are to compute these raw measurements in the practical cases of speech signals, to further utilize them for the extraction of simple descriptive features and to address issues on the efficacy of the proposed features to characterize speech sounds. We observe that distinct feature vector elements obtain values or show statistical trends that on average depend on general characteristics such as the voicing, the manner and the place of articulation of broad phoneme classes. Moreover the way that the statistical parameters of the features are altered as an effect of the variation of phonetic characteristics seem to follow some roughly formed patterns. We also discuss some qualitative aspects concerning the linear phoneme-wise correlation between the fractal features and the commonly employed mel-frequency cepstral coefficients (MFCCs) demonstrating phonetic cases of maximal and minimal correlation. In the same context we also investigate the fractal features' spectral content, in terms of the most and least correlated components with the MFCC. Further the proposed methods are examined under the light of indicative phoneme classification experiments. These quantify the efficacy of the features to characterize broad classes of speech sounds. The results are shown to be comparable for some classification scenarios with the corresponding ones of the MFCC features. {\textcopyright} 2009 Elsevier B.V. All rights reserved.},
  pubstate  = {published},
  tppubtype = {article},
}
We explore nonlinear signal processing methods inspired by dynamical systems and fractal theory in order to analyze and characterize speech sounds. A speech signal is at first embedded in a multidimensional phase-space and further employed for the estimation of measurements related to the fractal dimensions. Our goals are to compute these raw measurements in the practical cases of speech signals, to further utilize them for the extraction of simple descriptive features and to address issues on the efficacy of the proposed features to characterize speech sounds. We observe that distinct feature vector elements obtain values or show statistical trends that on average depend on general characteristics such as the voicing, the manner and the place of articulation of broad phoneme classes. Moreover the way that the statistical parameters of the features are altered as an effect of the variation of phonetic characteristics seem to follow some roughly formed patterns. We also discuss some qualitative aspects concerning the linear phoneme-wise correlation between the fractal features and the commonly employed mel-frequency cepstral coefficients (MFCCs) demonstrating phonetic cases of maximal and minimal correlation. In the same context we also investigate the fractal features' spectral content, in terms of the most and least correlated components with the MFCC. 
Further the proposed methods are examined under the light of indicative phoneme classification experiments. These quantify the efficacy of the features to characterize broad classes of speech sounds. The results are shown to be comparable for some classification scenarios with the corresponding ones of the MFCC features. © 2009 Elsevier B.V. All rights reserved. |
2008 |
Anastasia Sofou, Petros Maragos Generalized flooding and multicue PDE-based image segmentation Journal Article IEEE Transactions on Image Processing, 17 (3), pp. 364–376, 2008, ISSN: 10577149. Abstract | BibTeX | Links: [PDF]
@comment{Journal article. The citation key 126 is kept unchanged so existing citations still resolve. pubstate and tppubtype are carried over as found; NOTE(review): tppubtype is not a standard BibTeX field, presumably tool metadata - confirm before removing.}
@article{126,
  author    = {Sofou, Anastasia and Maragos, Petros},
  title     = {Generalized flooding and multicue {PDE}-based image segmentation},
  journal   = {{IEEE} Transactions on Image Processing},
  volume    = {17},
  number    = {3},
  pages     = {364--376},
  year      = {2008},
  issn      = {1057-7149},
  doi       = {10.1109/TIP.2007.916156},
  url       = {http://robotics.ntua.gr/wp-content/uploads/publications/SofouMaragos_GeneralizedFloodingMulticuePDEBasedImageSegm_ImageProc08.pdf},
  abstract  = {Image segmentation remains an important, but hard-to-solve, problem since it appears to be application dependent with usually no a priori information available regarding the image structure. Moreover, the increasing demands of image analysis tasks in terms of segmentation results' quality introduce the necessity of employing multiple cues for improving image segmentation results. In this paper, we attempt to incorporate cues such as intensity contrast, region size, and texture in the segmentation procedure and derive improved results compared to using individual cues separately. We emphasize on the overall segmentation procedure, and we propose efficient simplification operators and feature extraction schemes, capable of quantifying important characteristics, like geometrical complexity, rate of change in local contrast variations, and orientation, that eventually favor the final segmentation result. Based on the well-known morphological paradigm of watershed transform segmentation, which exploits intensity contrast and region size criteria, we investigate its partial differential equation (PDE) formulation, and we extend it in order to satisfy various flooding criteria, thus making it applicable to a wider range of images. Going a step further, we introduce a segmentation scheme that couples contrast criteria in flooding with texture information. The modeling of the proposed scheme is done via PDEs and the efficient incorporation of the available contrast and texture information, is done by selecting an appropriate cartoon-texture image decomposition scheme. The proposed coupled segmentation scheme is driven by two separate image components: cartoon U (for contrast information) and texture component V. The performance of the proposed segmentation scheme is demonstrated through a complete set of experimental results and substantiated using quantitative and qualitative criteria.},
  pubstate  = {published},
  tppubtype = {article},
}
Image segmentation remains an important, but hard-to-solve, problem since it appears to be application dependent with usually no a priori information available regarding the image structure. Moreover, the increasing demands of image analysis tasks in terms of segmentation results' quality introduce the necessity of employing multiple cues for improving image segmentation results. In this paper, we attempt to incorporate cues such as intensity contrast, region size, and texture in the segmentation procedure and derive improved results compared to using individual cues separately. We emphasize on the overall segmentation procedure, and we propose efficient simplification operators and feature extraction schemes, capable of quantifying important characteristics, like geometrical complexity, rate of change in local contrast variations, and orientation, that eventually favor the final segmentation result. Based on the well-known morphological paradigm of watershed transform segmentation, which exploits intensity contrast and region size criteria, we investigate its partial differential equation (PDE) formulation, and we extend it in order to satisfy various flooding criteria, thus making it applicable to a wider range of images. 
Going a step further, we introduce a segmentation scheme that couples contrast criteria in flooding with texture information. The modeling of the proposed scheme is done via PDEs and the efficient incorporation of the available contrast and texture information, is done by selecting an appropriate cartoon-texture image decomposition scheme. The proposed coupled segmentation scheme is driven by two separate image components: cartoon U (for contrast information) and texture component V. The performance of the proposed segmentation scheme is demonstrated through a complete set of experimental results and substantiated using quantitative and qualitative criteria. |
2002 |
D Dimitriadis, P Maragos, A Potamianos Modulation features for speech recognition Conference International Conference on Acoustics, Speech, and Signal Processing, Orlando, Florida, 2002.
@comment{Conference paper. The citation key Dimitriadis2002 is kept unchanged so existing citations still resolve. Author given names are expanded to the spellings used by the 2009 journal entry in this same listing; NOTE(review): confirm these are the same people. address holds the conference location as found. tppubtype is carried over as found; presumably tool metadata - confirm before removing.}
@inproceedings{Dimitriadis2002,
  author    = {Dimitriadis, Dimitrios and Maragos, Petros and Potamianos, Alexandros},
  title     = {Modulation features for speech recognition},
  booktitle = {{IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  address   = {Orlando, Florida},
  year      = {2002},
  date      = {2002-05-13},
  url       = {http://robotics.ntua.gr/wp-content/uploads/sites/2/dimitriadis2002.pdf},
  pubstate  = {published},
  tppubtype = {conference},
}
|
V Pitsikalis, P Maragos Speech analysis and feature extraction using chaotic models Conference IEEE International Conference on Acoustics, Speech, and Signal Processing, Orlando, Florida, 2002.
@comment{Conference paper. The citation key Pitsikalis2002 is kept unchanged so existing citations still resolve. Author given names are expanded to the spellings used by the 2009 journal entry in this same listing; NOTE(review): confirm these are the same people. address holds the conference location as found. tppubtype is carried over as found; presumably tool metadata - confirm before removing.}
@inproceedings{Pitsikalis2002,
  author    = {Pitsikalis, Vassilis and Maragos, Petros},
  title     = {Speech analysis and feature extraction using chaotic models},
  booktitle = {{IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  address   = {Orlando, Florida},
  year      = {2002},
  date      = {2002-05-13},
  doi       = {10.1109/ICASSP.2002.5743772},
  url       = {http://robotics.ntua.gr/wp-content/uploads/sites/2/pitsikalis2002.pdf},
  pubstate  = {published},
  tppubtype = {conference},
}
|
Copyright Notice:
Some material presented is available for download to ensure timely dissemination of scholarly and technical work. Copyright and all rights therein are retained by authors or by other copyright holders. All persons copying this information are expected to adhere to the terms and constraints invoked by each author’s copyright. In most cases, these works may not be reposted without the explicit permission of the copyright holder.
The work already published by the IEEE is under its copyright. Personal use of such material is permitted. However, permission to reprint/republish the material for advertising or promotional purposes, or for creating new collective works for resale or redistribution to servers or lists, or to reuse any copyrighted component of the work in other works must be obtained from the IEEE.