2014 |
Georgios Pavlakos, Stavros Theodorakis, Vassilis Pitsikalis, Athanasios Katsamanis, Petros Maragos Kinect-based multimodal gesture recognition using a two-pass fusion scheme Conference 2014 IEEE International Conference on Image Processing, ICIP 2014, 2014, ISBN: 9781479957514. Abstract | BibTeX | Links: [PDF] @conference{165, title = {Kinect-based multimodal gesture recognition using a two-pass fusion scheme}, author = { Georgios Pavlakos and Stavros Theodorakis and Vassilis Pitsikalis and Athanasios Katsamanis and Petros Maragos}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/PTPΚΜ_MultimodalGestureRecogn2PassFusion_ICIP2014.pdf}, doi = {10.1109/ICIP.2014.7025299}, isbn = {9781479957514}, year = {2014}, date = {2014-01-01}, booktitle = {2014 IEEE International Conference on Image Processing, ICIP 2014}, pages = {1495--1499}, abstract = {We present a new framework for multimodal gesture recognition that is based on a two-pass fusion scheme. In this, we deal with a demanding Kinect-based multimodal dataset, which was introduced in a recent gesture recognition challenge. We employ multiple modalities, i.e., visual cues, such as colour and depth images, as well as audio, and we specifically extract feature descriptors of the hands' movement, handshape, and audio spectral properties. Based on these features, we statistically train separate unimodal gesture-word models, namely hidden Markov models, explicitly accounting for the dynamics of each modality. Multimodal recognition of unknown gesture sequences is achieved by combining these models in a late, two-pass fusion scheme that exploits a set of unimodally generated n-best recognition hypotheses. 
The proposed scheme achieves 88.2% gesture recognition accuracy in the Kinect-based multimodal dataset, outperforming all recently published approaches on the same challenging multimodal gesture recognition task.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } We present a new framework for multimodal gesture recognition that is based on a two-pass fusion scheme. In this, we deal with a demanding Kinect-based multimodal dataset, which was introduced in a recent gesture recognition challenge. We employ multiple modalities, i.e., visual cues, such as colour and depth images, as well as audio, and we specifically extract feature descriptors of the hands' movement, handshape, and audio spectral properties. Based on these features, we statistically train separate unimodal gesture-word models, namely hidden Markov models, explicitly accounting for the dynamics of each modality. Multimodal recognition of unknown gesture sequences is achieved by combining these models in a late, two-pass fusion scheme that exploits a set of unimodally generated n-best recognition hypotheses. The proposed scheme achieves 88.2% gesture recognition accuracy in the Kinect-based multimodal dataset, outperforming all recently published approaches on the same challenging multimodal gesture recognition task. |
2013 |
I. Rodomagoulakis, P. Giannoulis, Z. I. Skordilis, P. Maragos, G. Potamianos Experiments on far-field multichannel speech processing in smart homes Conference 2013 18th International Conference on Digital Signal Processing, DSP 2013, 2013, ISBN: 9781467358057. @conference{175, title = {Experiments on far-field multichannel speech processing in smart homes}, author = { I. Rodomagoulakis and P. Giannoulis and Z. I. Skordilis and P. Maragos and G. Potamianos}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/RGSMP_ExperimsFarfieldMultichannelSpeechProcessSmartHomes_DSP2013.pdf}, doi = {10.1109/ICDSP.2013.6622707}, isbn = {9781467358057}, year = {2013}, date = {2013-01-01}, booktitle = {2013 18th International Conference on Digital Signal Processing, DSP 2013}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
2011 |
Dimitrios Dimitriadis, Petros Maragos, Alexandros Potamianos On the effects of filterbank design and energy computation on robust speech recognition Journal Article IEEE Transactions on Audio, Speech and Language Processing, 19 (6), pp. 1504–1516, 2011, ISSN: 15587916. Abstract | BibTeX | Links: [PDF] @article{137, title = {On the effects of filterbank design and energy computation on robust speech recognition}, author = {Dimitrios Dimitriadis and Petros Maragos and Alexandros Potamianos}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/DimitriadisMaragosPotamianos_Effects-Filterbank-Design-Energy-Computation-Robust-Speech-Recognition_ieeeTASLP_aug11.pdf}, doi = {10.1109/TASL.2010.2092766}, issn = {15587916}, year = {2011}, date = {2011-01-01}, journal = {IEEE Transactions on Audio, Speech and Language Processing}, volume = {19}, number = {6}, pages = {1504--1516}, abstract = {In this paper, we examine how energy computation and filterbank design contribute to the overall front-end robustness, especially when the investigated features are applied to noisy speech signals, in mismatched training-testing conditions. In prior work (“Auditory Teager energy cepstrum coefficients for robust speech recognition,” D. Dimitriadis, P. Maragos, and A. Potamianos, in Proc. Eurospeech'05, Sep. 2005), a novel feature set called “Teager energy cepstrum coefficients” (TECCs) has been proposed, employing a dense, smooth filterbank and alternative energy computation schemes. TECCs were shown to be more robust to noise and exhibit improved performance compared to the widely used Mel frequency cepstral coefficients (MFCCs). In this paper, we attempt to interpret these results using a combined theoretical and experimental analysis framework. 
Specifically, we investigate in detail the connection between the filterbank design, i.e., the filter shape and bandwidth, the energy estimation scheme and the automatic speech recognition (ASR) performance under a variety of additive and/or convolutional noise conditions. For this purpose: 1) the performance of filterbanks using triangular, Gabor, and Gammatone filters with various bandwidths and filter positions are examined under different noisy speech recognition tasks, and 2) the squared amplitude and Teager–Kaiser energy operators are compared as two alternative approaches of computing the signal energy. Our end-goal is to understand how to select the most efficient filterbank and energy computation scheme that are maximally robust under both clean and noisy recording conditions. Theoretical and experimental results show that: 1) the filter bandwidth is one of the most important factors affecting speech recognition performance in noise, while the shape of the filter is of secondary importance, and 2) the Teager–Kaiser operator outperforms (on the average and for most noise types) the squared amplitude energy computation scheme for speech recognition in noisy conditions, especially, for large filter bandwidths. Experimental results show that selecting the appropriate filterbank and energy computation scheme can lead to significant error rate reduction over both MFCC and perceptual linear prediction (PLP) features for a variety of speech recognition tasks. 
A relative error rate reduction of up to $\sim$30\% for MFCCs and $\sim$39\% for PLPs is shown for the Aurora-3 Spanish Task.}, keywords = {}, pubstate = {published}, tppubtype = {article} } In this paper, we examine how energy computation and filterbank design contribute to the overall front-end robustness, especially when the investigated features are applied to noisy speech signals, in mismatched training-testing conditions. In prior work (“Auditory Teager energy cepstrum coefficients for robust speech recognition,” D. Dimitriadis, P. Maragos, and A. Potamianos, in Proc. Eurospeech'05, Sep. 2005), a novel feature set called “Teager energy cepstrum coefficients” (TECCs) has been proposed, employing a dense, smooth filterbank and alternative energy computation schemes. TECCs were shown to be more robust to noise and exhibit improved performance compared to the widely used Mel frequency cepstral coefficients (MFCCs). In this paper, we attempt to interpret these results using a combined theoretical and experimental analysis framework. Specifically, we investigate in detail the connection between the filterbank design, i.e., the filter shape and bandwidth, the energy estimation scheme and the automatic speech recognition (ASR) performance under a variety of additive and/or convolutional noise conditions. 
For this purpose: 1) the performance of filterbanks using triangular, Gabor, and Gammatone filters with various bandwidths and filter positions are examined under different noisy speech recognition tasks, and 2) the squared amplitude and Teager–Kaiser energy operators are compared as two alternative approaches of computing the signal energy. Our end-goal is to understand how to select the most efficient filterbank and energy computation scheme that are maximally robust under both clean and noisy recording conditions. Theoretical and experimental results show that: 1) the filter bandwidth is one of the most important factors affecting speech recognition performance in noise, while the shape of the filter is of secondary importance, and 2) the Teager–Kaiser operator outperforms (on the average and for most noise types) the squared amplitude energy computation scheme for speech recognition in noisy conditions, especially, for large filter bandwidths. Experimental results show that selecting the appropriate filterbank and energy computation scheme can lead to significant error rate reduction over both MFCC and perceptual linear prediction (PLP) features for a variety of speech recognition tasks. A relative error rate reduction of up to ~30% for MFCCs and ~39% for PLPs is shown for the Aurora-3 Spanish Task. |
2009 |
D. Dimitriadis, A. Metallinou, I. Konstantinou, G. Goumas, P. Maragos, N. Koziris GridNews: A distributed automatic Greek broadcast transcription system Conference ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings, 2009, ISSN: 15206149. Abstract | BibTeX | Links: [PDF] @conference{196, title = {GridNews: A distributed automatic Greek broadcast transcription system}, author = { D. Dimitriadis and A. Metallinou and I. Konstantinou and G. Goumas and P. Maragos and N. Koziris}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/DimitriadisMetallinouEtAl_Gridnews-AutomaticGreekTranscriptionSystem_ICASSP09.pdf}, doi = {10.1109/ICASSP.2009.4959984}, issn = {15206149}, year = {2009}, date = {2009-01-01}, booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, pages = {1917--1920}, abstract = {In this paper, a distributed system storing and retrieving Broadcast News data recorded from the Greek television is presented. These multimodal data are processed in a grid computational environment interconnecting distributed data storage and processing subsystems. The innovative element of this system is the implementation of the signal processing algorithms in this grid environment, offering additional flexibility and computational power. Among the developed signal processing modules are: the Segmentor, cutting up the original videos into shorter ones, the Classifier, recognizing whether these short videos contain speech or not, the Greek large-vocabulary speech Recognizer, transcribing speech into written text, and finally the text Search engine and the video Retriever. All the processed data are stored and retrieved in geographically distributed storage elements. 
A user-friendly, web-based interface is developed, facilitating the transparent import and storage of new multimodal data, their off-line processing and finally, their search and retrieval.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In this paper, a distributed system storing and retrieving Broadcast News data recorded from the Greek television is presented. These multimodal data are processed in a grid computational environment interconnecting distributed data storage and processing subsystems. The innovative element of this system is the implementation of the signal processing algorithms in this grid environment, offering additional flexibility and computational power. Among the developed signal processing modules are: the Segmentor, cutting up the original videos into shorter ones, the Classifier, recognizing whether these short videos contain speech or not, the Greek large-vocabulary speech Recognizer, transcribing speech into written text, and finally the text Search engine and the video Retriever. All the processed data are stored and retrieved in geographically distributed storage elements. A user-friendly, web-based interface is developed, facilitating the transparent import and storage of new multimodal data, their off-line processing and finally, their search and retrieval. |
2006 |
A Katsamanis, G Papandreou, V Pitsikalis, P Maragos Multimodal fusion by adaptive compensation for feature uncertainty with application to audiovisual speech recognition Conference Proc. 14th European Signal Processing Conference (EUSIPCO-2006), Florence, Italy, Sep. 2006, 2006, ISSN: 22195491. Abstract | BibTeX | Links: [PDF] @conference{225, title = {Multimodal fusion by adaptive compensation for feature uncertainty with application to audiovisual speech recognition}, author = { A Katsamanis and G Papandreou and V Pitsikalis and P Maragos}, url = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84862631884&partnerID=40&md5=ccaeee023c42f0923a6dcdec81ac7fdc}, issn = {22195491}, year = {2006}, date = {2006-01-01}, booktitle = {Proc. 14th European Signal Processing Conference (EUSIPCO-2006), Florence, Italy, Sep. 2006}, abstract = {In pattern recognition one usually relies on measuring a set of informative features to perform tasks such as classification. While the accuracy of feature measurements heavily depends on changing environmental conditions, studying the consequences of this fact has received relatively little attention to date. In this work we explicitly take into account uncertainty in feature measurements and we show in a rigorous probabilistic framework how the models used for classification should be adjusted to compensate for this effect. Our approach proves to be particularly fruitful in multimodal fusion scenarios, such as audio-visual speech recognition, where multiple streams of complementary features are integrated. For such applications, provided that an estimate of the measurement noise uncertainty for each feature stream is available, we show that the proposed framework leads to highly adaptive multimodal fusion rules which are widely applicable and easy to implement. 
We further show that previous multimodal fusion methods relying on stream weights fall under our scheme if certain assumptions hold; this provides novel insights into their applicability for various tasks and suggests new practical ways for estimating the stream weights adaptively. Preliminary experimental results in audio-visual speech recognition demonstrate the potential of our approach.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In pattern recognition one usually relies on measuring a set of informative features to perform tasks such as classification. While the accuracy of feature measurements heavily depends on changing environmental conditions, studying the consequences of this fact has received relatively little attention to date. In this work we explicitly take into account uncertainty in feature measurements and we show in a rigorous probabilistic framework how the models used for classification should be adjusted to compensate for this effect. Our approach proves to be particularly fruitful in multimodal fusion scenarios, such as audio-visual speech recognition, where multiple streams of complementary features are integrated. For such applications, provided that an estimate of the measurement noise uncertainty for each feature stream is available, we show that the proposed framework leads to highly adaptive multimodal fusion rules which are widely applicable and easy to implement. We further show that previous multimodal fusion methods relying on stream weights fall under our scheme if certain assumptions hold; this provides novel insights into their applicability for various tasks and suggests new practical ways for estimating the stream weights adaptively. Preliminary experimental results in audio-visual speech recognition demonstrate the potential of our approach. |
2002 |
D Dimitriadis, P Maragos, A Potamianos Modulation features for speech recognition Journal Article International Conference on Acoustics, 1 , pp. I–377–I–380, 2002. @article{76c, title = {Modulation features for speech recognition}, author = {D Dimitriadis and P Maragos and A Potamianos}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/DimitriadisMaragosPotamianos_RobustAMFM_Features_SpeechRecognition_ieeeSPL2005.pdf}, year = {2002}, date = {2002-01-01}, journal = {International Conference on Acoustics}, volume = {1}, pages = {I--377--I--380}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
D Dimitriadis, P Maragos, A Potamianos Modulation features for speech recognition Conference International Conference on Acoustics, 1 , 2002. @conference{253, title = {Modulation features for speech recognition}, author = { D Dimitriadis and P Maragos and A Potamianos}, year = {2002}, date = {2002-01-01}, booktitle = {International Conference on Acoustics}, volume = {1}, pages = {I--377--I--380}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
V Pitsikalis, P Maragos Speech analysis and feature extraction using chaotic models Conference International Conference on Acoustics, 1 , 2002. @conference{252, title = {Speech analysis and feature extraction using chaotic models}, author = { V Pitsikalis and P Maragos}, year = {2002}, date = {2002-01-01}, booktitle = {International Conference on Acoustics}, volume = {1}, pages = {I--533--I--536 vol.1}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Copyright Notice:
Some material presented is available for download to ensure timely dissemination of scholarly and technical work. Copyright and all rights therein are retained by authors or by other copyright holders. All persons copying this information are expected to adhere to the terms and constraints invoked by each author’s copyright. In most cases, these works may not be reposted without the explicit permission of the copyright holder.
The work already published by the IEEE is under its copyright. Personal use of such material is permitted. However, permission to reprint/republish the material for advertising or promotional purposes, or for creating new collective works for resale or redistribution to servers or lists, or to reuse any copyrighted component of the work in other works must be obtained from the IEEE.