(+30) 210-772-2420
- pan@none.gr
- Office 2.2.19
Publications
2019 |
Panagiotis Giannoulis, Gerasimos Potamianos, Petros Maragos Room-localized speech activity detection in multi-microphone smart homes Journal Article EURASIP Journal on Audio, Speech, and Music Processing, 2019 (1), pp. 15, 2019, ISSN: 1687-4722. Abstract | BibTeX | Links: [PDF] @article{Giannoulis2019, title = {Room-localized speech activity detection in multi-microphone smart homes}, author = {Panagiotis Giannoulis and Gerasimos Potamianos and Petros Maragos}, url = {http://robotics.ntua.gr/wp-content/uploads/sites/2/2019_GiannoulisEtAl_RoomlocalizedSAD-MultiMicrophoneSmartHomes_EURASIP-JASM.pdf}, doi = {10.1186/s13636-019-0158-8}, issn = {1687-4722}, year = {2019}, date = {2019-01-01}, journal = {EURASIP Journal on Audio, Speech, and Music Processing}, volume = {2019}, number = {1}, pages = {15}, abstract = {Voice-enabled interaction systems in domestic environments have attracted significant interest recently, being the focus of smart home research projects and commercial voice assistant home devices. Within the multi-module pipelines of such systems, speech activity detection (SAD) constitutes a crucial component, providing input to their activation and speech recognition subsystems. In typical multi-room domestic environments, SAD may also convey spatial intelligence to the interaction, in addition to its traditional temporal segmentation output, by assigning speech activity at the room level. Such room-localized SAD can, for example, disambiguate user command referents, allow localized system feedback, and enable parallel voice interaction sessions by multiple subjects in different rooms. In this paper, we investigate a room-localized SAD system for smart homes equipped with multiple microphones distributed in multiple rooms, significantly extending our earlier work. The system employs a two-stage algorithm, incorporating a set of hand-crafted features specially designed to discriminate room-inside vs. 
room-outside speech at its second stage, refining SAD hypotheses obtained at its first stage by traditional statistical modeling and acoustic front-end processing. Both algorithmic stages exploit multi-microphone information, combining it at the signal, feature, or decision level. The proposed approach is extensively evaluated on both simulated and real data recorded in a multi-room, multi-microphone smart home, significantly outperforming alternative baselines. Further, it remains robust to reduced microphone setups, while also comparing favorably to deep learning-based alternatives.}, keywords = {}, pubstate = {published}, tppubtype = {article} } Voice-enabled interaction systems in domestic environments have attracted significant interest recently, being the focus of smart home research projects and commercial voice assistant home devices. Within the multi-module pipelines of such systems, speech activity detection (SAD) constitutes a crucial component, providing input to their activation and speech recognition subsystems. In typical multi-room domestic environments, SAD may also convey spatial intelligence to the interaction, in addition to its traditional temporal segmentation output, by assigning speech activity at the room level. Such room-localized SAD can, for example, disambiguate user command referents, allow localized system feedback, and enable parallel voice interaction sessions by multiple subjects in different rooms. In this paper, we investigate a room-localized SAD system for smart homes equipped with multiple microphones distributed in multiple rooms, significantly extending our earlier work. The system employs a two-stage algorithm, incorporating a set of hand-crafted features specially designed to discriminate room-inside vs. room-outside speech at its second stage, refining SAD hypotheses obtained at its first stage by traditional statistical modeling and acoustic front-end processing. 
Both algorithmic stages exploit multi-microphone information, combining it at the signal, feature, or decision level. The proposed approach is extensively evaluated on both simulated and real data recorded in a multi-room, multi-microphone smart home, significantly outperforming alternative baselines. Further, it remains robust to reduced microphone setups, while also comparing favorably to deep learning-based alternatives. |
2017 |
A C Dometios, A Tsiami, A Arvanitakis, P Giannoulis, X S Papageorgiou, C S Tzafestas, P Maragos Integrated Speech-based Perception System for User Adaptive Robot Motion Planning in Assistive Bath Scenarios Conference Proc. of the 25th European Signal Processing Conference - Workshop: "MultiLearn 2017 - Multimodal processing, modeling and learning for human-computer/robot interaction applications", Kos, Greece, 2017. Abstract | BibTeX | Links: [PDF] @conference{DTAGPTM17, title = {Integrated Speech-based Perception System for User Adaptive Robot Motion Planning in Assistive Bath Scenarios}, author = {A C Dometios and A Tsiami and A Arvanitakis and P Giannoulis and X S Papageorgiou and C S Tzafestas and P Maragos}, url = {http://www.eurasip.org/Proceedings/Eusipco/Eusipco2017/wpapers/ML5.pdf}, year = {2017}, date = {2017-09-01}, booktitle = {Proc. of the 25th European Signal Processing Conference - Workshop: "MultiLearn 2017 - Multimodal processing, modeling and learning for human-computer/robot interaction applications"}, address = {Kos, Greece}, abstract = {Elderly people have augmented needs in performing bathing activities, since these tasks require body flexibility. Our aim is to build an assistive robotic bath system, in order to increase the independence and safety of this procedure. Towards this end, the expertise of professional carers for bathing sequences and appropriate motions have to be adopted, in order to achieve natural, physical human - robot interaction. The integration of the communication and verbal interaction between the user and the robot during the bathing tasks is a key issue for such a challenging assistive robotic application. In this paper, we tackle this challenge by developing a novel integrated real-time speech-based perception system, which will provide the necessary assistance to the frail senior citizens. This system can be suitable for installation and use in conventional home or hospital bathroom space. 
We employ both a speech recognition system with sub-modules to achieve a smooth and robust human-system communication and a low cost depth camera for end-effector motion planning. With a variety of spoken commands, the system can be adapted to the user’s needs and preferences. The instructed by the user washing commands are executed by a robotic manipulator, demonstrating the progress of each task. The smooth integration of all subsystems is accomplished by a modular and hierarchical decision architecture organized as a Behavior Tree. The system was experimentally tested by successful execution of scenarios from different users with different preferences.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Elderly people have augmented needs in performing bathing activities, since these tasks require body flexibility. Our aim is to build an assistive robotic bath system, in order to increase the independence and safety of this procedure. Towards this end, the expertise of professional carers for bathing sequences and appropriate motions have to be adopted, in order to achieve natural, physical human - robot interaction. The integration of the communication and verbal interaction between the user and the robot during the bathing tasks is a key issue for such a challenging assistive robotic application. In this paper, we tackle this challenge by developing a novel integrated real-time speech-based perception system, which will provide the necessary assistance to the frail senior citizens. This system can be suitable for installation and use in conventional home or hospital bathroom space. We employ both a speech recognition system with sub-modules to achieve a smooth and robust human-system communication and a low cost depth camera for end-effector motion planning. With a variety of spoken commands, the system can be adapted to the user’s needs and preferences. 
The instructed by the user washing commands are executed by a robotic manipulator, demonstrating the progress of each task. The smooth integration of all subsystems is accomplished by a modular and hierarchical decision architecture organized as a Behavior Tree. The system was experimentally tested by successful execution of scenarios from different users with different preferences. |
2015 |
Panagiotis Giannoulis, Alessio Brutti, Marco Matassoni, Alberto Abad, Athanasios Katsamanis, Miguel Matos, Gerasimos Potamianos, Petros Maragos, Fondazione Bruno Kessler MULTI-ROOM SPEECH ACTIVITY DETECTION USING A DISTRIBUTED MICROPHONE NETWORK IN DOMESTIC ENVIRONMENTS Conference Proc. European Signal Processing Conf. (EUSIPCO-2015), Nice, France, Sep. 2015, 2015, ISBN: 9780992862633. @conference{306, title = {MULTI-ROOM SPEECH ACTIVITY DETECTION USING A DISTRIBUTED MICROPHONE NETWORK IN DOMESTIC ENVIRONMENTS}, author = { Panagiotis Giannoulis and Alessio Brutti and Marco Matassoni and Alberto Abad and Athanasios Katsamanis and Miguel Matos and Gerasimos Potamianos and Petros Maragos and {Fondazione Bruno Kessler}}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/GiannoulisEtAl_MultiRoomSpeechActivityDetection_EUSIPCO2015_crf.pdf}, isbn = {9780992862633}, year = {2015}, date = {2015-01-01}, booktitle = {Proc. European Signal Processing Conf. (EUSIPCO-2015), Nice, France, Sep. 2015}, pages = {1281--1285}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
2014 |
Antigoni Tsiami, Isidoros Rodomagoulakis, Panagiotis Giannoulis, Athanasios Katsamanis, Gerasimos Potamianos, Petros Maragos ATHENA: A Greek Multi-Sensory Database for Home Automation Control Conference Proc. Int'l Conf. on Speech Communication and Technology (INTERSPEECH), Singapore, 2014. Abstract | BibTeX | Links: [PDF] @conference{tsiami2014athena, title = {ATHENA: A Greek Multi-Sensory Database for Home Automation Control}, author = {Antigoni Tsiami and Isidoros Rodomagoulakis and Panagiotis Giannoulis and Athanasios Katsamanis and Gerasimos Potamianos and Petros Maragos}, url = {http://robotics.ntua.gr/wp-content/publications/Tsiami+_AthenaDatabase_INTERSPEECH2014.pdf}, year = {2014}, date = {2014-09-01}, booktitle = {Proc. Int'l Conf. on Speech Communication and Technology (INTERSPEECH)}, pages = {1608--1612}, address = {Singapore}, abstract = {In this paper we present a Greek speech database with real multi-modal data in a smart home two-room environment. In total, 20 speakers were recorded in 240 one-minute long sessions. The recordings include utterances of activation keywords and commands for home automation control, but also phonetically rich sentences and conversational speech. Audio, speaker movements and gestures were captured by 20 condenser microphones installed on the walls and ceiling, 6 MEMS microphones, 2 close-talk microphones and one Kinect camera. The new publicly available database exhibits adverse noise conditions because of background noises and acoustic events performed during the recordings to better approximate a realistic everyday home scenario. Thus, it is suitable for experimentation on voice activity and event detection, source localization, speech enhancement and far-field speech recognition. 
We present the details of the corpus as well as baseline results on multi-channel voice activity detection and spoken command recognition.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In this paper we present a Greek speech database with real multi-modal data in a smart home two-room environment. In total, 20 speakers were recorded in 240 one-minute long sessions. The recordings include utterances of activation keywords and commands for home automation control, but also phonetically rich sentences and conversational speech. Audio, speaker movements and gestures were captured by 20 condenser microphones installed on the walls and ceiling, 6 MEMS microphones, 2 close-talk microphones and one Kinect camera. The new publicly available database exhibits adverse noise conditions because of background noises and acoustic events performed during the recordings to better approximate a realistic everyday home scenario. Thus, it is suitable for experimentation on voice activity and event detection, source localization, speech enhancement and far-field speech recognition. We present the details of the corpus as well as baseline results on multi-channel voice activity detection and spoken command recognition. |
Panagiotis Giannoulis, Gerasimos Potamianos, Athanasios Katsamanis, Petros Maragos Multi-microphone fusion for detection of speech and acoustic events in smart spaces Conference European Signal Processing Conference, 2014, ISSN: 22195491. Abstract | BibTeX | Links: [PDF] @conference{168, title = {Multi-microphone fusion for detection of speech and acoustic events in smart spaces}, author = { Panagiotis Giannoulis and Gerasimos Potamianos and Athanasios Katsamanis and Petros Maragos}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/GiannoulisEtAl_MultimicrFusionDetectionSpeechEventsSmartspaces_EUSIPCO2014.pdf}, issn = {22195491}, year = {2014}, date = {2014-01-01}, booktitle = {European Signal Processing Conference}, pages = {2375--2379}, abstract = {In this paper, we examine the challenging problem of detecting acoustic events and voice activity in smart indoors environments, equipped with multiple microphones. In particular, we focus on channel combination strategies, aiming to take advantage of the multiple microphones installed in the smart space, capturing the potentially noisy acoustic scene from the far-field. We propose various such approaches that can be formulated as fusion at the signal, feature, or at the decision level, as well as combinations of the above, also including multi-channel training. We apply our methods on two multi-microphone databases: (a) one recorded inside a small meeting room, containing twelve classes of isolated acoustic events; and (b) a speech corpus containing interfering noise sources, simulated inside a smart home with multiple rooms. 
Our multi-channel approaches demonstrate significant improvements, reaching relative error reductions over a single-channel baseline of 9.3% and 44.8% in the two datasets, respectively.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } In this paper, we examine the challenging problem of detecting acoustic events and voice activity in smart indoors environments, equipped with multiple microphones. In particular, we focus on channel combination strategies, aiming to take advantage of the multiple microphones installed in the smart space, capturing the potentially noisy acoustic scene from the far-field. We propose various such approaches that can be formulated as fusion at the signal, feature, or at the decision level, as well as combinations of the above, also including multi-channel training. We apply our methods on two multi-microphone databases: (a) one recorded inside a small meeting room, containing twelve classes of isolated acoustic events; and (b) a speech corpus containing interfering noise sources, simulated inside a smart home with multiple rooms. Our multi-channel approaches demonstrate significant improvements, reaching relative error reductions over a single-channel baseline of 9.3% and 44.8% in the two datasets, respectively. |
2013 |
I. Rodomagoulakis, P. Giannoulis, Z. I. Skordilis, P. Maragos, G. Potamianos Experiments on far-field multichannel speech processing in smart homes Conference 2013 18th International Conference on Digital Signal Processing, DSP 2013, 2013, ISBN: 9781467358057. @conference{175, title = {Experiments on far-field multichannel speech processing in smart homes}, author = { I. Rodomagoulakis and P. Giannoulis and Z. I. Skordilis and P. Maragos and G. Potamianos}, url = {http://robotics.ntua.gr/wp-content/uploads/publications/RGSMP_ExperimsFarfieldMultichannelSpeechProcessSmartHomes_DSP2013.pdf}, doi = {10.1109/ICDSP.2013.6622707}, isbn = {9781467358057}, year = {2013}, date = {2013-01-01}, booktitle = {2013 18th International Conference on Digital Signal Processing, DSP 2013}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |