

Institut de Recherche et d’Innovation en Cybersécurité et Société
Ameyoud, S. Mohamed; Allili, M. Saïd
Multi-modal malware classification with hierarchical consistency and saliency-constrained adversarial training Journal Article
In: Journal of Information Security and Applications, vol. 99, 2026, ISSN: 2214-2134.
@article{mohamed_ameyoud_multi-modal_2026,
title = {Multi-modal malware classification with hierarchical consistency and saliency-constrained adversarial training},
author = {S. Mohamed Ameyoud and M. Saïd Allili},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-105031186108&doi=10.1016%2Fj.jisa.2026.104429&partnerID=40&md5=2425da4ab40f9043ba4e67d223a1bdd9},
doi = {10.1016/j.jisa.2026.104429},
issn = {2214-2134},
year = {2026},
date = {2026-01-01},
journal = {Journal of Information Security and Applications},
volume = {99},
abstract = {The increasing complexity of malware, including polymorphic, obfuscated, and adversarial variants, continues to outpace the capabilities of detection systems. Here, we introduce a robust multi-modal hierarchical framework that jointly leverages visual and code-level semantics to enhance malware family and type classification. Our architecture fuses convolutional and transformer-based encoders to extract complementary representations from raw malware binaries and decompiled control-flow functions, enabling a rich, cross-modal understanding of malicious behavior. The classification pipeline follows a two-stage hierarchical protocol, where the predicted malware type informs the family-level classification. This enforces ontological consistency between type and family prediction levels. To further bolster robustness against adversarial and obfuscated malware, we integrate a novel adversarial training strategy that generates plausible perturbations guided by attention distributions. Evaluation on multiple large-scale benchmarks, including BODMAS, Malimg, Microsoft BIG 2015, and a curated set of samples from MalwareBazaar, demonstrates that our framework consistently outperforms state-of-the-art baselines, including ResNet, Swin Transformer, and MalBERTv2, across both malware type and family prediction tasks. Notably, our model exhibits outstanding generalization to unpacked, obfuscated, and previously unseen samples, with minimal performance degradation. It achieves accuracy gains of +3–6% over leading methods and exhibits superior resilience under adversarial threat models. These results highlight the effectiveness of hierarchical conditioning, adversarial robustness, and multi-modal fusion in tackling the evolving landscape of malware. The proposed framework thus offers a scalable and generalizable approach for next-generation malware classification in real-world cybersecurity environments. © 2026 Elsevier Ltd.},
keywords = {Adversarial training, Capability of detection, Classification (of information), Convolution, convolutional neural network, Convolutional neural networks, Detection system, Hierarchical consistency, Hierarchical systems, Malware, Malware classification, Malware classifications, Malware families, Malwares, Multi-modal, Multi-modal learning, Semantics, Vision transformer, Vision transformers},
pubstate = {published},
tppubtype = {article}
}
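
The two-stage protocol described in the abstract (predict the malware type first, then let that prediction constrain the family-level classifier) can be sketched as follows. This is a minimal illustration, not the authors' implementation: the type-to-family ontology map, the logit values, and the function names below are all hypothetical.

import numpy as np

# Hypothetical type -> family ontology; indices and groupings are made up.
TYPE_TO_FAMILIES = {
    0: [0, 1],   # e.g. type 0 ("trojan")     -> families {0, 1}
    1: [2, 3],   # e.g. type 1 ("ransomware") -> families {2, 3}
}

def softmax(z):
    z = z - z.max()                      # shift for numerical stability
    e = np.exp(z)
    return e / e.sum()

def hierarchical_predict(type_logits, family_logits):
    """Predict the malware type first, then restrict the family
    prediction to families consistent with that type."""
    t = int(np.argmax(softmax(type_logits)))
    mask = np.full_like(family_logits, -np.inf)
    mask[TYPE_TO_FAMILIES[t]] = 0.0      # keep only ontologically consistent families
    f = int(np.argmax(family_logits + mask))
    return t, f

# Toy logits: type 0 wins, so only families 0 and 1 remain eligible,
# and family 1 is chosen even though family 2 has the highest raw logit.
print(hierarchical_predict(np.array([2.0, 0.5]),
                           np.array([0.1, 0.3, 1.2, 0.8])))   # -> (0, 1)

Masking the family logits, rather than training two independent classifiers, is one simple way to realize the "predicted type informs the family-level classification" coupling the abstract describes.
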
Joudeh, I. O.; Cretu, A. -M.; Bouchard, S.; Guimond, S.
Prediction of Emotional States from Partial Facial Features for Virtual Reality Applications Journal Article
In: Annual Review of CyberTherapy and Telemedicine, vol. 21, no. 12, pp. 17–21, 2023, ISSN: 1554-8716, (Publisher: Interactive Media Institute).
@article{joudeh_prediction_2023,
title = {Prediction of Emotional States from Partial Facial Features for Virtual Reality Applications},
author = {I. O. Joudeh and A. -M. Cretu and S. Bouchard and S. Guimond},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85182471413&partnerID=40&md5=8190e0dbb5b48ae508515f4029b0a0d1},
doi = {10.3390/s23125613},
issn = {1554-8716},
year = {2023},
date = {2023-01-01},
journal = {Annual Review of CyberTherapy and Telemedicine},
volume = {21},
number = {12},
pages = {17–21},
publisher = {Interactive Media Institute},
abstract = {The availability of virtual reality (VR) in numerous clinical contexts has been made possible by recent technological advancements. One application is using VR for cognitive interventions with individuals who have mental disorders. Predicting the emotional states of users could help to prevent their discouragement during VR interventions. We can monitor the emotional states of individuals using sensors like an external camera, as they engage in various tasks within VR environments. The emotional state of VR users can be measured through arousal and valence, as per the Circumplex model. We used the Remote Collaborative and Affective Interactions (RECOLA) database of emotional behaviours. We processed video frames from 18 RECOLA videos. Due to the headset in VR systems, we detected faces and cropped the images of faces to use the lower half of the face only. We labeled the images with arousal and valence values to reflect various emotions. Convolutional neural networks (CNNs), specifically MobileNet-v2 and ResNet-18, were then used to predict arousal and valence values. MobileNet-v2 outperforms ResNet-18 as well as other models from the literature. We achieved a root mean squared error (RMSE), Pearson’s correlation coefficient (PCC), and concordance correlation coefficient (CCC) of 0.1495, 0.6387, and 0.6081 for arousal, and 0.0996, 0.6453, and 0.6232 for valence. Our work acts as a proof-of-concept for predicting emotional states from arousal and valence values via visual data of users immersed in VR experiences. In the future, predicted emotions could be used to automatically adjust the VR environment for individuals engaged in cognitive interventions. © 2023, Interactive Media Institute. All rights reserved.},
note = {Publisher: Interactive Media Institute},
keywords = {Arousal, article, clinical article, convolutional neural network, correlation coefficient, data base, emotion, facies, female, human, human experiment, Image processing, long short term memory network, male, random forest, residual neural network, root mean squared error, videorecording, virtual reality},
pubstate = {published},
tppubtype = {article}
}
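
The abstract reports three regression metrics per affect dimension: RMSE, Pearson's correlation coefficient (PCC), and the concordance correlation coefficient (CCC). Below is a minimal sketch of how these are commonly computed; the arrays are made-up placeholders, not RECOLA data or the paper's results.

import numpy as np

def rmse(y_true, y_pred):
    """Root mean squared error."""
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def pcc(y_true, y_pred):
    """Pearson's correlation coefficient."""
    return float(np.corrcoef(y_true, y_pred)[0, 1])

def ccc(y_true, y_pred):
    """Lin's concordance correlation coefficient: penalizes both
    decorrelation and mean/scale shifts between truth and prediction."""
    mu_t, mu_p = y_true.mean(), y_pred.mean()
    var_t, var_p = y_true.var(), y_pred.var()
    cov = np.mean((y_true - mu_t) * (y_pred - mu_p))
    return float(2 * cov / (var_t + var_p + (mu_t - mu_p) ** 2))

# Placeholder arousal annotations and predictions (illustrative only).
arousal_true = np.array([0.1, 0.4, -0.2, 0.3])
arousal_pred = np.array([0.2, 0.3, -0.1, 0.25])
print(rmse(arousal_true, arousal_pred),
      pcc(arousal_true, arousal_pred),
      ccc(arousal_true, arousal_pred))

Unlike PCC, CCC drops when predictions are correlated with the ground truth but biased or rescaled, which is why continuous affect work typically reports both.
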
Yapi, D.; Nouboukpo, A.; Allili, M. S.
Mixture of multivariate generalized Gaussians for multi-band texture modeling and representation Journal Article
In: Signal Processing, vol. 209, 2023, ISSN: 0165-1684, (Publisher: Elsevier B.V.).
@article{yapi_mixture_2023,
title = {Mixture of multivariate generalized Gaussians for multi-band texture modeling and representation},
author = {D. Yapi and A. Nouboukpo and M. S. Allili},
url = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85151300047&doi=10.1016%2fj.sigpro.2023.109011&partnerID=40&md5=3bf98e9667eb7b60cb3f59ed1dcb029c},
doi = {10.1016/j.sigpro.2023.109011},
issn = {0165-1684},
year = {2023},
date = {2023-01-01},
journal = {Signal Processing},
volume = {209},
publisher = {Elsevier B.V.},
abstract = {We present a unified statistical model for multivariate and multi-modal texture representation. This model is based on the formalism of finite mixtures of multivariate generalized Gaussians (MoMGG), which enables building a compact and accurate representation of texture images using multi-resolution texture transforms. The MoMGG model describes the joint statistics of subbands in different scales and orientations, as well as between adjacent locations within the same subband, providing a precise description of the texture layout. It can also combine different multi-scale transforms to build a richer and more representative texture signature for image similarity measurement. We tested our model on both traditional texture transforms (e.g., wavelets, contourlets, maximum response filter) and convolutional neural network (CNN) features (e.g., ResNet, SqueezeNet). Experiments on color-texture image retrieval have demonstrated the performance of our approach compared to state-of-the-art methods. © 2023},
note = {Publisher: Elsevier B.V.},
keywords = {Color texture retrieval, Content-based, Content-based color-texture retrieval, Convolution, convolutional neural network, Gaussians, Image retrieval, Image texture, Mixture of multivariate generalized gaussians, Multi-scale Decomposition, Subbands, Texture representation, Textures},
pubstate = {published},
tppubtype = {article}
}
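
The mixture density behind MoMGG can be sketched in a simplified, unnormalized form, assuming the common multivariate generalized Gaussian parameterization p(x) ∝ exp(-(½ (x-μ)ᵀ Σ⁻¹ (x-μ))^β), where β = 1 recovers the Gaussian case. The normalization constants and the fitting procedure from the paper are omitted, and all parameter values below are illustrative.

import numpy as np

def momgg_logpdf_unnorm(x, weights, mus, sigmas, betas):
    """Unnormalized log-density of x under a mixture of multivariate
    generalized Gaussians; each component has weight w, mean mu,
    covariance-like matrix sigma, and shape parameter beta."""
    comps = []
    for w, mu, sigma, beta in zip(weights, mus, sigmas, betas):
        d = x - mu
        m = 0.5 * d @ np.linalg.solve(sigma, d)   # Mahalanobis-type term
        comps.append(np.log(w) - m ** beta)       # beta shapes the tails
    return float(np.logaddexp.reduce(comps))      # log-sum-exp over components

# Illustrative two-component mixture on 2-D subband coefficients.
x = np.array([0.2, -0.1])
weights = [0.6, 0.4]
mus = [np.zeros(2), np.ones(2)]
sigmas = [np.eye(2), 0.5 * np.eye(2)]
betas = [0.8, 1.5]                                # heavy- and light-tailed components
print(momgg_logpdf_unnorm(x, weights, mus, sigmas, betas))

Letting β vary per component is what allows the mixture to capture the heavy-tailed coefficient statistics of wavelet and contourlet subbands that a plain Gaussian mixture fits poorly.
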



