% Publication list. Conventions used in this file:
%   - journal papers are "article" entries; conference papers are
%     "inproceedings" entries with the venue in "booktitle"
%   - author names are stored as "Last, First"
%   - page ranges use a double hyphen
%   - acronyms and proper terms in titles are brace-protected so that
%     sentence-casing styles do not lowercase them
%   - url values keep LaTeX escapes for ampersand and percent
% NOTE(review): "note" values (citation counts, conference codes) and the
% scopus.com URLs appear to come from a Scopus export -- confirm before
% relying on them as current.

@article{Longo201737,
  author    = {Longo, Francesco and Ghosh, Rahul and Naik, Vijay K. and Rindos, A. J. and Trivedi, Kishor},
  title     = {An approach for resiliency quantification of large scale systems},
  journal   = {Performance Evaluation Review},
  volume    = {44},
  number    = {4},
  pages     = {37--48},
  year      = {2017},
  publisher = {Association for Computing Machinery},
  issn      = {01635999},
  doi       = {10.1145/3092819.3092825},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85019853222\&doi=10.1145\%2f3092819.3092825\&partnerID=40\&md5=6266fffb7aad937fefad706e31fcd7da},
  note      = {cited By 0},
  keywords  = {Available capacity, Chains, Formal definition, Homogenous Markov chain, Infrastructure as a service (IaaS), Large scale systems, Markov processes, Non-homogeneous, Resiliency quantification, State-space models, State space methods, Stochastic models, Stochastic systems, Submodels, Transient behavior},
  abstract  = {We quantify the resiliency of large scale systems upon changes encountered beyond the normal system behavior. Formal definitions for resiliency and change are provided together with general steps for resiliency quantification and a set of resiliency metrics that can be used to quantify the effects of changes. A formalization of the approach is also shown in the form of a set of four algorithms that can be applied when large scale systems are modeled through stochastic analytic state space models (monolithic models or interacting sub-models). In particular, in the case of interacting submodels, since resiliency quantification involves understanding the transient behavior of the system, fixed-point variables evolve with time leading to non-homogenous Markov chains. At the best of our knowledge, this is the first paper facing this problem in a general way. The proposed approach is applied to an Infrastructure-As-A-Service (IaaS) Cloud use case. Specifically, we assess the impact of changes in demand and available capacity on the Cloud resiliency and we show that the approach proposed in this paper can scale for a real sized Cloud without significantly compromising the accuracy.}
}

% NOTE(review): the export only provides the LNCS series name as the venue
% for the next entry; replace "booktitle" with the actual proceedings title
% once confirmed.
@inproceedings{526,
  author    = {Scarpa, M. and Serrano, S.},
  title     = {A new modelling approach to represent the {DCF} mechanism of the {CSMA/CA} protocol},
  booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  series    = {LNCS},
  volume    = {10378},
  pages     = {181--195},
  year      = {2017},
  isbn      = {9783319614274},
  issn      = {03029743},
  doi       = {10.1007/978-3-319-61428-1_13},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85021219346\&doi=10.1007\%2f978-3-319-61428-1_13\&partnerID=40\&md5=0dbb7a4c879561f94ca4992e4bc69e18},
  keywords  = {Access methods, Carrier sense multiple access, Collision probability, DCF mechanisms, Markov processes, Markovian agents, Network-based, Retransmissions, Semi markov process, Standards, Stochastic systems, Transport properties, Wireless nodes},
  abstract  = {In this paper, a Markovian agent model is used to represent the behavior of wireless nodes based on CSMA/CA access method. This kind of network was usually modeled by means of bidimensional Markov Chains and more recently using semi-Markov process based models. Both these approaches are based on the assumptions of both full load network and independence of collision probability with respect to retransmission count of each packet. Our model inherently releases the latter hypothesis since it is not necessary to establish a constant collision probability at steady state. Here, we investigate the correctness of our approach analyzing the throughput of a network based on two IEEE 802.11g nodes when the amount of traffic sent by each one varies. Results have been compared with Omnet++ simulations and show the validity of the proposed model. {\textcopyright} Springer International Publishing AG 2017.}
}

@inproceedings{Ghosh2017227,
  author    = {Ghosh, Rahul and Longo, Francesco and Naik, Vijay K. and Rindos, A. J. and Trivedi, Kishor},
  title     = {Resiliency quantification for large scale systems: An {IaaS} cloud use case},
  booktitle = {ValueTools 2016 - 10th EAI International Conference on Performance Evaluation Methodologies and Tools},
  pages     = {227--234},
  year      = {2016},
  publisher = {Association for Computing Machinery},
  address   = {Taormina; Italy; 25-28 October 2016},
  isbn      = {9781631901416},
  doi       = {10.4108/eai.25-10-2016.2266805},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85021320622\&doi=10.4108\%2feai.25-10-2016.2266805\&partnerID=40\&md5=a6167b2554cad8a9aae02963254c8b52},
  note      = {cited By 0; Conference of 10th EAI International Conference on Performance Evaluation Methodologies and Tools, ValueTools 2016 ; Conference Date: 25 October 2016 Through 28 October 2016; Conference Code:127816},
  keywords  = {Available capacity, Chains, Clouds, Homogenous Markov chain, Impact of changes, Infrastructure as a service (IaaS), Large scale systems, Markov processes, Non-homogeneous, Numerical results, Resiliency, Submodels, Transient behavior},
  abstract  = {We quantify the resiliency of large scale systems upon changes encountered beyond the normal system behavior. General steps for resiliency quantification are shown and resiliency metrics are defined to quantify the effects of changes. The proposed approach is illustrated through an Infrastructure-as-a-Service (IaaS) Cloud use case. Specifically, we assess the impact of changes in demand and available capacity on the Cloud resiliency using interacting state-space based submodels. Since resiliency quantification involves understanding the transient behavior of the system, fixed-point variables evolve with time leading to non-homogenous Markov chains. In this paper, we present an algorithm for resiliency analysis when dealing with such non-homogenous sub-models. A comparison is shown with our past research, where we quantified the resiliency of IaaS Cloud performance using a one level monolithic model. Numerical results show that the approach proposed in this paper can scale for a real sized Cloud without significantly compromising the accuracy. Copyright {\textcopyright} 2016 EAI.}
}

@article{Longo2015280,
  author    = {Longo, Francesco and Distefano, Salvatore and Bruneo, Dario and Scarpa, Marco},
  title     = {Dependability modeling of {Software Defined Networking}},
  journal   = {Computer Networks},
  volume    = {83},
  pages     = {280--296},
  year      = {2015},
  publisher = {Elsevier},
  issn      = {13891286},
  doi       = {10.1016/j.comnet.2015.03.018},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-84946489290\&doi=10.1016\%2fj.comnet.2015.03.018\&partnerID=40\&md5=b4f32c89d2b7b79fefcaf97082764960},
  note      = {cited By 6},
  keywords  = {Availability, Complex networks, Controllers, Degree of distributions, Distributed components, Electric network topology, Information dissemination, Markov processes, Networking infrastructure, Non-Markovian, Quality control, Quality of service, Random processes, Reliability, Reliability and availability, Software defined networking (SDN), Software reliability, Software-defined networkings, Stochastic models, Stochastic systems, Topology, Type expansions},
  abstract  = {Software Defined Networking (SDN) is a new network design paradigm that aims at simplifying the implementation of complex networking infrastructures by separating the forwarding functionalities (data plane) from the network logical control (control plane). Network devices are used only for forwarding, while decisions about where data is sent are taken by a logically centralized yet physically distributed component, i.e., the SDN controller. From a quality of service (QoS) point of view, an SDN controller is a complex system whose operation can be highly dependent on a variety of parameters, e.g., its degree of distribution, the corresponding topology, the number of network devices to control, and so on. Dependability aspects are particularly critical in this context. In this work, we present a new analytical modeling technique that allows us to represent an SDN controller whose components are organized in a hierarchical topology, focusing on reliability and availability aspects and overcoming issues and limitations of Markovian models. In particular, our approach allows to capture changes in the operating conditions (e.g., in the number of managed devices) still allowing to represent the underlying phenomena through generally distributed events. The dependability of a use case on a two-layer hierarchical SDN control plane is investigated through the proposed technique providing numerical results to demonstrate the feasibility of the approach. {\textcopyright} 2015 Elsevier B.V.}
}

@article{Longo20151540,
  author    = {Longo, Francesco and Scarpa, Marco},
  title     = {Two-layer symbolic representation for stochastic models with phase-type distributed events},
  journal   = {International Journal of Systems Science},
  volume    = {46},
  number    = {9},
  pages     = {1540--1571},
  year      = {2015},
  publisher = {Taylor and Francis Ltd.},
  issn      = {00207721},
  doi       = {10.1080/00207721.2013.822940},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84928644988\&partnerID=40\&md5=3f853e6fcb8bddeebf2e890076ff798d},
  note      = {cited By 2},
  keywords  = {Decision diagram, Decision theory, efficient memory occupation, Many valued logics, Markov processes, Non-Markovian, Petri nets, phase type distributions, Stochastic models, Stochastic systems, symbolic representation},
  abstract  = {Among the techniques that have been proposed for the analysis of non-Markovian models, the state space expansion approach showed great flexibility in terms of modelling capacities. The principal drawback is the explosion of the state space. This paper proposes a two-layer symbolic method for efficiently storing the expanded reachability graph of a non-Markovian model in the case in which continuous phase-type distributions are associated with the firing times of system events, and different memory policies are considered. At the lower layer, the reachability graph is symbolically represented in the form of a set of Kronecker matrices, while, at the higher layer, all the information needed to correctly manage event memory is stored in a multi-terminal multi-valued decision diagram. Such an information is collected by applying a symbolic algorithm, which is based on a couple of theorems. The efficiency of the proposed approach, in terms of memory occupation and execution time, is shown by applying it to a set of non-Markovian stochastic Petri nets and comparing it with a classical explicit expansion algorithm. Moreover, a comparison with a classical symbolic approach is performed whenever possible. {\textcopyright} 2013 Taylor \& Francis.}
}

@article{Longo20152506,
  author    = {Longo, Francesco and Bruneo, Dario and Distefano, Salvatore and Scarpa, Marco},
  title     = {Variable operating conditions in distributed systems: Modeling and evaluation},
  journal   = {Concurrency and Computation: Practice and Experience},
  volume    = {27},
  number    = {10},
  pages     = {2506--2530},
  year      = {2015},
  publisher = {John Wiley and Sons Ltd},
  issn      = {15320626},
  doi       = {10.1002/cpe.3419},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84932604392\&partnerID=40\&md5=b8f197a50fa177e85f0c1ac2a93fe392},
  note      = {cited By 1},
  keywords  = {Algebra, cloud computing, Conservative systems, Dependability, Distributed computer systems, Internet, Internet of Things, Kronecker algebra, Markov processes, Non-Markovian, Performance, phase type distributions, Stochastic systems},
  abstract  = {Performance and dependability evaluation plays a key role in the design of a broad range of systems, especially when strict requirements need to be met. This is particularly challenging in distributed contexts, where several components may interact among themselves by influencing each other. In this paper, we present an analytical method that allows the study of a class of systems where different operating conditions alternate by changing the stochastic behavior of the system components but still preserving the continuity of the performance and dependability quantities to investigate. The proposed solution technique, based on phase type distributions, Kronecker algebra, and ad-hoc fitting algorithms, can be applied for the analytical evaluation of a wide class of distributed systems. Examples are provided to show the usefulness and the applicability of the methodology, characterizing and investigating different performance and dependability aspects of three distributed computing systems, that is, a connection-oriented network, an Internet of Things application, and an Infrastructure-as-a-Service Cloud. Copyright {\textcopyright} 2014 John Wiley \& Sons, Ltd.}
}

@article{Ghosh201457,
  author    = {Ghosh, Rahul and Longo, Francesco and Frattini, Flavio and Russo, Stefano and Trivedi, Kishor S.},
  title     = {Scalable analytics for {IaaS} cloud availability},
  journal   = {IEEE Transactions on Cloud Computing},
  volume    = {2},
  number    = {1},
  pages     = {57--70},
  year      = {2014},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  issn      = {21687161},
  doi       = {10.1109/TCC.2014.2310737},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84906976356\&partnerID=40\&md5=a63dc67c92ef7620c8a3b33aca08348b},
  note      = {cited By 5},
  keywords  = {Availability, availability analysis, cloud computing, Downtime, Existence of a solutions, Infrastructure as a service (IaaS), Iterative methods, Lakes, Maintenance, Markov processes, Model driven approach, Numeric solutions, Service level agreement (SLAs), simulation, Stochastic models, Stochastic reward nets, Stochastic systems},
  abstract  = {In a large Infrastructure-as-a-Service (IaaS) cloud, component failures are quite common. Such failures may lead to occasional system downtime and eventual violation of Service Level Agreements (SLAs) on the cloud service availability. The availability analysis of the underlying infrastructure is useful to the service provider to design a system capable of providing a defined SLA, as well as to evaluate the capabilities of an existing one. This paper presents a scalable, stochastic model-driven approach to quantify the availability of a large-scale IaaS cloud, where failures are typically dealt with through migration of physical machines among three pools: hot (running), warm (turned on, but not ready), and cold (turned off). Since monolithic models do not scale for large systems, we use an interacting Markov chain based approach to demonstrate the reduction in the complexity of analysis and the solution time. The three pools are modeled by interacting sub-models. Dependencies among them are resolved using fixed-point iteration, for which existence of a solution is proved. The analytic-numeric solutions obtained from the proposed approach and from the monolithic model are compared. We show that the errors introduced by interacting sub-models are insignificant and that our approach can handle very large size IaaS clouds. The simulative solution is also considered for the proposed model, and solution time of the methods are compared. {\textcopyright} 2014 IEEE.}
}

@article{Distefano20123701,
  author    = {Distefano, Salvatore and Longo, Francesco and Trivedi, Kishor S.},
  title     = {Investigating dynamic reliability and availability through state-space models},
  journal   = {Computers and Mathematics with Applications},
  volume    = {64},
  number    = {12},
  pages     = {3701--3716},
  year      = {2012},
  publisher = {Elsevier Ltd},
  issn      = {08981221},
  doi       = {10.1016/j.camwa.2012.02.038},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84870249108\&partnerID=40\&md5=9ecb143c24c7493a16292faa4f50d175},
  note      = {cited By 7},
  keywords  = {Availability, Common cause failure, Confidence interval, Dynamic aspects, Dynamic reliability, Load sharing, Markov processes, Markov regenerative process, Markovian, Non-Markovian, Quality of service, Quality standard, Quantitative assessments, Reliability, Reliability and availability, Reliability model, Renewal theory, Repair policy, Semi Markov model, State-space, State-space models, Stringent requirement},
  abstract  = {Quality standards impose increasingly stringent requirements and constraints on quality of service attributes and measures. As a consequence, aspects, phenomena, and behaviors, hitherto approximated or neglected, have to be taken into account in quantitative assessment in order to provide adequate measures satisfying smaller and smaller confidence intervals and tolerances. With specific regards to reliability and availability, this means that interferences and dependencies involving the components of a system can no longer be neglected. Therefore, in order to support such a trend, specific techniques and tools are required to adequately deal with dynamic aspects in reliability and availability assessment. The main goal of this paper is to demonstrate how state-space based techniques can satisfy such a demand. For this purpose some examples of specific dynamic reliability behaviors, such as common cause failure and load sharing, are considered applying state-space based techniques to study the corresponding reliability models. Different repair policies in availability contexts are also explored. Both Markovian and non-Markovian models are studied via phase type expansion and renewal theory in order to adequately represent and evaluate the considered dynamic reliability aspects in case of generally distributed lifetimes and times to repair. {\textcopyright} 2012 Elsevier Ltd. All rights reserved.}
}

@inproceedings{Bruneo20128,
  author    = {Bruneo, Dario and Longo, Francesco and Puliafito, Antonio and Scarpa, Marco and Distefano, Salvatore},
  title     = {Software rejuvenation in the cloud},
  booktitle = {Proceedings of the 5th International Conference on Simulation Tools and Techniques (SIMUTools)},
  pages     = {8--16},
  year      = {2012},
  publisher = {ICST},
  address   = {Desenzano del Garda, Italy, 19-23 March 2012},
  isbn      = {9781450315104},
  doi       = {10.4108/icst.simutools.2012.247772},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84922767370\&partnerID=40\&md5=a03b06eec703c94d55c0f4875f181902},
  note      = {cited By 0; Conference of 5th International Conference on Simulation Tools and Techniques, SIMUTools 2012 ; Conference Date: 19 March 2012 Through 23 March 2012; Conference Code:110134},
  keywords  = {Availability, cloud computing, Continuous phase type distributions, Continuous time Markov chain, Continuous time systems, Endocrinology, Java programming language, Markov processes, phase type distributions, Rejuvenation, Reliability, Reliability principles, Software rejuvenation, Virtual machine monitors, Virtual reality, Virtualized environment},
  abstract  = {In this paper, we investigate how software rejuvenation can be used in a Cloud environment to increase the availability of a virtualized system composed of a single virtual machine monitor (VMM) on top of which a certain number of virtual machines (VMs) can be instantiated. We start from the assumption that the aging of a VMM increases with the number of VMs it is managing, thus characterizing the problem in terms of dynamic reliability. Therefore, by identifying the age of the VMM with its reliability and based on the conservation of reliability principle, we characterize the time to failure of the VMM through continuous phase type distributions. The system availability is thus modeled by an expanded continuous time Markov chain expressed in terms of Kronecker algebra in order to face the state space explosion and to keep memory of the age reached by the VMM in case the number of the hosted VMs change. Time-based rejuvenation is taken into consideration and the optimal timer is evaluated in order to maximize the VMM availability. Copyright {\textcopyright} 2012 ICST.}
}

@inproceedings{Longo2011335,
  author    = {Longo, Francesco and Ghosh, Rahul and Naik, Vijay K. and Trivedi, Kishor S.},
  title     = {A scalable availability model for {Infrastructure-as-a-Service} cloud},
  booktitle = {Proceedings of the 41st IEEE/IFIP International Conference on Dependable Systems and Networks (DSN)},
  pages     = {335--346},
  year      = {2011},
  publisher = {IEEE Computer Society},
  address   = {Hong Kong, Hong Kong, 27-30 June 2011},
  isbn      = {9781424492336},
  doi       = {10.1109/DSN.2011.5958247},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-80051928903\&partnerID=40\&md5=37f3360476d39837acf5098ca20408c7},
  note      = {cited By 33; Conference of 2011 IEEE/IFIP 41st International Conference on Dependable Systems and Networks, DSN 2011 ; Conference Date: 27 June 2011 Through 30 June 2011; Conference Code:86090},
  keywords  = {Analytic models, availability analysis, Closed form solutions, Fixed-point iterations, High availability, Key characteristics, Large sizes, Markov Chain, Markov model, Markov processes, Petri nets, Scalability, Scalable methods, Solution time, Stochastic reward nets, Submodels},
  abstract  = {High availability is one of the key characteristics of Infrastructure-as-a-Service (IaaS) cloud. In this paper, we show a scalable method for availability analysis of large scale IaaS cloud using analytic models. To reduce the complexity of analysis and the solution time, we use an interacting Markov chain based approach. The construction and the solution of the Markov chains is facilitated by the use of a high-level Petri net based paradigm known as stochastic reward net (SRN). Overall solution is composed by iteration over individual SRN sub-model solutions. Dependencies among the sub-models are resolved using fixed-point iteration, for which existence of a solution is proved. We compare the solution obtained from the interacting sub-models with a monolithic model and show that errors introduced by decomposition are insignificant. Additionally, we provide closed form solutions of the sub-models and show that our approach can handle very large size IaaS clouds. {\textcopyright} 2011 IEEE.}
}

@inproceedings{Distefano2010265,
  author    = {Distefano, Salvatore and Longo, Francesco and Scarpa, Marco},
  title     = {Availability assessment of {HA} standby redundant clusters},
  booktitle = {Proceedings of the 29th IEEE Symposium on Reliable Distributed Systems (SRDS)},
  pages     = {265--274},
  year      = {2010},
  publisher = {IEEE Computer Society},
  address   = {New Delhi, India, 31 October - 3 November 2010},
  isbn      = {9780769542508},
  issn      = {10609857},
  doi       = {10.1109/SRDS.2010.37},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-78650560407\&partnerID=40\&md5=9666fbfeaaae60d0a7d7ca8f388600e7},
  note      = {cited By 14; Conference of 29th IEEE Symposium on Reliable Distributed Systems, SRDS 2010 ; Conference Date: 31 October 2010 Through 3 November 2010; Conference Code:82914},
  keywords  = {Aging process, Algebra, Availability assessment, Computing system, Critical tasks, Dynamic reliability, Engineering perspective, High availability, High-availability clusters, Human interactions, Kronecker algebra, Maintainability, Maintenance strategies, Markov Chain, Markov processes, Modern technologies, phase type distributions, Possible solutions, Quality assurance, Redundancy, Redundant system, Standby redundancy, State-space explosion, System availability, Wear and tear},
  abstract  = {Computing systems are becoming the heart of modern technology, implementing critical tasks usually demanded to and implying human interactions. This highlights the problem of dependability in computer science contexts. High availability computing/clusters is a possible solution in such cases, implementing standby redundancy as a tradeoff between dependability and costs. From the engineering perspective, this implies the use of specific techniques and tools for adequately evaluating the reliability/availability of high availability clusters, also taking into account dependencies among nodes (standby, repair, etc.) and the effect of wear and tear into such nodes, especially when failure and repair times are not exponentially distributed. The solution proposed in this paper is based on the use of phase type distributions and Kronecker algebra. In fact, we represent the reliability and maintainability of each component by specific phase type distributions, whose interactions describe the system availability. This latter is thus modeled by an expanded Markov chain expressed in terms of Kronecker algebra in order to face the state space explosion problem of expansion techniques and to represent the memory policies related to the aging process. More specifically, the paper firstly details the technique and then applies it to the evaluation of a standby redundant system representing a high availability cluster taken as example with the aim of demonstrating its effectiveness. Moreover, in order to show the potentiality of the technique, different maintenance strategies are evaluated and therefore compared. {\textcopyright} 2010 IEEE.}
}

@inproceedings{Distefano201045,
  author    = {Distefano, Salvatore and Longo, Francesco and Scarpa, Marco},
  title     = {Symbolic representation techniques in dynamic reliability evaluation},
  booktitle = {Proceedings of the 12th IEEE International Symposium on High Assurance Systems Engineering (HASE)},
  pages     = {45--53},
  year      = {2010},
  publisher = {IEEE Computer Society},
  address   = {San Jose, CA, United States, 3-4 November 2010},
  isbn      = {9780769542928},
  issn      = {15302059},
  doi       = {10.1109/HASE.2010.28},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-79951924261\&partnerID=40\&md5=8ed2be2a9f6a7e9d891ed3fb58789d6f},
  note      = {cited By 9; Conference of 2010 IEEE 12th International Symposium on High Assurance Systems Engineering, HASE 2010 ; Conference Date: 3 November 2010 Through 4 November 2010; Conference Code:83929},
  keywords  = {Aging process, Algebra, Complex systems, Continuous phase, Dynamic reliability, Kronecker algebra, Markov Chain, Markov processes, phase type distributions, Quality assurance, Reliability, State-space explosion, symbolic representation, System components, System reliability, Systems engineering, Time to failure, Two component systems},
  abstract  = {The increasing demand of quality presses towards more specific requirements, tighter constraints, and higher standards. It is thus necessary to provide new paradigms, techniques, and tools to adequately model and evaluate complex systems. This paper mainly focuses on reliability aspects, also taking into account dynamic-dependent interactions among components. Starting from the conservation of reliability principle, we characterize the time to failure of the system components through continuous phase type distributions. The system reliability is thus modeled by an expanded Markov chain expressed in terms of Kronecker algebra in order to face the state space explosion and to represent the memory policies related to the aging process. A two-component system is taken as example to demonstrate the effectiveness of the technique and to validate it. {\textcopyright} 2010 IEEE.}
}