% Conference paper: Petri Nets 2015, published in LNCS volume 9115.
@inproceedings{Bruneo201598,
  title     = {An {SRN}-based resiliency quantification approach},
  booktitle = {Proceedings of the 36th International Conference on Application and Theory of Petri Nets and Concurrency (Petri Nets)},
  series    = {Lecture Notes in Computer Science},
  volume    = {9115},
  year      = {2015},
  note      = {cited By 0; Conference of 36th International Conference on Application and Theory of Petri Nets and Concurrency, Petri Nets 2015 ; Conference Date: 21 June 2015 Through 26 June 2015; Conference Code:119609},
  pages     = {98--116},
  publisher = {Springer Verlag},
  address   = {Brussels, Belgium, 21-26 June 2015},
  abstract  = {

Resiliency is often considered as a synonym for fault-tolerance and reliability/availability. We start from a different definition of resiliency as the ability to deliver services when encountering unexpected changes. Semantics of change is of extreme importance in order to accurately capture the real behavior of a system. We propose a resiliency analysis technique based on stochastic reward nets that allows the modeler: (1) to reuse an already existing dependability or performance model for a specific system with minimal modifications, and (2) to adapt the given model for specific change semantics. To automate the model analysis an algorithm is designed and the modeler is provided with a formalism that corresponds to the semantics. Our algorithm and approach is implemented to demonstrate the proposed resiliency quantification approach. Finally, we discuss the differences between our approach and an alternative technique based on deterministic and stochastic Petri nets and highlight the advantages of the proposed approach in terms of semantics specification. {\textcopyright} Springer International Publishing Switzerland 2015.

},
  keywords  = {Analysis techniques, Deterministic and stochastic Petri nets, Model analysis, Performance Model, Petri nets, Resiliency, Semantics, Stochastic models, Stochastic reward nets, Stochastic systems},
  isbn      = {9783319194875},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-319-19488-2_5},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84937428490\&partnerID=40\&md5=7d0bbe99afba1a79df65b4e35e86a02c},
  author    = {Bruneo, Dario and Longo, Francesco and Scarpa, Marco and Puliafito, Antonio and Ghosh, Rahul and Trivedi, Kishor S.}
}

% Conference paper: EPEW 2014, published in LNCS volume 8721.
@inproceedings{Distefano2014255,
  title     = {Non-{Markovian} modeling of a {BladeCenter} chassis midplane},
  booktitle = {Proceedings of the 11th European Workshop on Computer Performance Engineering (EPEW)},
  series    = {Lecture Notes in Computer Science},
  volume    = {8721},
  year      = {2014},
  note      = {cited By 0; Conference of 11th European Workshop on Computer Performance Engineering, EPEW 2014 ; Conference Date: 11 September 2014 Through 12 September 2014; Conference Code:107431},
  pages     = {255--269},
  publisher = {Springer Verlag},
  address   = {Florence, Italy, 11-12 September 2014},
  abstract  = {

In distributed contexts such as Cloud computing, the reliability and availability of the provided resources and services have to be assured in order to meet user requirements. At the infrastructure level, this specification is translated into tighter ones on the datacenter hosting physical resources. In this paper, starting from a real case study of the IBM BladeCenter, we provide a technique for the quantitative evaluation of datacenter infrastructure availability. The proposed technique allows one to take into account both aging phenomena and multiple operating conditions. In particular, one subsystem of the BladeCenter, the chassis midplane, is studied. Indeed, based on the stochastic characterization of the midplane reliability through statistic measurements, a model dealing with the non-exponential failure time distribution thus obtained is evaluated to demonstrate the suitability and the effectiveness of the proposed technique. {\textcopyright} 2014 Springer International Publishing.

},
  keywords  = {Aging phenomena, Chassis, Failure-time distribution, Non-Markovian modeling, Operating condition, Physical resources, Quantitative evaluation, Reliability and availability, Stochastic models, Stochastic systems, User requirements},
  isbn      = {9783319108841},
  issn      = {0302-9743},
  doi       = {10.1007/978-3-319-10885-8_18},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84906969232\&partnerID=40\&md5=6840d4a28b6ed80a602f4db06c10343b},
  author    = {Distefano, Salvatore and Longo, Francesco and Scarpa, Marco and Trivedi, Kishor S.}
}

% Journal article: IEEE Transactions on Cloud Computing, vol. 2, no. 1.
@article{Ghosh201457,
  title     = {Scalable analytics for {IaaS} cloud availability},
  journal   = {IEEE Transactions on Cloud Computing},
  volume    = {2},
  number    = {1},
  year      = {2014},
  note      = {cited By 5},
  pages     = {57--70},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  abstract  = {

In a large Infrastructure-as-a-Service (IaaS) cloud, component failures are quite common. Such failures may lead to occasional system downtime and eventual violation of Service Level Agreements (SLAs) on the cloud service availability. The availability analysis of the underlying infrastructure is useful to the service provider to design a system capable of providing a defined SLA, as well as to evaluate the capabilities of an existing one. This paper presents a scalable, stochastic model-driven approach to quantify the availability of a large-scale IaaS cloud, where failures are typically dealt with through migration of physical machines among three pools: hot (running), warm (turned on, but not ready), and cold (turned off). Since monolithic models do not scale for large systems, we use an interacting Markov chain based approach to demonstrate the reduction in the complexity of analysis and the solution time. The three pools are modeled by interacting sub-models. Dependencies among them are resolved using fixed-point iteration, for which existence of a solution is proved. The analytic-numeric solutions obtained from the proposed approach and from the monolithic model are compared. We show that the errors introduced by interacting sub-models are insignificant and that our approach can handle very large size IaaS clouds. The simulative solution is also considered for the proposed model, and solution time of the methods are compared. {\textcopyright} 2014 IEEE.

},
  keywords  = {Availability, availability analysis, cloud computing, Downtime, Existence of a solutions, Infrastructure as a service (IaaS), Iterative methods, Lakes, Maintenance, Markov processes, Model driven approach, Numeric solutions, Service level agreement (SLAs), simulation, Stochastic models, Stochastic reward nets, Stochastic systems},
  issn      = {2168-7161},
  doi       = {10.1109/TCC.2014.2310737},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84906976356\&partnerID=40\&md5=a63dc67c92ef7620c8a3b33aca08348b},
  author    = {Ghosh, Rahul and Longo, Francesco and Frattini, Flavio and Russo, Stefano and Trivedi, Kishor S.}
}

% Journal article: IEEE Transactions on Services Computing, vol. 7, no. 4.
@article{Ghosh2014667,
  title     = {Stochastic model driven capacity planning for an infrastructure-as-a-service cloud},
  journal   = {IEEE Transactions on Services Computing},
  volume    = {7},
  number    = {4},
  year      = {2014},
  note      = {cited By 1},
  pages     = {667--680},
  publisher = {Institute of Electrical and Electronics Engineers},
  abstract  = {

From an enterprise perspective, one key motivation to transform the traditional IT management into Cloud is the cost reduction of the hosted services. In an Infrastructure-as-a-Service (IaaS) Cloud, virtual machine (VM) instances share the physical machines (PMs) in the provider{\textquoteright}s data center. With large number of PMs, providers can maintain low cost of service downtime at the expense of higher infrastructure and other operational costs (e.g., power consumption and cooling costs). Hence, determining the optimal PM capacity requirements that minimize the overall cost is of interest. In this paper, we show how a cost analysis and optimization framework can be developed using stochastic availability and performance models of an IaaS Cloud. Specifically, we study two cost minimization problems to address the capacity planning in an IaaS Cloud: 1) what is the optimal number of PMs that minimizes the total cost of ownership for a given downtime requirement set by service level agreements? and, 2) is it more economical to use cheaper but less reliable PMs or to use costlier but more reliable PMs for insuring the same availability characteristics? We use simulated annealing, a well-known stochastic search algorithm, to solve these optimization problems. Results from our analysis show that the optimal solutions are found within reasonable time. {\textcopyright} 2013 IEEE.

},
  keywords  = {Capacity planning, Capacity requirement, Clouds, Cost benefit analysis, Cost reduction, Downtime, Infrastructure as a service (IaaS), Maintenance, Optimization, Optimization framework, Optimization problems, Service Level Agreements, Simulated annealing, Stochastic models, Stochastic search algorithms, Stochastic systems, Total cost of ownership},
  issn      = {1939-1374},
  doi       = {10.1109/TSC.2013.44},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84919717849\&partnerID=40\&md5=34c70f14a9a3f838562335050a628985},
  author    = {Ghosh, Rahul and Longo, Francesco and Xia, Ruofan and Naik, Vijay K. and Trivedi, Kishor S.}
}

% Journal article: Future Generation Computer Systems, vol. 29, no. 5.
@article{Ghosh20131216,
  title     = {Modeling and performance analysis of large scale {IaaS} clouds},
  journal   = {Future Generation Computer Systems},
  volume    = {29},
  number    = {5},
  year      = {2013},
  note      = {cited By 10},
  pages     = {1216--1234},
  publisher = {Elsevier},
  abstract  = {

For Cloud based services to support enterprise class production workloads, Mainframe like predictable performance is essential. However, the scale, complexity, and inherent resource sharing across workloads make the Cloud management for predictable performance difficult. As a first step towards designing Cloud based systems that achieve such performance and realize the service level objectives, we develop a scalable stochastic analytic model for performance quantification of Infrastructure-as-a-Service (IaaS) Cloud. Specifically, we model a class of IaaS Clouds that offer tiered services by configuring physical machines into three pools with different provisioning delay and power consumption characteristics. Performance behaviors in such IaaS Clouds are affected by a large set of parameters, e.g., workload, system characteristics and management policies. Thus, traditional analytic models for such systems tend to be intractable. To overcome this difficulty, we propose a multi-level interacting stochastic sub-models approach where the overall model solution is obtained iteratively over individual sub-model solutions. By comparing with a single-level monolithic model, we show that our approach is scalable, tractable, and yet retains high fidelity. Since the dependencies among the sub-models are resolved via fixed-point iteration, we prove the existence of a solution. Results from our analysis show the impact of workload and system characteristics on two performance measures: mean response delay and job rejection probability. {\textcopyright} 2012 Elsevier B.V. All rights reserved.

},
  keywords  = {Analytic modeling, Analytical models, Clouds, CTMC, Fixed-point iterations, IaaS, Infrastructure as a service (IaaS), Performance, Provisioning, Stochastic models, Stochastic systems, Submodels},
  issn      = {0167-739X},
  doi       = {10.1016/j.future.2012.06.005},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84887062784\&partnerID=40\&md5=5aa7bb3aa9d27ba52b585a03501c8e18},
  author    = {Ghosh, Rahul and Longo, Francesco and Naik, Vijay K. and Trivedi, Kishor S.}
}

% Book chapter with its own title, so the book title goes in booktitle.
@incollection{Longo2012134,
  title        = {Availability analysis of {IaaS} cloud using analytic models},
  booktitle    = {Achieving Federated and Self-Manageable Cloud Infrastructures: Theory and Practice},
  year         = {2012},
  note         = {cited By 0},
  pages        = {134--157},
  publisher    = {IGI Global},
  organization = {IGI Global},
  abstract     = {

Cloud based systems are inherently large scale. Failures in such a large distributed environment are quite common phenomena. To reduce the overall Cloud downtime and to provide a seamless service, providers need to assess the availability characteristics of their data centers. Such assessments can be done through controlled experimentations, large scale simulations and via analytic models. In the scale of Cloud, conducting repetitive experimentations or simulations might be costly and time consuming. Analytic models, on the other hand, can be used as a complement to small scale measurements and simulations since the analytic results can be obtained quickly. However, accurate analytic modeling requires dealing with large number of system states, leading to state-space explosion problem. To reduce the complexity of analysis, novel analytic methods are required. This chapter introduces the reader to a novel approach using interacting analytic sub-models and shows how such approach can deal with large scale Cloud availability analysis. The chapter puts the work in perspective of other existing and ongoing research in this area, describe how such approach can be useful to Cloud providers, especially in the case of federated scenarios, and summarize the open research questions that are yet to be solved. {\textcopyright} 2012, IGI Global.

},
  isbn      = {9781466616318},
  doi       = {10.4018/978-1-4666-1631-8.ch008},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84898578006\&partnerID=40\&md5=db4db018710246f25dad53c6d32d600d},
  author    = {Longo, Francesco and Ghosh, Rahul and Naik, Vijay K. and Trivedi, Kishor S.}
}

% Journal article: Computers and Mathematics with Applications, vol. 64, no. 12.
@article{Distefano20123701,
  title     = {Investigating dynamic reliability and availability through state-space models},
  journal   = {Computers and Mathematics with Applications},
  volume    = {64},
  number    = {12},
  year      = {2012},
  note      = {cited By 7},
  pages     = {3701--3716},
  publisher = {Elsevier Ltd},
  abstract  = {

Quality standards impose increasingly stringent requirements and constraints on quality of service attributes and measures. As a consequence, aspects, phenomena, and behaviors, hitherto approximated or neglected, have to be taken into account in quantitative assessment in order to provide adequate measures satisfying smaller and smaller confidence intervals and tolerances. With specific regards to reliability and availability, this means that interferences and dependencies involving the components of a system can no longer be neglected. Therefore, in order to support such a trend, specific techniques and tools are required to adequately deal with dynamic aspects in reliability and availability assessment. The main goal of this paper is to demonstrate how state-space based techniques can satisfy such a demand. For this purpose some examples of specific dynamic reliability behaviors, such as common cause failure and load sharing, are considered applying state-space based techniques to study the corresponding reliability models. Different repair policies in availability contexts are also explored. Both Markovian and non-Markovian models are studied via phase type expansion and renewal theory in order to adequately represent and evaluate the considered dynamic reliability aspects in case of generally distributed lifetimes and times to repair. {\textcopyright} 2012 Elsevier Ltd. All rights reserved.

},
  keywords  = {Availability, Common cause failure, Confidence interval, Dynamic aspects, Dynamic reliability, Load sharing, Markov processes, Markov regenerative process, Markovian, Non-Markovian, Quality of service, Quality standard, Quantitative assessments, Reliability, Reliability and availability, Reliability model, Renewal theory, Repair policy, Semi Markov model, State-space, State-space models, Stringent requirement},
  issn      = {0898-1221},
  doi       = {10.1016/j.camwa.2012.02.038},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-84870249108\&partnerID=40\&md5=9ecb143c24c7493a16292faa4f50d175},
  author    = {Distefano, Salvatore and Longo, Francesco and Trivedi, Kishor S.}
}

% Conference paper: DSN 2011.
@inproceedings{Longo2011335,
  title     = {A scalable availability model for {Infrastructure-as-a-Service} cloud},
  booktitle = {Proceedings of the 41st IEEE/IFIP International Conference on Dependable Systems and Networks (DSN)},
  year      = {2011},
  note      = {cited By 33; Conference of 2011 IEEE/IFIP 41st International Conference on Dependable Systems and Networks, DSN 2011 ; Conference Date: 27 June 2011 Through 30 June 2011; Conference Code:86090},
  pages     = {335--346},
  publisher = {IEEE Computer Society},
  address   = {Hong Kong, Hong Kong, 27-30 June 2011},
  abstract  = {

High availability is one of the key characteristics of Infrastructure-as-a-Service (IaaS) cloud. In this paper, we show a scalable method for availability analysis of large scale IaaS cloud using analytic models. To reduce the complexity of analysis and the solution time, we use an interacting Markov chain based approach. The construction and the solution of the Markov chains is facilitated by the use of a high-level Petri net based paradigm known as stochastic reward net (SRN). Overall solution is composed by iteration over individual SRN sub-model solutions. Dependencies among the sub-models are resolved using fixed-point iteration, for which existence of a solution is proved. We compare the solution obtained from the interacting sub-models with a monolithic model and show that errors introduced by decomposition are insignificant. Additionally, we provide closed form solutions of the sub-models and show that our approach can handle very large size IaaS clouds. {\textcopyright} 2011 IEEE.

},
  keywords  = {Analytic models, availability analysis, Closed form solutions, Fixed-point iterations, High availability, Key characteristics, Large sizes, Markov Chain, Markov model, Markov processes, Petri nets, Scalability, Scalable methods, Solution time, Stochastic reward nets, Submodels},
  isbn      = {9781424492336},
  doi       = {10.1109/DSN.2011.5958247},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-80051928903\&partnerID=40\&md5=37f3360476d39837acf5098ca20408c7},
  author    = {Longo, Francesco and Ghosh, Rahul and Naik, Vijay K. and Trivedi, Kishor S.}
}

% Conference paper: SRDS 2010.
@inproceedings{Ghosh2010343,
  title     = {Quantifying resiliency of {IaaS} cloud},
  booktitle = {Proceedings of the 29th IEEE Symposium on Reliable Distributed Systems (SRDS)},
  year      = {2010},
  note      = {cited By 16; Conference of 29th IEEE Symposium on Reliable Distributed Systems, SRDS 2010 ; Conference Date: 31 October 2010 Through 3 November 2010; Conference Code:82914},
  pages     = {343--347},
  publisher = {IEEE Computer Society},
  address   = {New Delhi, India, 31 October - 3 November 2010},
  abstract  = {

Cloud based services may experience changes - internal, external, large, small - at any time. Predicting and quantifying the effects on the quality-of-service during and after a change are important in the resiliency assessment of a cloud based service. In this paper, we quantify the resiliency of infrastructure-as-a-service (IaaS) cloud when subject to changes in demand and available capacity. Using a stochastic reward net based model for provisioning and servicing requests in a IaaS cloud, we quantify the resiliency of IaaS cloud w.r.t. two key performance measures - job rejection rate and provisioning response delay. {\textcopyright} 2010 IEEE.

},
  keywords  = {Available capacity, Infrastructure as a services, Performance measure, Quality of service, Rejection rates, Response delays, Stochastic models, Stochastic reward nets},
  isbn      = {9780769542508},
  issn      = {1060-9857},
  doi       = {10.1109/SRDS.2010.49},
  url       = {http://www.scopus.com/inward/record.url?eid=2-s2.0-78650562592\&partnerID=40\&md5=6542308f50d349fc7456919e658e6840},
  author    = {Ghosh, Rahul and Longo, Francesco and Naik, Vijay K. and Trivedi, Kishor S.}
}