% Journal article on multicore/NUMA-aware optimization of the JOpera engine.
% NOTE: the citation key says 2012 but the recorded publication year is 2014;
% the key is kept unchanged so existing \cite commands continue to resolve.
@article{peternier2012CPE,
	author = {Peternier, Achille and Binder, Walter and Pautasso, Cesare and Bonetta, Daniele},
	title = {High Performance Execution of Service Compositions: a Multicore-aware Engine Design},
	journal = {Concurrency and Computation: Practice and Experience},
	volume = {26},
	pages = {71--97},
	month = jan,
	year = {2014},
	publisher = {Wiley},
	doi = {10.1002/cpe.2948},
	keywords = {JOpera, multicores, nonuniform memory access architecture, performance optimization, service composition and execution},
	abstract = {Although modern computer hardware offers an increasing number of processing elements organized in nonuniform memory access (NUMA) architectures, prevailing middleware engines for executing business processes, workflows, and Web service compositions have not been optimized for properly exploiting the abundant processing resources of such machines. Amongst others, factors limiting performance are inefficient thread scheduling by the operating system, which can result in suboptimal use of system memory and CPU caches, and sequential code sections that cannot take advantage of multiple available cores.
In this article, we study the performance of the JOpera process execution engine on recent multicore machines. We first evaluate its performance without any dedicated optimization for multicore hardware, showing that additional cores do not significantly improve performance, although the engine has a multithreaded design. Therefore, we apply optimizations on the basis of replication together with an improved, hardware-aware usage of the underlying resources such as NUMA nodes and CPU caches. Thanks to our optimizations, we achieve speedups from a factor of 2 up to a factor of 20 (depending on the target machine) when compared with a baseline execution {\textquoteleft}as is{\textquoteright}.}
}
% Conference paper (ICPADS 2012) on hardware-aware thread scheduling.
% The doi field holds the bare DOI; the resolver prefix is added by the style.
@inproceedings{peternier2012ICPADS,
	author = {Peternier, Achille and Ansaloni, Danilo and Bonetta, Daniele and Pautasso, Cesare and Binder, Walter},
	title = {Hardware-aware thread scheduling: the case of asymmetric multicore processors},
	booktitle = {18th International Conference on Parallel and Distributed Systems ({ICPADS})},
	pages = {400--407},
	address = {Singapore},
	month = dec,
	year = {2012},
	doi = {10.1109/ICPADS.2012.62},
	abstract = {Modern processor architectures are increasingly complex and heterogeneous, often requiring solutions tailored to the specific characteristics of each processor model. In this paper we address this problem by targeting the AMD Bulldozer processor as case study for specific hardware-oriented performance optimizations. The Bulldozer architecture features an asymmetric simultaneous multithreading implementation with shared floating point units (FPUs) and per-core arithmetic logic units (ALUs). Bulld Over, presented in this paper, improves thread scheduling by exploiting this hardware characteristic to increase performance of floating point-intensive workloads on Linux-based operating systems. Bulld Over is a user-space monitoring tool that automatically identifies FPU-intensive threads and schedules them in a more efficient way without requiring any patches or modifications at the kernel level. Our measurements using standard benchmark suites show that speedups of up to 10\% can be achieved by simply allowing Bulld Over to monitor applications, without any modification of the workload.}
}