% Encoding: UTF-8
@COMMENT{BibTeX export based on data in FAU CRIS: https://cris.fau.de/}
@COMMENT{For any questions please write to cris-support@fau.de}
@inproceedings{faucris.245967719,
  author         = {Keszöcze, Oliver and Brand, Marcel and Witterauf, Michael and Heidorn, Christian and Teich, Jürgen},
  booktitle      = {ACM/SIGAPP Symposium On Applied Computing},
  date           = {2021-03-22/2021-03-26},
  doi            = {10.1145/3412841.3442085},
  faupublication = {yes},
  peerreviewed   = {Yes},
  title          = {{Aarith}: {An} {Arbitrary} {Precision} {Number} {Library}},
  venue          = {virtual conference},
  year           = {2021}
}
@article{faucris.225557298,
  author         = {Mattauch, Sandra and Lohmann, Katja and Hannig, Frank and Lohmann, Daniel and Teich, Jürgen},
  doi            = {10.1145/3376901},
  faupublication = {yes},
  journal        = {Communications of the ACM},
  keywords       = {Gender gap, Computer science, Frauenanteil, MINT, Informatik},
  pages          = {39--45},
  peerreviewed   = {Yes},
  title          = {{A} {Bibliometric} {Approach} for {Detecting} the {Gender} {Gap} in {Computer} {Science}},
  volume         = {63},
  year           = {2020}
}
@inproceedings{faucris.122620784,
  abstract       = {We present a co-design approach to establish redundancy schemes such as Dual Modular Redundancy (DMR) and Triple Modular Redundancy (TMR) to a whole region of a processor array for a class of Coarse-Grained Reconfigurable Arrays (CGRAs). The approach is applied to applications with mixed-criticality properties and experiencing varying Soft Error Rates (SERs) due to environmental reasons, e. g., changing altitude. The core idea is to adapt the degree of fault protection for loop programs executing in parallel on a CGRA to the level of reliability required as well as SER profiles. This is realized through claiming neighbor regions of processing elements for the execution of replicated loop nests. First, at the source code level, a compiler transformation is proposed that realizes these replication schemes in two steps: (1) replicate given parallel loop program two or three times for DMR or TMR, respectively, and (2) add appropriate error handling functions (voting or comparison) in order to detect respectively correct any single errors. Then, using the opportunities of hardware/software co-design, we propose optimized implementations of the error handling functions in software as well as in hardware. Finally, experimental results are given for the analysis of reliability gains for each proposed scheme of array replication in dependence of different SER},
  author         = {Lari, Vahid and Tanase, Alexandru-Petru and Teich, Jürgen and Witterauf, Michael and Khosravi, Faramarz and Hannig, Frank and Meyer, Brett},
  booktitle      = {Proceedings of the 2015 NASA/ESA Conference on Adaptive Hardware and Systems},
  date           = {2015-06-15/2015-06-18},
  doi            = {10.1109/AHS.2015.7231157},
  faupublication = {yes},
  isbn           = {9781467375016},
  keywords       = {Hardware; Parallel processing; Redundancy; Registers; Schedules; Software; Tunneling magnetoresistance},
  pages          = {1--8},
  peerreviewed   = {unknown},
  publisher      = {Institute of Electrical and Electronics Engineers Inc.},
  title          = {{A} co-design approach for fault-tolerant loop execution on {Coarse}-{Grained} {Reconfigurable} {Arrays}},
  venue          = {Montreal},
  year           = {2015}
}
@inproceedings{faucris.246679180,
  abstract       = {Many applications vary a lot in execution time depending on their
workload. A prominent example is image processing applications, where
the execution time is dependent on the content or the size of the
processed input images. An interesting case is when these applications
have quality-of-service requirements such as soft deadlines, that they
should meet as good as possible. A further complicated case is when such
applications have one or even multiple further objectives to optimize
like, e.g., energy consumption.
Approaches that dynamically adapt the processing resources to
application needs under multiple optimization goals and constraints can
be characterized into the application-specific and feedback-based
techniques. Whereas application-specific approaches typically statically
use an offline stage to determine the best configuration for each known
workload, feedback-based approaches, using, e.g., control theory, adapt
the system without the need of knowing the effect of workload on these
goals.
In this paper, we evaluate a state-of-the-art approach of each of
the two categories and compare them for image processing applications in
terms of energy consumption and number of deadline misses on a given
many-core architecture. In addition, we propose a second feedback-based
approach that is based on finite state machines (FSMs). The obtained
results suggest that whereas the state-of-the-art application-specific
approach is able to meet a specified latency deadline whenever possible
while consuming the least amount of energy, it requires a perfect
characterization of the workload on a given many-core system. If such
knowledge is not available, the feedback-based approaches have their
strengths in achieving comparable energy savings, but missing deadlines
more often.},
  author         = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen},
  booktitle      = {Proceedings of the Workshop on Next Generation Real-Time Embedded Systems (NG-RES), OASICS Vol. 87},
  date           = {2021-01-20/2021-01-20},
  doi            = {10.4230/OASIcs.NG-RES.2021.1},
  faupublication = {yes},
  isbn           = {978-3-95977-178-8},
  keywords       = {Soft real-time; Energy optimization; Control-theory; Timing analysis; Dynamic voltage and frequency scaling; Finite state machines; Multi-core; Many-core;},
  month          = jan,
  pages          = {1:1--1:12},
  peerreviewed   = {Yes},
  title          = {{A} {Comparative} {Evaluation} of {Latency}-{Aware} {Energy} {Optimization} {Approaches} in {Many}-{Core} {Systems}},
  url            = {https://drops.dagstuhl.de/opus/volltexte/2021/13477},
  venue          = {Budapest},
  year           = {2021}
}
@inproceedings{faucris.122834624,
  author         = {Roloff, Sascha and Pöppl, Alexander and Schwarzer, Tobias and Wildermann, Stefan and Baader, Michael and Glaß, Michael and Hannig, Frank and Teich, Jürgen},
  booktitle      = {Proceedings of the 6th ACM SIGPLAN X10 Workshop (X10)},
  faupublication = {yes},
  pages          = {24--29},
  peerreviewed   = {unknown},
  title          = {{ActorX10}: {An} {Actor} {Library} for {X10}},
  venue          = {Santa Barbara, CA},
  year           = {2016}
}
@inproceedings{faucris.123471964,
  author         = {Witterauf, Michael and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen},
  booktitle      = {Proceedings of the 11th International Summer School on Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES)},
  date           = {2015-07-12/2015-07-18},
  faupublication = {yes},
  isbn           = {978-88-905806-3-5},
  pages          = {205--208},
  peerreviewed   = {Yes},
  publisher      = {HiPEAC},
  title          = {{Adaptive} {Fault} {Tolerance} in {Tightly} {Coupled} {Processor} {Arrays} with {Invasive} {Computing}},
  venue          = {Fiuggi},
  year           = {2015}
}
@inproceedings{faucris.122626284,
  abstract       = {Fault tolerance is a basic necessity to make today's complex systems reliable. Adequate fault tolerance, however, demands a high degree of redundancy, possibly wasting resources when the fault probability is low or when some applications do not require fault tolerance. Under the term adaptive fault tolerance, we investigate means to instead provide on-demand fault tolerance on multi-core systems dynamically and according to application and environmental needs. Such means are provided on a per-application basis by invasive computing, a recent paradigm for resource-aware programming and design of parallel systems: applications request resources in an invade phase, infect the acquired resources with code and data, and finally release them in a retreat phase. We show how to use these simple but powerful constructs to adaptively tolerate faults and that invasive computing harmonizes well with many existing fault tolerance approaches. Finally, a case study on adaptively providing fault tolerance for loops demonstrates how effective invasive computing is for adapting to a varying soft error rate and handling of faults.},
  author         = {Witterauf, Michael and Tanase, Alexandru-Petru and Teich, Jürgen and Lari, Vahid and Zwinkau, Andreas and Snelting, Gregor},
  booktitle      = {Proceedings of the 2015 NASA/ESA Conference on Adaptive Hardware and Systems},
  date           = {2015-06-15/2015-06-18},
  doi            = {10.1109/AHS.2015.7231155},
  faupublication = {yes},
  isbn           = {9781467375016},
  keywords       = {Adaptation models; Fault tolerant systems; Hardware; Redundancy; Runtime},
  pages          = {1--8},
  peerreviewed   = {unknown},
  publisher      = {Institute of Electrical and Electronics Engineers Inc.},
  title          = {{Adaptive} fault tolerance through invasive computing},
  venue          = {Montreal},
  year           = {2015}
}
@article{faucris.234343622,
  abstract       = {Inverse problems play a central role for many classical computer vision
and image processing tasks. Many inverse problems are ill-posed, and
hence require a prior to regularize the solution space. However, many of
the existing priors, like total variation, are based on ad-hoc
assumptions that have difficulties to represent the actual distribution
of natural images. Thus, a key challenge in research on image processing
is to find better suited priors to represent natural images. In this
article, we propose the Adaptive Quantile Sparse Image (AQuaSI) prior.
It is based on a quantile filter, can be used as a joint filter on
guidance data, and be readily plugged into a wide range of numerical
optimization algorithms. We demonstrate the efficacy of the proposed
prior in joint RGB/depth upsampling, on RGB/NIR image restoration, and
in a comparison with related regularization by denoising approaches.},
  author         = {Schirrmacher, Franziska and Riess, Christian and Köhler, Thomas},
  doi            = {10.1109/TCI.2019.2956888},
  faupublication = {yes},
  journal        = {IEEE Transactions on Computational Imaging},
  keywords       = {Inverse problems; universal image prior; weighted quantile filter},
  pages          = {503--517},
  peerreviewed   = {Yes},
  title          = {{Adaptive} {Quantile} {Sparse} {Image} ({AQuaSI}) {Prior} for {Inverse} {Imaging} {Problems}},
  url            = {https://faui1-files.cs.fau.de/public/publications/mmsec/2020-Schirrmacher-AQS.pdf},
  volume         = {6},
  year           = {2020}
}
@article{faucris.203550747,
  author         = {Weichslgartner, Andreas and Wildermann, Stefan and Gangadharan, Deepak and Glaß, Michael and Teich, Jürgen},
  doi            = {10.1145/3274665},
  faupublication = {yes},
  journal        = {ACM Transactions on Embedded Computing Systems},
  peerreviewed   = {Yes},
  title          = {{A} {Design}-{Time}/{Run}-{Time} {Application} {Mapping} {Methodology} for {Predictable} {Execution} {Time} in {MPSoCs}},
  year           = {2018}
}
@inproceedings{faucris.316726668,
  author         = {Walter, Dominik and Brand, Marcel and Heidorn, Christian and Witterauf, Michael and Hannig, Frank and Teich, Jürgen},
  booktitle      = {Proceedings of the IEEE International Symposium on Circuits and Systems (ISCAS)},
  date           = {2024-05-19/2024-05-22},
  faupublication = {yes},
  peerreviewed   = {Yes},
  title          = {{ALPACA}: {An} {Accelerator} {Chip} for {Nested} {Loop} {Programs}},
  venue          = {Singapore},
  year           = {2024}
}
@inproceedings{faucris.121755304,
  abstract       = {The trend of current and future domain-specific MPSoCs towards heterogeneous and tiled architectures as well as the increasing number of cores on a single chip impedes the design and the parallel programming of such computing systems. To tackle this problem a new computing paradigm called invasive computing has recently been proposed. Here, the workload and its distribution are not known at compile-time but are highly dynamic and can be adapted to the status (load, temperature, etc.) of the underlying architecture at run-time. The architectures envisaged for this resource-aware programming approach range from standard RISC core based platforms, tightly-coupled processor arrays for exploiting loop level parallelism to HPC systems. In order to explore such heterogeneous invasive architectures during early design phases, new means for modeling and simulating are required. Therefore, we present a novel and flexible simulation framework, which allows to model and simulate resource-aware applications on invasive architectures (including the associated system software) by integrating different architectural simulators in a modular way. © 2012 ECSI.},
  address        = {New York, NY, USA},
  author         = {Gerndt, Michael and Hannig, Frank and Herkersdorf, Andreas and Hollmann, Andreas and Meyer, Marcel and Roloff, Sascha and Weidendorfer, Josef and Wild, Thomas and Zaib, Aurang},
  booktitle      = {Proc. of the Forum on Specification \& Design Languages (FDL)},
  date           = {2012-09-18/2012-09-20},
  faupublication = {yes},
  isbn           = {978-2-9530504-5-5},
  keywords       = {Architecture simulation; Hardware/software modeling and simulation; Parallel computing},
  note           = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.aninte},
  pages          = {185--192},
  publisher      = {IEEE Press},
  title          = {{An} {Integrated} {Simulation} {Framework} for {Invasive} {Computing}},
  venue          = {Vienna},
  year           = {2012}
}
@inproceedings{faucris.239132293,
  abstract       = {In this paper, we present anytime instructions for floating-
point additions and multiplications. Specific to such instructions is
their ability to compute an arithmetic operation at a programmable
accuracy of a most significant bits where a is encoded in the instruction
itself. Contrary to reduced-precision architectures, the word length is
maintained throughout the execution. Two approaches are presented for
the efficient implementation of anytime additions and multiplications, one
based on on-line arithmetic and the other on bitmasking. We propose
implementations of anytime functional units for both approaches and
evaluate them in terms of error, latency, area, as well as energy savings.
As a result, 15% of energy can be saved on average while computing a
floating-point addition with an error of less than 0.1%. Moreover, large
latency and energy savings are reported for iterative algorithms such as
a Jacobi algorithm with savings of up to 39% in energy.
},
  author         = {Brand, Marcel and Witterauf, Michael and Bosio, Alberto and Teich, Jürgen},
  booktitle      = {Proceedings of the 31st IEEE International Conference on Application-specific Systems, Architectures and Processors},
  date           = {2020-07-06/2020-07-08},
  doi            = {10.1109/ASAP49362.2020.00034},
  faupublication = {yes},
  peerreviewed   = {unknown},
  title          = {{Anytime} {Floating}-{Point} {Addition} and {Multiplication} – {Concepts} and {Implementations}},
  venue          = {Manchester, U.K.},
  year           = {2020}
}
@inproceedings{faucris.213198267,
  author         = {Brand, Marcel and Witterauf, Michael and Hannig, Frank and Teich, Jürgen},
  booktitle      = {ACM International Conference on Computing Frontiers 2019},
  date           = {2019-04-30/2019-05-02},
  doi            = {10.1145/3310273.3322833},
  editor         = {{ACM}},
  faupublication = {yes},
  isbn           = {978-1-4503-6685-4},
  pages          = {215--219},
  peerreviewed   = {Yes},
  title          = {{Anytime} {Instructions} for {Programmable} {Accuracy} {Floating}-{Point} {Arithmetic}},
  venue          = {Alghero, Sardinia},
  year           = {2019}
}
@inproceedings{faucris.109501304,
  abstract       = {The growing demand of computationally intensive algorithms/applications has resulted in the widespread acceptance of heterogeneous MPSoC platforms. The primary reason for this trend is due to the better performance and power efficiency exhibited by heterogeneous architectures consisting of standard processor cores and hardware accelerators. However, multiple processors accessing shared resources such as cache/memory and buses may lead to significant contention on them, thereby decreasing not only the performance, but also timing predictability. Moreover, the effect of shared resource contention worsens in the presence of multiple application scenarios with different execution and communication bandwidth requirements. To mitigate this problem, we first propose a Dynamic Bus Reconfiguration Policy (DBRP) that decides when to reconfigure a shared bus between Non-Preemptive Fixed Priority (NP-FP) and Time-Division Multiple Access (TDMA) scheduling. The required TDMA slot sizes are computed on-the-fly before NP-FP to TDMA reconfiguration such that deadlines of all Hard Real-Time (HRT) applications are satisfied and all Soft Real-Time (SRT) applications are serviced evenly. Our proposed DBRP has been implemented on a real MPSoC platform consisting of cores connected by the AMBA AHB. The case studies demonstrate that reconfiguration of bus arbitration ensures that communication deadline constraints of HRT applications are maximally satisfied with low hardware and reconfiguration overhead.},
  author         = {Gangadharan, Deepak and Sousa, Éricles and Lari, Vahid and Hannig, Frank and Teich, Jürgen},
  booktitle      = {Proceedings of Asilomar Conference on Signals, Systems, and Computers (ASILOMAR)},
  date           = {2014-11-02/2014-11-05},
  doi            = {10.1109/ACSSC.2014.7094471},
  faupublication = {yes},
  isbn           = {9781479982974},
  pages          = {398--403},
  peerreviewed   = {unknown},
  publisher      = {IEEE Computer Society},
  title          = {{Application}-driven reconfiguration of shared resources for timing predictability of {MPSoC} platforms},
  venue          = {Pacific Grove, CA},
  year           = {2015}
}
@inproceedings{faucris.118569704,
  abstract       = {The design and the programming of heterogeneous future MPSoCs including thousands of processor cores is a hard challenge. Means are necessary to program and simulate the dynamic behavior of such systems in order to dimension the hardware design and to verify the software functionality as well as performance goals. Cycle-accurate simulation of multiple parallel applications simultaneously running on different cores of the architecture would be much too slow and is not the desired level of detail. In this paper, we therefore present a novel high-level simulation approach which tackles the complexity and the heterogeneity of such systems and enables the investigation of a new computing paradigm called invasive computing. Here, the workload and its distribution are not known at compile-time but are highly dynamic and have to be adapted to the status (load, temperature, etc.) of the underlying architecture at run-time. We propose an approach for the modeling of tiled MPSoC architectures and the simulation of resource-aware programming concepts on these. This approach delivers important timing information about the parallel execution and also is taking into account the computational properties of possibly different types of cores. © 2012 IEEE.},
  address        = {New York, NY, USA},
  author         = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen},
  booktitle      = {Proc. of the 17th Asia and South Pacific Design Automation Conference (ASP-DAC)},
  date           = {2012-01-30/2012-02-02},
  doi            = {10.1109/ASPDAC.2012.6164943},
  faupublication = {yes},
  isbn           = {978-1-4673-0770-3},
  note           = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.approx},
  pages          = {187--192},
  publisher      = {IEEE Press},
  title          = {{Approximate} {Time} {Functional} {Simulation} of {Resource}-{Aware} {Programming} {Concepts} for {Heterogeneous} {MPSoCs}},
  venue          = {Sydney},
  year           = {2012}
}
@inproceedings{faucris.120088364,
  abstract       = {To cope with the strict reliability requirements of safety-critical
ADAS applications, the upcoming TSN standard introduces mechanisms
that enable transmission redundancy at any switch or end
node. However, it is up to the designer to decide at which points
and for which messages to activate transmission redundancy. This
significantly increases the design space and requires to trade-off
reliability with other routing-related design objectives like network
load, transmission timing, or the monetary cost of the hardware. As
a remedy, this paper a) presents two different exact approaches to
generate feasible redundant message routings and b) proposes an
extension of the state-of-the-art approach for the multi-objective
routing optimization, enabling the optimizer to directly adjust system
features that are relevant for the design objectives. A case study
with an application from the automotive domain compares the optimization
capabilities of the presented approaches for the routing
generation and demonstrates the significant gain in optimization
power that is achieved with the proposed optimization extension.},
  author         = {Smirnov, Fedor and Reimann, Felix and Teich, Jürgen and Han, Zhao and Glaß, Michael},
  booktitle      = {Proceedings of 21st International Workshop on Software and Compilers for Embedded Systems (SCOPES 2018)},
  date           = {2018-05-28/2018-05-30},
  doi            = {10.1145/3207719.3207725},
  editor         = {{ACM}},
  faupublication = {yes},
  keywords       = {Design Space Exploration; Network Optimization; Automotive Ethernet},
  peerreviewed   = {unknown},
  title          = {{Automatic} {Optimization} of {Redundant} {Message} {Routings} in {Automotive} {Networks}},
  venue          = {Sankt Goar},
  year           = {2018}
}
@article{faucris.309853755,
  abstract       = {
Embedded system applications often require guarantees regarding non-functional properties when executed on a given MPSoC platform. Examples of such requirements include real-time, energy or safety properties on corresponding programs. One option to implement the enforcement of such requirements is by a reactive control loop, where an enforcer decides based on a system response (feedback) how to control the system, e.g., by adapting the number of cores allocated to a program or by scaling the voltage/frequency mode of involved processors. Typically, a violation of a requirement must either never happen in case of strict enforcement, or only happen temporally (in case of so-called loose enforcement). However, it is a challenge to design enforcers for which it is possible to give formal guarantees with respect to requirements, especially in the presence of typically largely varying environmental input (workload) per execution. Technically, an enforcement strategy can be formally modeled by a finite state machine (FSM) and the uncertain environment determining the workload by a discrete-time Markov chain. It has been shown in previous work that this formalization allows the formal verification of temporal properties (verification goals) regarding the fulfillment of requirements for a given enforcement strategy. In this paper, we consider the so far unsolved problem of design space exploration and automatic synthesis of enforcement automata that maximize a number of deterministic and probabilistic verification goals formulated on a given set of non-functional requirements. For the design space exploration (DSE), an approach based on multi-objective evolutionary algorithms is proposed in which enforcement automata are encoded as genes of states and state transition conditions. For each individual, the verification goals are evaluated using probabilistic model checking. 
At the end, the DSE returns a set of efficient FSMs in terms of probabilities of meeting given requirements. As experimental results, we present three use cases while considering requirements on latency and energy consumption.
},
  author         = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen},
  doi            = {10.1145/3617832},
  faupublication = {yes},
  journal        = {ACM Transactions on Design Automation of Electronic Systems},
  keywords       = {Finite State Machine; Genetic Algorithm; Probabilistic Model Checking; Design Space Exploration; Verification; Runtime Requirement Enforcement; Optimization; Enforcement FSM; MPSoC; Steady State; Evolutionary Algorithm; PCTL; Markov Chain;},
  pages          = {1--20},
  peerreviewed   = {Yes},
  title          = {{Automatic} {Synthesis} of {FSMs} for {Enforcing} {Non}-{Functional} {Requirements} on {MPSoCs} {Using} {Multi}-{Objective} {Evolutionary} {Algorithms}},
  volume         = {28},
  year           = {2023}
}
@incollection{faucris.281549226,
  author         = {Teich, Jürgen and Esper, Khalil and Falk, Joachim and Pourmohseni, Behnaz and Schwarzer, Tobias and Wildermann, Stefan},
  booktitle      = {Invasive Computing},
  doi            = {10.25593/978-3-96147-571-1},
  editor         = {Teich, Jürgen and Henkel, Jörg and Herkersdorf, Andreas},
  faupublication = {yes},
  isbn           = {978-3-96147-571-1},
  pages          = {69--95},
  peerreviewed   = {No},
  publisher      = {FAU University Press},
  title          = {{Basics} of {Invasive} {Computing}},
  year           = {2022}
}
@incollection{faucris.281549663,
  author         = {Bader, Michael and Wildermann, Stefan and Glaß, Michael and Pöppl, Alexander and Pourmohseni, Behnaz and Schwarzer, Tobias and Spieck, Jan and Wille, Mario},
  booktitle      = {Invasive Computing},
  doi            = {10.25593/978-3-96147-571-1},
  editor         = {Teich, Jürgen and Henkel, Jörg and Herkersdorf, Andreas},
  faupublication = {yes},
  isbn           = {978-3-96147-571-1},
  pages          = {97--122},
  peerreviewed   = {No},
  publisher      = {FAU University Press},
  title          = {{Characterisation} and {Analysis} of {Invasive} {Algorithmic} {Patterns}},
  year           = {2022}
}
@incollection{faucris.122915144,
  author         = {Teich, Jürgen and Boppu, Srinivas and Hannig, Frank and Lari, Vahid},
  booktitle      = {Transforming Reconfigurable Systems: A Festschrift Celebrating the 60th Birthday of Professor Peter Cheung},
  doi            = {10.1142/9781783266975_0010},
  editor         = {Luk, Wayne and Constantinides, George A.},
  faupublication = {yes},
  isbn           = {978-1-78326-696-8},
  pages          = {167--206},
  peerreviewed   = {unknown},
  title          = {{Compact} {Code} {Generation} and {Throughput} {Optimization} for {Coarse}-{Grained} {Reconfigurable} {Arrays}},
  year           = {2015}
}
@inproceedings{faucris.251996265,
  abstract       = {
With the abundance of computing devices in our everyday life such as IoT
devices, improving their security has become a number one priority. While
the major focus lies on software security, hardware vulnerabilities are
often not considered. Here, particularly side-channel attacks pose a
realistic threat to such systems. However, conducting Side-Channel Analysis
(SCA) to evaluate those threats currently requires deep expert knowledge, a
lab environment, and numerous manual steps. Therefore, it is often ignored
in security considerations.
In this paper, we analyze the challenges when conducting SCA on
consumer-grade devices using template-matching based triggering techniques.
By introducing a three-staged framework called CORSICA, we elaborate the
obstacles and deficiencies of current state-of-the-art techniques and
provide potential solutions for them. Moreover, we validate our claims by
introducing a method for semi-automatic extraction of a waveform template
of an AES 128 encryption that can be used in combination with a
template-matching triggering system. This extraction is based on generic
meta information and is demonstrated on a consumer-grade ARM processor board.
},
  author         = {Schlumberger, Jens and Wildermann, Stefan and Teich, Jürgen},
  booktitle      = {11th IFIP International Conference on New Technologies, Mobility and Security (NTMS)},
  date           = {2021-04-19/2021-04-21},
  doi            = {10.1109/NTMS49979.2021.9432644},
  edition        = {2},
  editor         = {{IEEE}},
  faupublication = {yes},
  keywords       = {Side-Channel Analysis; Off-The-Shelves Devices; template-matching trigger},
  pages          = {1--5},
  peerreviewed   = {Yes},
  title          = {{CORSICA}: {A} {Framework} for {Conducting} {Real}-{World} {Side}-{Channel} {Analysis}},
  venue          = {Paris, France},
  year           = {2021}
}
@inproceedings{faucris.121605044,
  abstract       = {Future many-core systems are envisaged to support the concurrent execution of varying mixes of different applications. Because of the vast number of binding options for such mixes on heterogeneous resources, enabling predictable application execution is far from trivial. Hybrid application mapping is an efficient way of achieving run-time predictability by combining design-time analysis of application mappings with run-time management. Existing hybrid mapping strategies focus on computation resources and either ignore communication details or make significantly simplifying assumptions like unlimited bandwidth or exclusive usage. But, actual many-core systems consist of constrained and shared computation and communication resources where the run-time decision of whether a feasible application binding on a set of preoccupied resources exists or not is an NP-complete problem. As a remedy, we present a novel hybrid application mapping approach that considers constrained shared communication and computation resources. Here, (a) a design space exploration coupled with a formal performance analysis delivers several resource reservation configurations with verified real-time guarantees for each individual application. The configurations are then transformed to (b) a novel efficient intermediate representation that is passed to the run-time management where we (c) formulate run-time resource reservation and application binding as a constraint satisfaction problem and present an adequate solving mechanism. Our experimental evaluation shows that existing approaches may produce infeasible outcomes and are thus not applicable for predictable application execution, while the proposed approach enables predictable and efficient run-time management of dynamic application mixes. Copyright is held by the owner/author(s).},
  author         = {Weichslgartner, Andreas and Gangadharan, Deepak and Wildermann, Stefan and Glaß, Michael and Teich, Jürgen},
  booktitle      = {Proceedings of the International Conference on Hardware/Software Codesign and System Synthesis (CODES+ISSS 2014)},
  date           = {2014-10-12/2014-10-17},
  doi            = {10.1145/2656075.2656083},
  faupublication = {yes},
  isbn           = {9781450330510},
  keywords       = {DSE; Hybrid mapping; Many-core; Networks-on-chip; Predictability},
  pages          = {10},
  peerreviewed   = {unknown},
  publisher      = {Association for Computing Machinery, Inc},
  title          = {{DAARM}: {Design}-time application analysis and run-time mapping for predictable execution in many-core systems},
  venue          = {New Delhi},
  year           = {2014}
}
@inproceedings{faucris.254758364,
  author         = {Villar-Corrales, Angel and Schirrmacher, Franziska and Riess, Christian},
  booktitle      = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  doi            = {10.1109/ICASSP39728.2021.9414733},
  faupublication = {yes},
  pages          = {1635--1639},
  peerreviewed   = {Yes},
  title          = {{Deep} {Learning} {Architectural} {Designs} for {Super}-{Resolution} of {Noisy} {Images}},
  year           = {2021}
}
@inproceedings{faucris.117240684,
  author         = {Weichslgartner, Andreas and Wildermann, Stefan and Götzfried, Johannes and Freiling, Felix and Glaß, Michael and Teich, Jürgen},
  booktitle      = {Proceedings of the 19th International Workshop on Software and Compilers for Embedded Systems (SCOPES)},
  date           = {2016-05-23/2016-05-25},
  doi            = {10.1145/2906363.2906370},
  faupublication = {yes},
  pages          = {153--162},
  peerreviewed   = {Yes},
  title          = {{Design}-{Time}/{Run}-{Time} {Mapping} of {Security}-{Critical} {Applications} in {Heterogeneous} {MPSoCs}},
  venue          = {St. Goar},
  year           = {2016}
}
@inproceedings{faucris.122148004,
abstract = {For the next generations of Processor-Arrays-on-Chip (e. g., coarse-grained reconfigurable or programmable arrays) — including more than 100s to 1000s of processing elements — it is very important to keep the on-chip configuration/instruction memories as small as possible. Hence, compilers must take into account the scarceness of available instruction memory and create the code as compact as possible [1]. However, Very Long Instruction Word (VLIW) processors have the well-known problem that compilers typically produce lengthy codes. A lot of unnecessary code is produced due to unused Functional Units (FUs) or repeating operations for single FUs in instruction sequences. Techniques like software pipelining can be used to improve the utilization of the FUs, yet with the risk of code explosion [2] due to the overlapped scheduling of multiple loop iterations or other control flow statements. This is where our proposed Orthogonal Instruction Processing (OIP) architecture (see Fig. 1) shows benefits in reducing the code size of compute-intensive loop programs. The idea is, contrary to lightweight VLIW processors used in arrays like Tightly Coupled Processor Arrays (TCPAs) [4], to equip each FU with its own instruction memory, branch unit, and program counter, but still let the FUs share the register files as well as input and output signals. This enables a processor to orthogonally execute a loop program. Each FU can execute its own sub-program while exchanging data over the register files. The branch unit and its instruction format have to be slightly changed by introducing a counter to each instruction that determines how often the instruction is repeated until the specified branch is executed. This enables repeating instructions without repeating them in the code. Those kinds of processors have to be carefully programmed, e. g., to not run into data dependency problems while optimizing throughput.
For solving this resource-constrained modulo scheduling problem, we use techniques based on mixed integer linear programming [5], [3].
Obviously, the modifications of the processor produce architectural overhead in the form of additional branch units and an increase of instruction memory compared to the lightweight VLIW processors. Thus, we created an analytical model of both the lightweight VLIW processor and our proposed architecture to analyze the overhead. The model gives an upper bound of the hardware costs and the memory consumption according to [7]. We examined the HW costs of a lightweight VLIW processor with different instruction memory lengths mVLIW and compared them to our OIP processor with varying instruction ratios IR and thus instruction memory lengths mOIP of each FU’s instruction memory. In the examination, we covered processors containing ten FUs and averaged the HW costs over the instruction ratio. Figure 2 shows that the overhead is negligible as soon as we are able to reduce program sizes to 50 % (i. e., IR = 2), which is usually achieved by our compiler.},
author = {Brand, Marcel and Hannig, Frank and Tanase, Alexandru-Petru and Teich, Jürgen},
booktitle = {2017 IEEE 28th International Conference on Application-specific Systems, Architectures and Processors (ASAP)},
date = {2017-07-10/2017-07-12},
doi = {10.1109/ASAP.2017.7995282},
editor = {IEEE},
faupublication = {yes},
isbn = {978-1-5090-4825-0},
pages = {207},
peerreviewed = {unknown},
title = {{Efficiency} in {ILP} {Processing} by {Using} {Orthogonality}},
venue = {Seattle},
year = {2017}
}
@article{faucris.224634960,
author = {Heidorn, Christian and Witterauf, Michael and Hannig, Frank and Teich, Jürgen},
doi = {10.17706/jcp.14.8.541-556},
faupublication = {yes},
journal = {Journal of Computers},
pages = {541--556},
peerreviewed = {Yes},
title = {{Efficient} {Mapping} of {CNNs} onto {Tightly} {Coupled} {Processor} {Arrays}},
volume = {14},
year = {2019}
}
@incollection{faucris.248819626,
author = {Smirnov, Fedor and Pourmohseni, Behnaz and Glaß, Michael and Teich, Jürgen},
booktitle = {Smart Cities, Green Technologies and Intelligent Transport Systems},
doi = {10.1007/978-3-030-68028-2_9},
faupublication = {yes},
isbn = {978-3-030-68028-2},
pages = {173--199},
peerreviewed = {unknown},
publisher = {Springer},
title = {{Efficient} {Symbolic} {Routing} {Encoding} for {In}-vehicle {Network} {Optimization}},
url = {https://link.springer.com/chapter/10.1007/978-3-030-68028-2_9},
year = {2021}
}
@article{faucris.108398444,
author = {Zaib, Aurang and Heisswolf, Jan and Weichslgartner, Andreas and Wild, Thomas and Teich, Jürgen and Becker, Jürgen and Herkersdorf, Andreas},
doi = {10.1016/j.sysarc.2017.03.004},
faupublication = {yes},
journal = {Journal of Systems Architecture},
pages = {72--82},
peerreviewed = {Yes},
title = {{Efficient} {Task} {Spawning} for {Shared} {Memory} and {Message} {Passing} in {Many}-core {Architectures}},
volume = {77},
year = {2017}
}
@inproceedings{faucris.263351898,
abstract = {
Many embedded system applications impose hard real-time, energy or safety requirements on corresponding programs typically concurrently executed on a given MPSoC target platform. Even when mutually isolating applications in space or time, the enforcement of such properties, e.g., by adjusting the number of processors allocated to a program or by scaling the voltage/frequency mode of involved processors, is a difficult problem to solve, particularly in view of typically largely varying environmental input (workload) per execution. In this paper, we formalize the related control problem using finite state machine models for the uncertain environment determining the workload, the system response (feedback), as well as the enforcer strategy. The contributions of this paper are as follows: a) Rather than trace-based simulation, the uncertain environment is modeled by a discrete-time Markov chain (DTMC) as a random process to characterize possible input sequences an application may experience. b) A number of important verification goals to analyze different enforcer FSMs are formulated in PCTL for the resulting stochastic verification problem, i.e., the likelihood of violating a timing or energy constraint, or the expected number of steps for a system to return to a given execution time corridor. c) Applying stochastic model checking, i.e., PRISM to analyze and compare enforcer FSMs in these properties, and finally d) proposing an approach for reducing the environment DTMC by partitioning equivalent environmental states (i.e., input states leading to an equal system response in each MPSoC mode) such that verification times can be reduced by orders of magnitude to just a few ms for real-world examples.
},
address = {New York, NY, USA},
author = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen},
booktitle = {Proceedings of the 19th ACM-IEEE International Conference on Formal Methods and Models for System Design},
date = {2021-11-20/2021-11-22},
doi = {10.1145/3487212.3487348},
faupublication = {yes},
isbn = {9781450391276},
keywords = {probabilistic model checking, MPSoC, PCTL, Markov chain, verification, finite state machine, runtime requirement enforcement},
pages = {21--31},
peerreviewed = {Yes},
publisher = {Association for Computing Machinery},
series = {MEMOCODE '21},
title = {{Enforcement} {FSMs} - {Specification} and {Verification} of {Non}-{Functional} {Properties} of {Program} {Executions} on {MPSoCs}},
venue = {Beijing},
year = {2021}
}
@inproceedings{faucris.234583133,
abstract = {Future Advanced Driver Assistance Systems (ADAS) require the continuous computation of detailed maps of the vehicle’s environment. Due to the high demand of accuracy and the enormous amount of data to be fused and processed, common architectures used today, like single-core processors in automotive Electronic Control Units (ECUs), do not provide enough computing power. Here, emerging embedded multi-core architectures are appealing such as embedded Graphics Processing Units (GPUs). In this paper, we (a) identify and analyze common subalgorithms of ADAS algorithms for computing environment maps, such as interval maps, for suitability to be parallelized and run on embedded GPUs. From this analysis, (b) performance models are derived on achievable speedups with respect to sequential single-core CPU implementations. (c) As a third contribution of this paper, these performance models are validated by presenting and comparing a novel parallelized interval map GPU implementation against a parallel occupancy grid map implementation. For both types of environment maps, implementations on an Nvidia Tegra K1 prototype are compared to verify the correctness of the introduced performance models. Finally, the achievable speedups with respect to a single-core CPU solution are reported. These range from 3x to 275x for interval and grid map computations.},
author = {Fickenscher, Jörg and Reiche, Oliver and Schlumberger, Jens and Hannig, Frank and Teich, Jürgen},
booktitle = {Proceedings of the 18th IEEE International High-Level Design Validation and Test Workshop (HLDVT)},
date = {2016-10-07/2016-10-08},
doi = {10.1109/HLDVT.2016.7748257},
faupublication = {yes},
internal-note = {Abstract trimmed during review: the CRIS export had concatenated fragments of several unrelated abstracts; only the paragraph matching this title was kept.},
pages = {70--77},
peerreviewed = {Yes},
title = {{Modeling}, {Programming} and {Performance} {Analysis} of {Automotive} {Environment} {Map} {Representations} on {Embedded} {GPUs}},
venue = {Santa Cruz, CA},
year = {2016}
}
@misc{faucris.266447917,
abstract = {Writing well-maintainable parallel programs that efficiently utilize many processor cores is still a significant challenge. Threads are hard to use, and so are event-based schemes. Furthermore, threads are affected by the blocking anomaly, that is, the loss of parallelism when threads execute a blocking system call—often resulting in low core utilization and unnecessarily high response times. This paper introduces pseudo-blocking system calls built upon modern asynchronous queue-based system-call techniques (like Linux’s io{\_}uring) circumventing the blocking anomaly. They are similar to Go’s programming model, where one develops against a blocking interface to keep the code structure clean. However, instead of using synchronous non-blocking system calls as the underlying technique, our approach internally uses an asynchronous queue-based interface. We further present a novel architecture for concurrency platforms, like Cilk and Go, enabling low latencies and high throughput via pseudo-blocking system calls. Finally, we discuss future OS enhancements that would improve our proposed architecture. We implemented and evaluated a concurrency platform based on the concept of pseudo-blocking system calls. Our platform can outperform state-of-the-art systems like Go by 1.17× in a file-content search benchmark. It is able to increase the throughput of an echo-server benchmark by 4 % when compared to Go, and by 17.8 % when compared to Rust’s Tokio while improving the tail latency},
author = {Schmaus, Florian and Fischer, Florian and Hönig, Timo and Schröder-Preikschat, Wolfgang},
doi = {10.25593/issn.2191-5008/CS-2021-02},
faupublication = {yes},
keywords = {concurrency platform; concurrent programming; operating systems; parallel programming},
peerreviewed = {automatic},
title = {{Modern} {Concurrency} {Platforms} {Require} {Modern} {System}-{Call} {Techniques}},
url = {https://opus4.kobv.de/opus4-fau/files/17655/paper.pdf},
year = {2021}
}
@inproceedings{faucris.122257784,
author = {Witterauf, Michael and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen},
booktitle = {Proceedings of the 27th IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP)},
date = {2016-07-06/2016-07-08},
faupublication = {yes},
peerreviewed = {unknown},
title = {{Modulo} {Scheduling} of {Symbolically} {Tiled} {Loops} for {Tightly} {Coupled} {Processor} {Arrays}},
venue = {London},
year = {2016}
}
@inproceedings{faucris.266541602,
abstract = {Embedded system applications usually have to meet hard real-time, energy or safety requirements on programs typically concurrently executed on a given MPSoC target platform. Enforcing such properties, e.g., by adapting the number of processors allocated to a program or by scaling the voltage/frequency mode of involved processors, is a difficult problem to solve, especially with a typically large varying environmental input (workload) per execution. In a previous work [1], we formalized the related enforcement problem using (a) finite state machines to model enforcement strategies, (b) discrete-time Markov chains to model the uncertain environment determining the system’s workload, and (c) the system response that defines the feedback for the reactive enforcer. In this paper, we apply that approach to specify and verify multi-requirement enforcement strategies and assess a case study for enforcing two independent requirements at the same time, i.e., latency and energy consumption. We evaluate and compare different enforcement strategies using probabilistic verification for the use case of an object detection application},
address = {Dagstuhl, Germany},
author = {Esper, Khalil and Wildermann, Stefan and Teich, Jürgen},
booktitle = {Third Workshop on Next Generation Real-Time Embedded Systems (NG-RES 2022)},
date = {2022-06-22/2022-06-22},
doi = {10.4230/OASIcs.NG-RES.2022.2},
faupublication = {yes},
isbn = {978-3-95977-221-1},
keywords = {Runtime Requirement Enforcement, Verification, Finite State Machine, Markov Chain, Probabilistic Model Checking, PCTL, MPSoC},
pages = {2:1--2:13},
peerreviewed = {Yes},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
title = {{Multi}-requirement {Enforcement} of {Non}-{Functional} {Properties} on {MPSoCs} {Using} {Enforcement} {FSMs} - {A} {Case} {Study}},
url = {https://drops.dagstuhl.de/opus/volltexte/2022/16110/},
venue = {Budapest},
volume = {98},
year = {2022}
}
@inproceedings{faucris.118281724,
abstract = {Multi- And many-core systems become more and more mainstream and therefore new communication infrastructures like Networks-on-Chip (NoC) and new programming languages like IBM's X10 with its partitioned global address space (PGAS) are introduced. In this paper we present an X10- based simulator, which is capable to simulate the network traffic that occurs inside the X10 program. This holistic approach enables to simulate the functionality and the indicated traffic together, in contrast to pure network simulators where usually only synthetic traffic or traces are used. We explain how the communication overhead is extracted from the X10 run-time and how to simulate the NoC behavior. In experiments we show that the proposed simulator is up to 10× faster than a comparable SystemC-based simulator and at the same time preserves high accuracy. Furthermore, we present a quality and simulation speed tradeoff by using different simulation modes for a set of real world parallel applications. Copyright © 2013 ACM.},
address = {New York, NY, USA},
author = {Roloff, Sascha and Weichslgartner, Andreas and Heißwolf, Jan and Hannig, Frank and Teich, Jürgen},
booktitle = {Proc. 16th International Workshop on Software and Compilers for Embedded Systems},
date = {2013-06-19/2013-06-21},
doi = {10.1145/2463596.2463606},
faupublication = {yes},
keywords = {Modeling; Network-on-chip; Parallel programming; Simulation},
note = {UnivIS-Import:2015-04-16:Pub.2013.tech.IMMD.inform.nocsim},
pages = {77--85},
publisher = {ACM Press},
title = {{NoC} {Simulation} in {Heterogeneous} {Architectures} for {PGAS} {Programming} {Model}},
venue = {St. Goar},
year = {2013}
}
@inproceedings{faucris.117096144,
abstract = {We present a compilation-based technique for providing on-demand structural redundancy for massively parallel processor arrays. Thereby, application programmers gain the capability to trade throughput for reliability according to application requirements. To protect parallel loop computations against errors, we propose to apply the well-known fault tolerance schemes dual modular redundancy (DMR) and triple modular redundancy (TMR) to a whole region of the processor array rather than individual processing elements. At the source code level, the compiler realizes these replication schemes with a program transformation that: (1) replicates a parallel loop program two or three times for DMR or TMR, respectively, and (2) introduces appropriate voting operations whose frequency and location may be chosen from three proposed variants. Which variant to choose depends, for example, on the error resilience needs of the application or the expected soft error rates. Finally, we explore the different tradeoffs of these variants in terms of performance overheads and error detection latency.},
author = {Tanase, Alexandru-Petru and Witterauf, Michael and Teich, Jürgen and Hannig, Frank and Lari, Vahid},
booktitle = {Proceedings of the 26th IEEE International Conference on Application-specific Systems, Architectures and Processors (ASAP)},
date = {2015-07-27/2015-07-29},
doi = {10.1109/ASAP.2015.7245734},
faupublication = {yes},
isbn = {9781479919246},
pages = {194--201},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{On}-demand fault-tolerant loop processing on massively parallel processor arrays},
venue = {Toronto},
year = {2015}
}
@inproceedings{faucris.124029224,
abstract = {We propose a new processor architecture called
Orthogonal Instruction Processing (OIP). Contrary to Very Long
Instruction Word (VLIW) decoding, we propose to orthogonally
decode the sub-instruction words of each Functional Unit (FU)
instead. Hereby, the OIP architecture is able to reduce the overall
machine code size of VLIW programs significantly. We will
show analytically as well as experimentally that, compared to
a VLIW processor, the savings in instruction memory size easily
compensate the overhead of one separate branch unit needed for
each FU.
For the analytical analysis, a mathematical model of hardware
costs of an OIP processor is developed and compared to a
conventional VLIW processor. In addition, we compare the code
size of selected representative programs of the new processor
architecture and show big savings of program memory. Here, the
instruction memory requirements can be decreased by a factor of
0.465. This decrease in instruction memory, despite the discussed
overhead, leads to savings in the overall hardware costs of one
processor by a factor of 0.98},
author = {Brand, Marcel and Hannig, Frank and Tanase, Alexandru-Petru and Teich, Jürgen},
booktitle = {2017 IEEE 11th International Symposium on Embedded Multicore/Many-core Systems-on-Chip},
date = {2017-09-18/2017-09-20},
doi = {10.1109/MCSoC.2017.17},
faupublication = {yes},
isbn = {978-1-5386-3441-7},
pages = {5--12},
peerreviewed = {unknown},
title = {{Orthogonal} {Instruction} {Processing}: {An} {Alternative} to {Lightweight} {VLIW} {Processors}},
venue = {Korea University, Seoul, Korea},
year = {2017}
}
@article{faucris.276788186,
abstract = {High performance and, at the same time, energy efficiency are important
yet often conflicting requirements in many fields of emerging
applications. Those applications range from multi-dimensional and
multi-sensor digital signal processing to machine learning, such as
neural network processing. Whereas conventional fixed-point and
floating-point processor architectures cannot adapt to quite diverging
demands related to required precision and accuracy of computations, even
within a single application, e.g., in different layers of a neural
network, domain-specific accelerators may be much too specific and thus
rigid to cover a wide enough spectrum of applications. In this tutorial
brief, we give an overview of existing processor solutions that are
reconfigurable or tunable in precision or accuracy of computations. The
spectrum of reviewed architectures ranges from processors with
vectorizable processors over multi- and trans-precision solutions,
including GPUs to any-time instruction-set processors. The latter works
with a fixed precision, but the accuracy of the result of floating-point
operations is encoded in the instruction word. It can thus vary from
instruction to instruction. This allows realizing accuracy vs. execution
time or energy tradeoffs. Subsequently, we investigate several
application domains, including neural network processing, linear
algebra, and approximate computing, where such emerging processor
architectures can be beneficially used.},
author = {Brand, Marcel and Hannig, Frank and Keszöcze, Oliver and Teich, Jürgen},
doi = {10.1109/TCSII.2022.3173753},
faupublication = {yes},
journal = {IEEE Transactions on Circuits and Systems II: Express Briefs},
keywords = {Computer architecture; Arithmetic; Circuits and systems; Neural networks; Adders; Machine learning algorithms; Linear algebra; Accuracy; Reconfigurable architectures; Convolutional neural networks},
pages = {2661--2666},
peerreviewed = {Yes},
title = {{Precision}- and {Accuracy}-{Reconfigurable} {Processor} {Architectures}—{An} {Overview}},
volume = {69},
year = {2022}
}
@inproceedings{faucris.209242123,
abstract = {Here, jitter in non-functional program execution qualities is caused either by outer influences such as faults injected by the environment, but can be induced also from the system management software itself, including thread-to-core mapping, scheduling and power management. A second huge source of variability typically stems from data-dependent workloads. In this paper, we classify and present techniques to enforce non-functional execution properties on multi-core programs. Based on a static design space exploration and analysis of influences of variability of non-functional properties, enforcement strategies are generated to guide the execution of periodically executed applications in given requirement corridors. Using the case study of a complex image streaming application, we show that by controlling DVFS settings of cores proactively, not only tight execution times, but also reliability requirements may be enforced dynamically while trying to minimize energy consumption.},
author = {Teich, Jürgen and Pourmohseni, Behnaz and Keszöcze, Oliver and Spieck, Jan and Wildermann, Stefan},
booktitle = {Asia and South Pacific Design Automation Conference (ASP-DAC)},
date = {2020-01-13/2020-01-16},
doi = {10.1109/ASP-DAC47756.2020.9045536},
faupublication = {yes},
internal-note = {Abstract trimmed during review: the CRIS export had concatenated fragments of several unrelated abstracts (one even starting mid-word); only the paragraph matching this title was kept. Entry type corrected from article to inproceedings (has booktitle/venue, no journal).},
keywords = {run-time enforcement, many-core systems, reliability, realtime},
month = jan,
pages = {629--636},
peerreviewed = {unknown},
title = {{Run}-{Time} {Enforcement} of {Non}-{Functional} {Application} {Requirements} in {Heterogeneous} {Many}-{Core} {Systems}},
venue = {China National Convention Center, Beijing, China},
year = {2020}
}
@incollection{faucris.227350948,
author = {Teich, Jürgen and Mahmoody, Pouya and Pourmohseni, Behnaz and Roloff, Sascha and Schröder-Preikschat, Wolfgang and Wildermann, Stefan},
booktitle = {A Journey of Embedded and Cyber-Physical Systems},
doi = {10.1007/978-3-030-47487-4},
editor = {Chen, Jian-Jia},
faupublication = {yes},
isbn = {978-3-030-47487-4},
peerreviewed = {unknown},
publisher = {Springer},
title = {{Run}-{Time} {Enforcement} of {Non}-functional {Program} {Properties} on {MPSoCs}},
year = {2020}
}
@inproceedings{faucris.122814164,
abstract = {This paper describes a runtime reconfigurable bus arbitration technique for concurrent applications on heterogeneous MPSoC architectures. Here, a hardware/software approach is introduced as part of a runtime framework that enables selecting and adapting different policies (i. e., fixed-priority, TDMA, and Round-Robin) such that the performance goals of concurrent applications can be satisfied. To evaluate the hardware cost, we compare our proposed solution with respect to a well-known SPARC V8 architecture supporting fixed-priority arbitration. Notably, even providing the flexibility for selecting up to three different policies, our reconfigurable arbiter needs only 25% and 7% more LUTs and slices registers, respectively. The reconfiguration overhead for changing between different policies is 56 cycles and for programming new time slots, only 28 cycles are necessary. For demonstrating the benefits of this reconfiguration framework, we setup a mixed hard/soft real-time scenario by considering four applications with different timeliness requirements. The experimental results show that by reconfiguring the arbiter, less processing elements can be used for achieving a specific target frame rate. Moreover, adjusting the time slots for TDMA, we can speedup a soft real-time algorithm while still satisfying the deadline for hard real-time applications.},
author = {Sousa, Éricles and Gangadharan, Deepak and Hannig, Frank and Teich, Jürgen},
booktitle = {Proceedings of the EUROMICRO Digital System Design Conference (DSD)},
date = {2014-08-27/2014-08-29},
doi = {10.1109/DSD.2014.105},
faupublication = {yes},
isbn = {9781479957934},
keywords = {Concurrent Applications; Heterogeneous MPSoC Architectures; Reconfigurable Bus Arbitration; Runtime},
pages = {74--81},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Runtime} reconfigurable bus arbitration for concurrent applications on heterogeneous {MPSoC} architectures},
venue = {Verona},
year = {2014}
}
@inproceedings{faucris.232448347,
abstract = {In modern embedded systems, the trust in comprehensive security
standards all along the product life cycle has become an increasingly
important access-to-market requirement.
However, these security standards rely on mandatory immunity assumptions
such as the integrity and authenticity of an initial system
configuration typically loaded from Non-Volatile Memory
(NVM). This applies especially to FPGA-based programmable system-on-chip
(PSoC) architectures, since object codes as well as configuration data
easily exceed the capacity of a secure boot
ROM. In this context, an attacker could try to alter the content of the
NVM device in order to manipulate the system. The PSoC therefore relies
on the integrity of the NVM particularly at
boot-time. In this paper, we propose a methodology for securely booting
from an NVM in a potentially unsecure environment by exploiting the
reconfigurable logic of the FPGA. Here, the
FPGA serves as a secure anchor point by performing required integrity
and authenticity verifications prior to the configuration and execution
of any user application loaded from the NVM
on the PSoC. The proposed secure boot process is based on the following
assumptions and steps: 1) The boot configuration is stored on a fully
encrypted Secure Digital memory card
(SD card) or alternatively Flash acting as NVM. 2) At boot time, a
hardware design called Trusted Memory-Interface Unit (TMIU) is loaded to
verify first the authenticity of the deployed NVM and
then after decryption the integrity of its content. To demonstrate the
practicability of our approach, we integrated the methodology into the
vendor-specific secure boot process of a Xilinx Zynq
PSoC and evaluated the design objectives performance, power and resource
cost.},
author = {Streit, Franz-Josef and Fritz, Florian and Becher, Andreas and Wildermann, Stefan and Werner, Stefan and Schmidt-Korth, Martin and Pschyklenk, Michael and Teich, Jürgen},
booktitle = {IEEE Proceedings of the 13th International Symposium on Hardware Oriented Security and Trust},
date = {2020-12-07/2020-12-11},
doi = {10.1109/HOST45689.2020.9300126},
faupublication = {yes},
keywords = {Secure Boot; Non-Volatile Memory Protection; Programmable System-on-Chip; FPGA; Hardware/Software Co-Design;},
peerreviewed = {Yes},
title = {{Secure} {Boot} from {Non}-{Volatile} {Memory} for {Programmable} {SoC}-{Architectures}},
venue = {San José, USA},
year = {2020}
}
@inproceedings{faucris.119107604,
abstract = {The recent years have shown the emergence of heterogeneous system architecture (HSA), which offers massive computational power assembled into a compact design. Computer vision applications with massive inherent parallelism highly benefits from such heterogeneous processors with on-chip CPU and GPU units. The highly parallel and compute intensive parts of the application program can be mapped to the GPU while the control flow and high level tasks may run on the CPU. However, they pose considerable challenge to software development due to their hybrid architecture. Sharing of resources (GPU or CPU) among applications running concurrently, leads to variations in processing interval and prolonged processing intervals leads to low quality results (frame drops) for computer vision algorithms. In this work, we propose resource-awareness and self organisation within the application layer to adapt to available resources on the heterogeneous processor. The benefits of the new model is demonstrated using a widely used computer vision algorithm called Harris corner detector. A resource-aware runtime-system and a heterogeneous processor were used for evaluation and the results indicate a well constrained processing interval and reduced frame-drops. Our evaluations demonstrate up to 20% improvements in processing rate and accuracy of the detected corner points for Harris corner detection.},
address = {Gières, France},
author = {Paul, Johny and Stechele, Walter and Sousa, Éricles and Lari, Vahid and Hannig, Frank and Teich, Jürgen and Kröhnert, Manfred and Asfour, Tamim},
booktitle = {Proc. of the Conference on Design and Architectures for Signal and Image Processing (DASIP)},
date = {2014-10-08/2014-10-10},
doi = {10.1109/DASIP.2014.7115616},
faupublication = {yes},
isbn = {979-10-92279-06-1},
note = {UnivIS-Import:2015-04-16:Pub.2014.tech.IMMD.inform.selfad},
pages = {1--6},
peerreviewed = {unknown},
publisher = {ECSI Media},
title = {{Self}-{Adaptive} {Harris} {Corner} {Detection} on {Heterogeneous} {Many}-core {Processor}},
venue = {Madrid},
year = {2014}
}
@inproceedings{faucris.263944121,
abstract = {Criminal investigations regularly involve the deciphering of license plates (LPs) of vehicles. Unfortunately, the image or video source material typically stems from uncontrolled sources, and may be subject to severe degradations such as extremely low resolution, strong compression, low contrast or over- resp. underexposure. While LP recognition has a long history in computer vision research, the deciphering under such severe degradations is still an open issue. Moreover, since the data source is not controlled, it cannot be assumed that the exact form of degradation is covered in the training set. In this work, we propose using convolutional recurrent neural networks (CRNN) for the recognition of LPs from images with strong unseen degradations. The CRNN clearly outperforms an existing conventional CNN in this scenario. It also provides an additional particular advantage for criminal investigations, namely to create top-n sequence predictions. Even a low number of top-n candidates improves the recognition performance considerably.},
author = {Moussa, Denise and Maier, Anatol and Schirrmacher, Franziska and Rieß, Christian},
booktitle = {Computer Analysis of Images and Patterns},
date = {2021-09-27/2021-10-01},
doi = {10.1007/978-3-030-89131-2{\_}16},
address = {Cham, Switzerland},
faupublication = {yes},
isbn = {978-3-030-89131-2},
keywords = {License plate recognition; Forensics; Low-quality images},
pages = {175--185},
peerreviewed = {Yes},
publisher = {Springer Nature Switzerland AG},
series = {Lecture Notes in Computer Science},
title = {{Sequence}-based {Recognition} of {License} {Plates} with {Severe} {Out}-of-{Distribution} {Degradations}},
url = {https://faui1-files.cs.fau.de/public/publications/mmsec/2021-Moussa-SRLP.pdf},
venue = {Virtual Conference},
volume = {13053},
year = {2021}
}
@inproceedings{faucris.223333583,
abstract = {We present SEVGuard, a minimal virtual execution environment that protects the confidentiality of applications based on AMD's Secure Encrypted Virtualization (SEV). Although SEV was primarily designed for the protection of VMs, we found a way to overcome this limitation and exclusively protect user mode applications. Therefore, we migrate the application into a hardware-accelerated VM and encrypt both its memory and register state. To avoid the overhead of a typical hypervisor, we built our solution on top of the plain Linux Kernel Virtual Machine (KVM) API. With the help of an advanced trapping mechanism, we fully support system and library calls from within the encrypted guest. Furthermore, we allow unmodified code to be transparently virtualized and encrypted by appropriate memory mappings. The memory needed for our minimal VM can be directly allocated within SEVGuard's address space. We evaluated our execution environment regarding correctness and performance, confirming that SEVGuard can be practically used to protect existing legacy applications.
},
address = {New York City, United States of America},
author = {Palutke, Ralph and Neubaum, Andreas and Götzfried, Johannes},
booktitle = {SecureComm 2019 Proceedings},
date = {2019-10-23/2019-10-25},
doi = {10.1007/978-3-030-37231-6{\_}12},
faupublication = {yes},
keywords = {AMD SEV; virtual machine encryption; confidentiality},
peerreviewed = {Yes},
publisher = {Springer},
title = {{SEVGuard}: {Protecting} {User} {Mode} {Applications} using {Secure} {Encrypted} {Virtualization}},
venue = {Orlando},
year = {2019}
}
@inproceedings{faucris.280961539,
address = {New York},
author = {Brand, Peter and Falk, Joachim and Maier, Tanja and Teich, Jürgen},
booktitle = {2021 International Conference on Computational Science and Computational Intelligence (CSCI)},
doi = {10.1109/CSCI54926.2021.00276},
faupublication = {yes},
month = jan,
note = {CRIS-Team WoS Importer:2022-08-26},
pages = {1377--1380},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Simulating} {Realistic} {IoT} {Network} {Traffic} {Using} {Similarity}-based {DSE}},
venue = {Las Vegas, NV},
year = {2021}
}
@inproceedings{faucris.118576524,
address = {Ghent, Belgium},
author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen},
booktitle = {Proc. of the 8th International Summer School on Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES)},
date = {2012-07-08/2012-07-14},
faupublication = {yes},
isbn = {978-90-382-1987-5},
note = {UnivIS-Import:2015-04-16:Pub.2012.tech.IMMD.inform.simula},
pages = {127--130},
publisher = {Academia Press},
title = {{Simulation} of {Resource}-{Aware} {Applications} on {Heterogeneous} {Architectures}},
venue = {Fiuggi},
year = {2012}
}
@inproceedings{faucris.243513704,
abstract = {Image reconstruction is particularly difficult when the type of image degradations are unknown. This may be the case if the acquisition device is unknown or the images stem from an uncontrolled environment like the internet. Yet, it may be important to reconstruct a specific piece of information from the image, such as digits from signs or vehicle license plates. Existing works incorporate such prior information with a
sequential super-resolution and classification pipeline. However, this approach is prone to error propagation. In this work, we propose a new approach of connecting classification and super-resolution in parallel within a multi-task network. We show that this architecture is able to preserve structures and to remove noisy pixels although the network itself has never been trained on noisy data. We also show that this
design allows to transparently trade classification and super-resolution
quality. On upsampling by factor 4, we outperform sequential approaches in terms of SSIM by 10% and improve classification by 69%.},
author = {Schirrmacher, Franziska and Lorch, Benedikt and Stimpel, Bernhard and Köhler, Thomas and Riess, Christian},
booktitle = {2020 IEEE International Conference on Image Processing (ICIP)},
date = {2020-10-25/2020-10-28},
doi = {10.1109/ICIP40778.2020.9191253},
faupublication = {yes},
keywords = {Deep learning; Multi-task learning; Super-resolution; Classification},
pages = {533--537},
peerreviewed = {Yes},
title = {{SR²}: {Super}-{Resolution} {With} {Structure}-{Aware} {Reconstruction}},
url = {https://faui1-files.cs.fau.de/public/publications/mmsec/2020-Schirrmacher-SR2.pdf},
venue = {Online},
year = {2020}
}
@inproceedings{faucris.109562244,
abstract = {Loop parallelization techniques for massively parallel processor arrays using one-level tiling are often either I/O- or memory-bounded, exceeding the target architecture's capabilities. Furthermore, if the number of available processing elements is only known at runtime - as in adaptive systems - static approaches fail. To solve these problems, we present a hybrid compile/runtime technique to symbolically parallelize loop nests with uniform dependences on multiple levels. At compile time, two novel transformations are performed: (a) symbolic hierarchical tiling followed by (b) symbolic multi-level scheduling. By tuning the size of the tiles on multiple levels, a trade-off between the necessary I/O-bandwidth and memory is possible, which facilitates obeying resource constraints. The resulting schedules are symbolic with respect to the number of tiles; thus, the number of processing elements to map onto does not need to be known at compile time. At runtime, when the number is known, a simple prolog chooses a feasible schedule with respect to I/O and memory constraints that is latency-optimal for the chosen tile size. In this way, our approach dynamically chooses latency-optimal and feasible schedules while avoiding expensive re-compilations.},
author = {Tanase, Alexandru-Petru and Witterauf, Michael and Hannig, Frank and Teich, Jürgen},
booktitle = {Proceedings of the 13th ACM-IEEE International Conference on Formal Methods and Models for System Design (MEMOCODE)},
date = {2015-09-21/2015-09-23},
doi = {10.1109/MEMCOD.2015.7340486},
faupublication = {yes},
isbn = {9781509002375},
keywords = {Mathematical model; Memory management; Processor scheduling; Runtime; Schedules; Silicon},
pages = {188--197},
peerreviewed = {unknown},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {{Symbolic} loop parallelization for balancing {I}/{O} and memory accesses on processor arrays},
venue = {Austin},
year = {2015}
}
@inproceedings{faucris.246716421,
author = {Pourmohseni, Behnaz and Teich, Jürgen},
booktitle = {PhD Forum at the Design, Automation, and Test in Europe (DATE) Conference and Exhibition},
date = {2020-03-09/2020-03-13},
faupublication = {yes},
pages = {1--2},
peerreviewed = {unknown},
title = {{System}-{Level} {Mapping}, {Analysis}, and {Management} of {Real}-{Time} {Applications} in {Many}-{Core} {Systems}},
url = {https://www12.cs.fau.de/downloads/pourmohseni/pub/phdForumDATE20.pdf},
venue = {Grenoble, France},
year = {2020}
}
@inproceedings{faucris.119659144,
author = {Sousa, Éricles and Chakraborty, Arindam and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen},
booktitle = {Demo Night at the IEEE International Conference on Reconfigurable Computing and FPGAs (ReConFig)},
date = {2017-12-04/2017-12-06},
doi = {10.1109/RECONFIG.2017.8279818},
faupublication = {yes},
peerreviewed = {Yes},
title = {{TCPA} {Editor}: {A} {Design} {Automation} {Environment} for a {Class} of {Coarse}-{Grained} {Reconfigurable} {Arrays}},
url = {http://ieeexplore.ieee.org/document/8279818/},
venue = {Cancun, Mexico},
year = {2017}
}
@article{faucris.117161704,
abstract = {This contribution provides an approach for emulating the behaviour of an ASIC temperature monitoring system (TMon) during run-time for a tightly-coupled processor array (TCPA) of a heterogeneous invasive multi-tile architecture to be used for FPGA prototyping. It is based on a thermal RC modeling approach. Also different usage scenarios of TCPA are analyzed and compared.},
author = {Glocker, E. and Boppu, Srinivas and Chen, Q. and Schlichtmann, U. and Teich, Jürgen and Schmitt-Landsiedel, D.},
doi = {10.5194/ars-12-103-2014},
faupublication = {yes},
journal = {Advances in Radio Science},
pages = {103--109},
peerreviewed = {unknown},
title = {{Temperature} modeling and emulation of an {ASIC} temperature monitor system for {Tightly}-{Coupled} {Processor} {Arrays} ({TCPAs})},
volume = {12},
year = {2014}
}
@misc{faucris.112803064,
author = {Mattauch, Sandra and Lohmann, Katja and Hannig, Frank and Lohmann, Daniel and Teich, Jürgen},
doi = {10.25593/issn.2191-5008/CS-2018-02},
faupublication = {yes},
peerreviewed = {automatic},
title = {{The} {Gender} {Gap} in {Computer} {Science} --- {A} {Bibliometric} {Analysis}},
year = {2018}
}
@inproceedings{faucris.224490668,
author = {Pourmohseni, Behnaz and Smirnov, Fedor and Khdr, Heba and Wildermann, Stefan and Teich, Jürgen and Henkel, Jörg},
booktitle = {Proceedings of the 40th IEEE Real-Time Systems Symposium (RTSS)},
date = {2019-12-03/2019-12-06},
doi = {10.1109/RTSS46320.2019.00029},
faupublication = {yes},
pages = {1--13},
peerreviewed = {Yes},
title = {{Thermally} {Composable} {Hybrid} {Application} {Mapping} for {Real}-{Time} {Applications} in {Heterogeneous} {Many}-{Core} {Systems}},
venue = {Hong Kong},
year = {2019}
}
@inproceedings{faucris.108964724,
author = {Gangadharan, Deepak and Tanase, Alexandru-Petru and Hannig, Frank and Teich, Jürgen},
booktitle = {DATE Friday Workshop on Performance, Power and Predictability of Many-Core Embedded Systems (3PMCES)},
date = {2014-03-28/2014-03-28},
faupublication = {yes},
peerreviewed = {unknown},
title = {{Timing} {Analysis} of a {Heterogeneous} {Architecture} with {Massively} {Parallel} {Processor} {Arrays}},
url = {https://ecsi.org/resource/workshop/2014/3PMCES/DATE/paper/timing-analysis-heterogeneous-architecture-massively-parallel-processor-arrays},
venue = {Dresden, Germany},
year = {2014}
}
@inproceedings{faucris.122616164,
author = {Roloff, Sascha and Hannig, Frank and Teich, Jürgen},
booktitle = {Proc. of the first International Workshop on Multi-Objective Many-Core Design (MOMAC) in conjunction with International Conference on Architecture of Computing Systems (ARCS)},
faupublication = {yes},
note = {UnivIS-Import:2015-04-17:Pub.2014.tech.IMMD.inform.toward{\_}7},
pages = {1--2},
title = {{Towards} {Actor}-oriented {Programming} on {PGAS}-based {Multicore} {Architectures}},
venue = {Lübeck},
year = {2014}
}
@inproceedings{faucris.124202364,
abstract = {
The parallelization of programs and distributing their workloads to multiple threads can be a challenging task. In addition to multi-threading, harnessing vector units in CPUs proves highly desirable. However, employing vector units to speed up programs can be quite tedious. Either a program developer solely relies on the auto-vectorization capabilities of the compiler or he manually applies vector intrinsics, which is extremely error-prone, difficult to maintain, and not portable at all.
Based on whole-function vectorization, a method to replace control flow with data flow, we propose auto-vectorization techniques for image processing DSLs in the context of source-to-source compilation. The approach does not require the input to be available in SSA form. Moreover, we formulate constraints under which the vectorization analysis and code transformations may be greatly simplified in the context of image processing DSLs. As part of our methodology, we present control flow to data flow transformation as a source-to-source translation. Moreover, we propose a method to efficiently analyze algorithms with mixed bit-width data types to determine the optimal SIMD width, independently of the target instruction set. The techniques are integrated into an open source DSL framework. Subsequently, the
vectorization capabilities are compared to a variety of existing state-of-the-art C/C++ compilers. A geometric mean speedup of up to 3.14 is observed for benchmarks taken from ISPC and image processing, compared to non-vectorized executions.
},
author = {Reiche, Oliver and Kobylko, C. and Hannig, Frank and Teich, Jürgen},
booktitle = {Proceedings of the 18th International Conference on Languages, Compilers, Tools, and Theory for Embedded Systems (LCTES)},
date = {2017-06-21/2017-06-22},
doi = {10.1145/3078633.3081039},
faupublication = {yes},
keywords = {Domain-Specific Languages, Vectorization, Image Processing},
pages = {21--30},
peerreviewed = {Yes},
publisher = {ACM},
title = {{Auto}-vectorization for {Image} {Processing} {DSLs}},
venue = {Barcelona},
year = {2017}
}
@inproceedings{faucris.111754544,
abstract = {Domain-Specific Languages (DSLs) provide a high-level and domain-specific abstraction to describe algorithms within a certain domain concisely. Since a DSL separates the algorithm description from the actual target implementation, it offers a high flexibility among heterogeneous hardware targets, such as CPUs and GPUs. With the recent uprise of promising High-Level Synthesis (HLS) tools, like Vivado HLS and Altera OpenCL, FPGAs are becoming another attractive target architecture. Particularly in the domain of image processing, applications often come with stringent requirements regarding performance, energy efficiency, and power, for which FPGAs have been proven to be among the most suitable architectures.
In this work, we present the Hipacc framework, a DSL and source-to-source compiler for image processing. We show that domain knowledge can be captured to generate tailored implementations for C-based HLS from a common high-level DSL description targeting FPGAs. Our approach includes FPGA-specific memory architectures for handling point and local operators, as well as several high-level transformations. We evaluate our approach by comparing the resulting hardware accelerators to GPU implementations, generated from exactly the same DSL source code.},
author = {Reiche, Oliver and Özkan, Mehmet Akif and Membarth, Richard and Teich, Jürgen and Hannig, Frank},
booktitle = {Proceedings of the International Conference on Computer Aided Design (ICCAD)},
date = {2017-11-13/2017-11-16},
doi = {10.1109/ICCAD.2017.8203894},
faupublication = {yes},
isbn = {978-1-5386-3093-8},
keywords = {Domain-Specific Languages, High-Level Synthesis, Image Processing},
pages = {1026--1033},
peerreviewed = {unknown},
publisher = {IEEE},
title = {{Generating} {FPGA}-based {Image} {Processing} {Accelerators} with {Hipacc}},
venue = {Irvine},
year = {2017}
}
@article{faucris.114013504,
abstract = {We introduce a novel class of massively parallel processor architectures called invasive Tightly-Coupled Processor Arrays (TCPAs). The presented processor class is a highly parameterizable template which can be tailored before runtime to fulfill costumers' requirements such as performance, area cost, and energy efficiency. These programmable accelerators are well suited for domain-specific computing from the areas of signal, image, and video processing as well as other streaming processing applications. To overcome future scaling issues (e.g., power consumption, reliability, resource management, as well as application parallelization and mapping), TCPAs are inherently designed in way that they support self-adaptivity and resource awareness at hardware level. Here, we follow a recently introduced resource-aware parallel computing paradigm called invasive computing where an application can dynamically claim, execute, and release the resources. Furthermore, we show how invasive computing can be used as an enabler for power management. For the first time, we present a seamless mapping flow for TCPAs, based on a domain-specific language. Moreover, we outline a complete symbolic mapping approach. Finally, we support our claims by comparing a TCPA against an ARM Mali-T604 GPU in terms of performance and energy efficiency. © 2014 ACM.},
author = {Hannig, Frank and Lari, Vahid and Boppu, Srinivas and Tanase, Alexandru-Petru and Reiche, Oliver},
doi = {10.1145/2584660},
faupublication = {yes},
journal = {ACM Transactions on Embedded Computing Systems},
keywords = {Code generation; Energy efficiency; Performance; Processor arrays},
note = {UnivIS-Import:2015-03-09:Pub.2014.tech.IMMD.inform.invasi},
pages = {133:1--133:29},
peerreviewed = {Yes},
title = {{Invasive} {Tightly}-{Coupled} {Processor} {Arrays}: {A} {Domain}-{Specific} {Architecture}/{Compiler} {Co}-{Design} {Approach}},
volume = {13},
year = {2014}
}
@phdthesis{faucris.203495885,
author = {Sousa, Éricles},
faupublication = {yes},
peerreviewed = {automatic},
school = {Friedrich-Alexander-Universität Erlangen-Nürnberg},
title = {{Memory} and {Interface} {Architectures} for {Invasive} {Tightly} {Coupled} {Processor} {Arrays}},
year = {2018}
}
@phdthesis{faucris.203504965,
author = {Roloff, Sascha},
doi = {10.1007/978-981-13-8387-8},
faupublication = {yes},
peerreviewed = {automatic},
school = {Friedrich-Alexander-Universität Erlangen-Nürnberg},
title = {{Modeling} and {Simulation} of {Invasive} {Applications} and {Architectures}},
year = {2018}
}
@inproceedings{faucris.117333964,
address = {New York},
author = {Götzfried, Johannes and Müller, Tilo and Drescher, Gabor and Nürnberger, Stefan and Backes, Michael},
booktitle = {11th ACM Asia Conference on Computer and Communications Security},
doi = {10.1145/2897845.2897924},
faupublication = {yes},
note = {UnivIS-Import:2016-06-01:Pub.2016.tech.IMMD.lehrst.ramcry},
peerreviewed = {unknown},
publisher = {ACM},
series = {Special Interest Group on Security, Audit and Control (SIGSAC)},
title = {{RamCrypt}: {Kernel}-based {Address} {Space} {Encryption} for {User}-mode {Processes}},
url = {https://www1.cs.fau.de/ramcrypt},
venue = {Xi'an, China},
year = {2016}
}